node-html-parser 3.1.4 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/esm/nodes/html.js +95 -2
- package/dist/esm/parse.js +1 -41
- package/dist/main.js +130 -67
- package/dist/nodes/html.d.ts +10 -0
- package/dist/nodes/html.js +113 -7
- package/dist/parse.d.ts +1 -6
- package/dist/parse.js +2 -48
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Fast HTML Parser [](http://badge.fury.io/js/node-html-parser) [](https://travis-ci.org/taoqf/node-html-parser)
|
|
2
2
|
|
|
3
3
|
Fast HTML Parser is a _very fast_ HTML parser. Which will generate a simplified
|
|
4
|
-
DOM tree, with
|
|
4
|
+
DOM tree, with element query support.
|
|
5
5
|
|
|
6
6
|
Per the design, it intends to parse massive HTML files in lowest price, thus the
|
|
7
7
|
performance is the top priority. For this reason, some malformatted HTML may not
|
|
@@ -112,6 +112,10 @@ Note: Full css3 selector supported since v3.0.0.
|
|
|
112
112
|
|
|
113
113
|
Query CSS Selector to find matching node.
|
|
114
114
|
|
|
115
|
+
### HTMLElement#closest(selector)
|
|
116
|
+
|
|
117
|
+
Query closest element by css selector.
|
|
118
|
+
|
|
115
119
|
### HTMLElement#appendChild(node)
|
|
116
120
|
|
|
117
121
|
Append a child node to childNodes
|
package/dist/esm/nodes/html.js
CHANGED
|
@@ -6,7 +6,6 @@ import TextNode from './text';
|
|
|
6
6
|
import Matcher from '../matcher';
|
|
7
7
|
import arr_back from '../back';
|
|
8
8
|
import CommentNode from './comment';
|
|
9
|
-
import parse from '../parse';
|
|
10
9
|
// const { decode } = he;
|
|
11
10
|
function decode(val) {
|
|
12
11
|
// clone string
|
|
@@ -479,6 +478,61 @@ export default class HTMLElement extends Node {
|
|
|
479
478
|
// }
|
|
480
479
|
// return null;
|
|
481
480
|
}
|
|
481
|
+
/**
|
|
482
|
+
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
|
|
483
|
+
* @param selector a DOMString containing a selector list
|
|
484
|
+
*/
|
|
485
|
+
closest(selector) {
|
|
486
|
+
const mapChild = new Map();
|
|
487
|
+
let el = this;
|
|
488
|
+
let old = null;
|
|
489
|
+
function findOne(test, elems) {
|
|
490
|
+
let elem = null;
|
|
491
|
+
for (let i = 0, l = elems.length; i < l && !elem; i++) {
|
|
492
|
+
const el = elems[i];
|
|
493
|
+
if (test(el)) {
|
|
494
|
+
elem = el;
|
|
495
|
+
}
|
|
496
|
+
else {
|
|
497
|
+
const child = mapChild.get(el);
|
|
498
|
+
if (child) {
|
|
499
|
+
elem = findOne(test, [child]);
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
return elem;
|
|
504
|
+
}
|
|
505
|
+
while (el) {
|
|
506
|
+
mapChild.set(el, old);
|
|
507
|
+
old = el;
|
|
508
|
+
el = el.parentNode;
|
|
509
|
+
}
|
|
510
|
+
el = this;
|
|
511
|
+
while (el) {
|
|
512
|
+
const e = selectOne(selector, el, {
|
|
513
|
+
xmlMode: true,
|
|
514
|
+
adapter: {
|
|
515
|
+
...Matcher,
|
|
516
|
+
getChildren(node) {
|
|
517
|
+
const child = mapChild.get(node);
|
|
518
|
+
return child && [child];
|
|
519
|
+
},
|
|
520
|
+
getSiblings(node) {
|
|
521
|
+
return [node];
|
|
522
|
+
},
|
|
523
|
+
findOne,
|
|
524
|
+
findAll() {
|
|
525
|
+
return [];
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
});
|
|
529
|
+
if (e) {
|
|
530
|
+
return e;
|
|
531
|
+
}
|
|
532
|
+
el = el.parentNode;
|
|
533
|
+
}
|
|
534
|
+
return null;
|
|
535
|
+
}
|
|
482
536
|
/**
|
|
483
537
|
* Append a child node to childNodes
|
|
484
538
|
* @param {Node} node node to append
|
|
@@ -540,7 +594,7 @@ export default class HTMLElement extends Node {
|
|
|
540
594
|
}
|
|
541
595
|
const attrs = {};
|
|
542
596
|
if (this.rawAttrs) {
|
|
543
|
-
const re = /\b([a-z][a-z0-9-_]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
597
|
+
const re = /\b([a-z][a-z0-9-_:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
544
598
|
let match;
|
|
545
599
|
while ((match = re.exec(this.rawAttrs))) {
|
|
546
600
|
attrs[match[1]] = match[2] || match[3] || match[4] || null;
|
|
@@ -929,3 +983,42 @@ export function base_parse(data, options = { lowerCaseTagName: false, comment: f
|
|
|
929
983
|
}
|
|
930
984
|
return stack;
|
|
931
985
|
}
|
|
986
|
+
/**
|
|
987
|
+
* Parses HTML and returns a root element
|
|
988
|
+
* Parse a chuck of HTML source.
|
|
989
|
+
*/
|
|
990
|
+
export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
|
|
991
|
+
const stack = base_parse(data, options);
|
|
992
|
+
const [root] = stack;
|
|
993
|
+
while (stack.length > 1) {
|
|
994
|
+
// Handle each error elements.
|
|
995
|
+
const last = stack.pop();
|
|
996
|
+
const oneBefore = arr_back(stack);
|
|
997
|
+
if (last.parentNode && last.parentNode.parentNode) {
|
|
998
|
+
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
999
|
+
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
1000
|
+
oneBefore.removeChild(last);
|
|
1001
|
+
last.childNodes.forEach((child) => {
|
|
1002
|
+
oneBefore.parentNode.appendChild(child);
|
|
1003
|
+
});
|
|
1004
|
+
stack.pop();
|
|
1005
|
+
}
|
|
1006
|
+
else {
|
|
1007
|
+
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
1008
|
+
oneBefore.removeChild(last);
|
|
1009
|
+
last.childNodes.forEach((child) => {
|
|
1010
|
+
oneBefore.appendChild(child);
|
|
1011
|
+
});
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
else {
|
|
1015
|
+
// If it's final element just skip.
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
// response.childNodes.forEach((node) => {
|
|
1019
|
+
// if (node instanceof HTMLElement) {
|
|
1020
|
+
// node.parentNode = null;
|
|
1021
|
+
// }
|
|
1022
|
+
// });
|
|
1023
|
+
return root;
|
|
1024
|
+
}
|
package/dist/esm/parse.js
CHANGED
|
@@ -1,41 +1 @@
|
|
|
1
|
-
|
|
2
|
-
import { base_parse } from './nodes/html';
|
|
3
|
-
/**
|
|
4
|
-
* Parses HTML and returns a root element
|
|
5
|
-
* Parse a chuck of HTML source.
|
|
6
|
-
*/
|
|
7
|
-
export default function parse(data, options = { lowerCaseTagName: false, comment: false }) {
|
|
8
|
-
const stack = base_parse(data, options);
|
|
9
|
-
const [root] = stack;
|
|
10
|
-
while (stack.length > 1) {
|
|
11
|
-
// Handle each error elements.
|
|
12
|
-
const last = stack.pop();
|
|
13
|
-
const oneBefore = arr_back(stack);
|
|
14
|
-
if (last.parentNode && last.parentNode.parentNode) {
|
|
15
|
-
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
16
|
-
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
17
|
-
oneBefore.removeChild(last);
|
|
18
|
-
last.childNodes.forEach((child) => {
|
|
19
|
-
oneBefore.parentNode.appendChild(child);
|
|
20
|
-
});
|
|
21
|
-
stack.pop();
|
|
22
|
-
}
|
|
23
|
-
else {
|
|
24
|
-
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
25
|
-
oneBefore.removeChild(last);
|
|
26
|
-
last.childNodes.forEach((child) => {
|
|
27
|
-
oneBefore.appendChild(child);
|
|
28
|
-
});
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
else {
|
|
32
|
-
// If it's final element just skip.
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
// response.childNodes.forEach((node) => {
|
|
36
|
-
// if (node instanceof HTMLElement) {
|
|
37
|
-
// node.parentNode = null;
|
|
38
|
-
// }
|
|
39
|
-
// });
|
|
40
|
-
return root;
|
|
41
|
-
}
|
|
1
|
+
export { parse as default } from './nodes/html';
|
package/dist/main.js
CHANGED
|
@@ -16,6 +16,17 @@ var __extends = (this && this.__extends) || (function () {
|
|
|
16
16
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
17
17
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
18
18
|
};
|
|
19
|
+
var __assign = (this && this.__assign) || function () {
|
|
20
|
+
__assign = Object.assign || function(t) {
|
|
21
|
+
for (var s, i = 1, n = arguments.length; i < n; i++) {
|
|
22
|
+
s = arguments[i];
|
|
23
|
+
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
|
|
24
|
+
t[p] = s[p];
|
|
25
|
+
}
|
|
26
|
+
return t;
|
|
27
|
+
};
|
|
28
|
+
return __assign.apply(this, arguments);
|
|
29
|
+
};
|
|
19
30
|
var __spreadArray = (this && this.__spreadArray) || function (to, from) {
|
|
20
31
|
for (var i = 0, il = from.length, j = to.length; i < il; i++, j++)
|
|
21
32
|
to[j] = from[i];
|
|
@@ -195,67 +206,17 @@ define("matcher", ["require", "exports", "nodes/type"], function (require, expor
|
|
|
195
206
|
findAll: findAll
|
|
196
207
|
};
|
|
197
208
|
});
|
|
198
|
-
define("
|
|
199
|
-
"use strict";
|
|
200
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
201
|
-
back_1 = __importDefault(back_1);
|
|
202
|
-
/**
|
|
203
|
-
* Parses HTML and returns a root element
|
|
204
|
-
* Parse a chuck of HTML source.
|
|
205
|
-
*/
|
|
206
|
-
function parse(data, options) {
|
|
207
|
-
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
208
|
-
var stack = html_1.base_parse(data, options);
|
|
209
|
-
var root = stack[0];
|
|
210
|
-
var _loop_1 = function () {
|
|
211
|
-
// Handle each error elements.
|
|
212
|
-
var last = stack.pop();
|
|
213
|
-
var oneBefore = back_1.default(stack);
|
|
214
|
-
if (last.parentNode && last.parentNode.parentNode) {
|
|
215
|
-
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
216
|
-
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
217
|
-
oneBefore.removeChild(last);
|
|
218
|
-
last.childNodes.forEach(function (child) {
|
|
219
|
-
oneBefore.parentNode.appendChild(child);
|
|
220
|
-
});
|
|
221
|
-
stack.pop();
|
|
222
|
-
}
|
|
223
|
-
else {
|
|
224
|
-
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
225
|
-
oneBefore.removeChild(last);
|
|
226
|
-
last.childNodes.forEach(function (child) {
|
|
227
|
-
oneBefore.appendChild(child);
|
|
228
|
-
});
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
else {
|
|
232
|
-
// If it's final element just skip.
|
|
233
|
-
}
|
|
234
|
-
};
|
|
235
|
-
while (stack.length > 1) {
|
|
236
|
-
_loop_1();
|
|
237
|
-
}
|
|
238
|
-
// response.childNodes.forEach((node) => {
|
|
239
|
-
// if (node instanceof HTMLElement) {
|
|
240
|
-
// node.parentNode = null;
|
|
241
|
-
// }
|
|
242
|
-
// });
|
|
243
|
-
return root;
|
|
244
|
-
}
|
|
245
|
-
exports.default = parse;
|
|
246
|
-
});
|
|
247
|
-
define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "nodes/type", "nodes/text", "matcher", "back", "nodes/comment", "parse"], function (require, exports, he_1, css_select_1, node_2, type_3, text_1, matcher_1, back_2, comment_1, parse_1) {
|
|
209
|
+
define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "nodes/type", "nodes/text", "matcher", "back", "nodes/comment"], function (require, exports, he_1, css_select_1, node_2, type_3, text_1, matcher_1, back_1, comment_1) {
|
|
248
210
|
"use strict";
|
|
249
211
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
250
|
-
exports.base_parse = void 0;
|
|
212
|
+
exports.parse = exports.base_parse = void 0;
|
|
251
213
|
he_1 = __importDefault(he_1);
|
|
252
214
|
node_2 = __importDefault(node_2);
|
|
253
215
|
type_3 = __importDefault(type_3);
|
|
254
216
|
text_1 = __importDefault(text_1);
|
|
255
217
|
matcher_1 = __importDefault(matcher_1);
|
|
256
|
-
|
|
218
|
+
back_1 = __importDefault(back_1);
|
|
257
219
|
comment_1 = __importDefault(comment_1);
|
|
258
|
-
parse_1 = __importDefault(parse_1);
|
|
259
220
|
// const { decode } = he;
|
|
260
221
|
function decode(val) {
|
|
261
222
|
// clone string
|
|
@@ -542,7 +503,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
542
503
|
},
|
|
543
504
|
set: function (content) {
|
|
544
505
|
//const r = parse(content, global.options); // TODO global.options ?
|
|
545
|
-
var r =
|
|
506
|
+
var r = parse(content);
|
|
546
507
|
this.childNodes = r.childNodes.length ? r.childNodes : [new text_1.default(content, this)];
|
|
547
508
|
},
|
|
548
509
|
enumerable: false,
|
|
@@ -554,7 +515,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
554
515
|
content = [content];
|
|
555
516
|
}
|
|
556
517
|
else if (typeof content == 'string') {
|
|
557
|
-
var r =
|
|
518
|
+
var r = parse(content, options);
|
|
558
519
|
content = r.childNodes.length ? r.childNodes : [new text_1.default(content, this)];
|
|
559
520
|
}
|
|
560
521
|
this.childNodes = content;
|
|
@@ -571,7 +532,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
571
532
|
}
|
|
572
533
|
else if (typeof node == 'string') {
|
|
573
534
|
// const r = parse(content, global.options); // TODO global.options ?
|
|
574
|
-
var r =
|
|
535
|
+
var r = parse(node);
|
|
575
536
|
return r.childNodes.length ? r.childNodes : [new text_1.default(node, _this)];
|
|
576
537
|
}
|
|
577
538
|
return [];
|
|
@@ -782,6 +743,58 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
782
743
|
// }
|
|
783
744
|
// return null;
|
|
784
745
|
};
|
|
746
|
+
/**
|
|
747
|
+
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
|
|
748
|
+
* @param selector a DOMString containing a selector list
|
|
749
|
+
*/
|
|
750
|
+
HTMLElement.prototype.closest = function (selector) {
|
|
751
|
+
var mapChild = new Map();
|
|
752
|
+
var el = this;
|
|
753
|
+
var old = null;
|
|
754
|
+
function findOne(test, elems) {
|
|
755
|
+
var elem = null;
|
|
756
|
+
for (var i = 0, l = elems.length; i < l && !elem; i++) {
|
|
757
|
+
var el_1 = elems[i];
|
|
758
|
+
if (test(el_1)) {
|
|
759
|
+
elem = el_1;
|
|
760
|
+
}
|
|
761
|
+
else {
|
|
762
|
+
var child = mapChild.get(el_1);
|
|
763
|
+
if (child) {
|
|
764
|
+
elem = findOne(test, [child]);
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
}
|
|
768
|
+
return elem;
|
|
769
|
+
}
|
|
770
|
+
while (el) {
|
|
771
|
+
mapChild.set(el, old);
|
|
772
|
+
old = el;
|
|
773
|
+
el = el.parentNode;
|
|
774
|
+
}
|
|
775
|
+
el = this;
|
|
776
|
+
while (el) {
|
|
777
|
+
var e = css_select_1.selectOne(selector, el, {
|
|
778
|
+
xmlMode: true,
|
|
779
|
+
adapter: __assign(__assign({}, matcher_1.default), { getChildren: function (node) {
|
|
780
|
+
var child = mapChild.get(node);
|
|
781
|
+
return child && [child];
|
|
782
|
+
},
|
|
783
|
+
getSiblings: function (node) {
|
|
784
|
+
return [node];
|
|
785
|
+
},
|
|
786
|
+
findOne: findOne,
|
|
787
|
+
findAll: function () {
|
|
788
|
+
return [];
|
|
789
|
+
} })
|
|
790
|
+
});
|
|
791
|
+
if (e) {
|
|
792
|
+
return e;
|
|
793
|
+
}
|
|
794
|
+
el = el.parentNode;
|
|
795
|
+
}
|
|
796
|
+
return null;
|
|
797
|
+
};
|
|
785
798
|
/**
|
|
786
799
|
* Append a child node to childNodes
|
|
787
800
|
* @param {Node} node node to append
|
|
@@ -810,7 +823,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
810
823
|
* @return {Node} last child node
|
|
811
824
|
*/
|
|
812
825
|
get: function () {
|
|
813
|
-
return
|
|
826
|
+
return back_1.default(this.childNodes);
|
|
814
827
|
},
|
|
815
828
|
enumerable: false,
|
|
816
829
|
configurable: true
|
|
@@ -860,7 +873,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
860
873
|
}
|
|
861
874
|
var attrs = {};
|
|
862
875
|
if (this.rawAttrs) {
|
|
863
|
-
var re = /\b([a-z][a-z0-9-_]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
876
|
+
var re = /\b([a-z][a-z0-9-_:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
864
877
|
var match = void 0;
|
|
865
878
|
while ((match = re.exec(this.rawAttrs))) {
|
|
866
879
|
attrs[match[1]] = match[2] || match[3] || match[4] || null;
|
|
@@ -965,7 +978,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
965
978
|
if (arguments.length < 2) {
|
|
966
979
|
throw new Error('2 arguments required');
|
|
967
980
|
}
|
|
968
|
-
var p =
|
|
981
|
+
var p = parse(html);
|
|
969
982
|
if (where === 'afterend') {
|
|
970
983
|
var idx = this.parentNode.childNodes.findIndex(function (child) {
|
|
971
984
|
return child === _this;
|
|
@@ -1171,7 +1184,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
1171
1184
|
var match;
|
|
1172
1185
|
// https://github.com/taoqf/node-html-parser/issues/38
|
|
1173
1186
|
data = "<" + frameflag + ">" + data + "</" + frameflag + ">";
|
|
1174
|
-
var
|
|
1187
|
+
var _loop_1 = function () {
|
|
1175
1188
|
if (lastTextPos > -1) {
|
|
1176
1189
|
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
|
|
1177
1190
|
// if has content
|
|
@@ -1205,7 +1218,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
1205
1218
|
if (!match[4] && kElementsClosedByOpening[tagName]) {
|
|
1206
1219
|
if (kElementsClosedByOpening[tagName][match[2]]) {
|
|
1207
1220
|
stack.pop();
|
|
1208
|
-
currentParent =
|
|
1221
|
+
currentParent = back_1.default(stack);
|
|
1209
1222
|
}
|
|
1210
1223
|
}
|
|
1211
1224
|
// ignore container tag we add above
|
|
@@ -1248,7 +1261,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
1248
1261
|
while (true) {
|
|
1249
1262
|
if (currentParent.rawTagName === match[2]) {
|
|
1250
1263
|
stack.pop();
|
|
1251
|
-
currentParent =
|
|
1264
|
+
currentParent = back_1.default(stack);
|
|
1252
1265
|
break;
|
|
1253
1266
|
}
|
|
1254
1267
|
else {
|
|
@@ -1257,7 +1270,7 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
1257
1270
|
if (kElementsClosedByClosing[tagName]) {
|
|
1258
1271
|
if (kElementsClosedByClosing[tagName][match[2]]) {
|
|
1259
1272
|
stack.pop();
|
|
1260
|
-
currentParent =
|
|
1273
|
+
currentParent = back_1.default(stack);
|
|
1261
1274
|
continue;
|
|
1262
1275
|
}
|
|
1263
1276
|
}
|
|
@@ -1268,11 +1281,55 @@ define("nodes/html", ["require", "exports", "he", "css-select", "nodes/node", "n
|
|
|
1268
1281
|
}
|
|
1269
1282
|
};
|
|
1270
1283
|
while ((match = kMarkupPattern.exec(data))) {
|
|
1271
|
-
|
|
1284
|
+
_loop_1();
|
|
1272
1285
|
}
|
|
1273
1286
|
return stack;
|
|
1274
1287
|
}
|
|
1275
1288
|
exports.base_parse = base_parse;
|
|
1289
|
+
/**
|
|
1290
|
+
* Parses HTML and returns a root element
|
|
1291
|
+
* Parse a chuck of HTML source.
|
|
1292
|
+
*/
|
|
1293
|
+
function parse(data, options) {
|
|
1294
|
+
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
1295
|
+
var stack = base_parse(data, options);
|
|
1296
|
+
var root = stack[0];
|
|
1297
|
+
var _loop_2 = function () {
|
|
1298
|
+
// Handle each error elements.
|
|
1299
|
+
var last = stack.pop();
|
|
1300
|
+
var oneBefore = back_1.default(stack);
|
|
1301
|
+
if (last.parentNode && last.parentNode.parentNode) {
|
|
1302
|
+
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
1303
|
+
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
1304
|
+
oneBefore.removeChild(last);
|
|
1305
|
+
last.childNodes.forEach(function (child) {
|
|
1306
|
+
oneBefore.parentNode.appendChild(child);
|
|
1307
|
+
});
|
|
1308
|
+
stack.pop();
|
|
1309
|
+
}
|
|
1310
|
+
else {
|
|
1311
|
+
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
1312
|
+
oneBefore.removeChild(last);
|
|
1313
|
+
last.childNodes.forEach(function (child) {
|
|
1314
|
+
oneBefore.appendChild(child);
|
|
1315
|
+
});
|
|
1316
|
+
}
|
|
1317
|
+
}
|
|
1318
|
+
else {
|
|
1319
|
+
// If it's final element just skip.
|
|
1320
|
+
}
|
|
1321
|
+
};
|
|
1322
|
+
while (stack.length > 1) {
|
|
1323
|
+
_loop_2();
|
|
1324
|
+
}
|
|
1325
|
+
// response.childNodes.forEach((node) => {
|
|
1326
|
+
// if (node instanceof HTMLElement) {
|
|
1327
|
+
// node.parentNode = null;
|
|
1328
|
+
// }
|
|
1329
|
+
// });
|
|
1330
|
+
return root;
|
|
1331
|
+
}
|
|
1332
|
+
exports.parse = parse;
|
|
1276
1333
|
});
|
|
1277
1334
|
define("nodes/node", ["require", "exports"], function (require, exports) {
|
|
1278
1335
|
"use strict";
|
|
@@ -1342,6 +1399,12 @@ define("nodes/comment", ["require", "exports", "nodes/node", "nodes/type"], func
|
|
|
1342
1399
|
}(node_3.default));
|
|
1343
1400
|
exports.default = CommentNode;
|
|
1344
1401
|
});
|
|
1402
|
+
define("parse", ["require", "exports", "nodes/html"], function (require, exports, html_1) {
|
|
1403
|
+
"use strict";
|
|
1404
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
1405
|
+
exports.default = void 0;
|
|
1406
|
+
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return html_1.parse; } });
|
|
1407
|
+
});
|
|
1345
1408
|
define("valid", ["require", "exports", "nodes/html"], function (require, exports, html_2) {
|
|
1346
1409
|
"use strict";
|
|
1347
1410
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
@@ -1356,14 +1419,14 @@ define("valid", ["require", "exports", "nodes/html"], function (require, exports
|
|
|
1356
1419
|
}
|
|
1357
1420
|
exports.default = valid;
|
|
1358
1421
|
});
|
|
1359
|
-
define("index", ["require", "exports", "nodes/comment", "nodes/html", "parse", "valid", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, comment_2, html_3,
|
|
1422
|
+
define("index", ["require", "exports", "nodes/comment", "nodes/html", "parse", "valid", "nodes/node", "nodes/text", "nodes/type"], function (require, exports, comment_2, html_3, parse_1, valid_1, node_4, text_2, type_5) {
|
|
1360
1423
|
"use strict";
|
|
1361
1424
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
1362
1425
|
exports.NodeType = exports.TextNode = exports.Node = exports.valid = exports.default = exports.parse = exports.HTMLElement = exports.CommentNode = void 0;
|
|
1363
1426
|
Object.defineProperty(exports, "CommentNode", { enumerable: true, get: function () { return __importDefault(comment_2).default; } });
|
|
1364
1427
|
Object.defineProperty(exports, "HTMLElement", { enumerable: true, get: function () { return __importDefault(html_3).default; } });
|
|
1365
|
-
Object.defineProperty(exports, "parse", { enumerable: true, get: function () { return __importDefault(
|
|
1366
|
-
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(
|
|
1428
|
+
Object.defineProperty(exports, "parse", { enumerable: true, get: function () { return __importDefault(parse_1).default; } });
|
|
1429
|
+
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(parse_1).default; } });
|
|
1367
1430
|
Object.defineProperty(exports, "valid", { enumerable: true, get: function () { return __importDefault(valid_1).default; } });
|
|
1368
1431
|
Object.defineProperty(exports, "Node", { enumerable: true, get: function () { return __importDefault(node_4).default; } });
|
|
1369
1432
|
Object.defineProperty(exports, "TextNode", { enumerable: true, get: function () { return __importDefault(text_2).default; } });
|
package/dist/nodes/html.d.ts
CHANGED
|
@@ -122,6 +122,11 @@ export default class HTMLElement extends Node {
|
|
|
122
122
|
* @return {HTMLElement} matching node
|
|
123
123
|
*/
|
|
124
124
|
querySelector(selector: string): HTMLElement;
|
|
125
|
+
/**
|
|
126
|
+
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
|
|
127
|
+
* @param selector a DOMString containing a selector list
|
|
128
|
+
*/
|
|
129
|
+
closest(selector: string): Node;
|
|
125
130
|
/**
|
|
126
131
|
* Append a child node to childNodes
|
|
127
132
|
* @param {Node} node node to append
|
|
@@ -187,4 +192,9 @@ export interface Options {
|
|
|
187
192
|
* @return {HTMLElement} root element
|
|
188
193
|
*/
|
|
189
194
|
export declare function base_parse(data: string, options?: Partial<Options>): HTMLElement[];
|
|
195
|
+
/**
|
|
196
|
+
* Parses HTML and returns a root element
|
|
197
|
+
* Parse a chuck of HTML source.
|
|
198
|
+
*/
|
|
199
|
+
export declare function parse(data: string, options?: Partial<Options>): HTMLElement;
|
|
190
200
|
export {};
|
package/dist/nodes/html.js
CHANGED
|
@@ -14,6 +14,17 @@ var __extends = (this && this.__extends) || (function () {
|
|
|
14
14
|
d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
|
|
15
15
|
};
|
|
16
16
|
})();
|
|
17
|
+
var __assign = (this && this.__assign) || function () {
|
|
18
|
+
__assign = Object.assign || function(t) {
|
|
19
|
+
for (var s, i = 1, n = arguments.length; i < n; i++) {
|
|
20
|
+
s = arguments[i];
|
|
21
|
+
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
|
|
22
|
+
t[p] = s[p];
|
|
23
|
+
}
|
|
24
|
+
return t;
|
|
25
|
+
};
|
|
26
|
+
return __assign.apply(this, arguments);
|
|
27
|
+
};
|
|
17
28
|
var __spreadArray = (this && this.__spreadArray) || function (to, from) {
|
|
18
29
|
for (var i = 0, il = from.length, j = to.length; i < il; i++, j++)
|
|
19
30
|
to[j] = from[i];
|
|
@@ -23,7 +34,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
23
34
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
24
35
|
};
|
|
25
36
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
-
exports.base_parse = void 0;
|
|
37
|
+
exports.parse = exports.base_parse = void 0;
|
|
27
38
|
var he_1 = __importDefault(require("he"));
|
|
28
39
|
var css_select_1 = require("css-select");
|
|
29
40
|
var node_1 = __importDefault(require("./node"));
|
|
@@ -32,7 +43,6 @@ var text_1 = __importDefault(require("./text"));
|
|
|
32
43
|
var matcher_1 = __importDefault(require("../matcher"));
|
|
33
44
|
var back_1 = __importDefault(require("../back"));
|
|
34
45
|
var comment_1 = __importDefault(require("./comment"));
|
|
35
|
-
var parse_1 = __importDefault(require("../parse"));
|
|
36
46
|
// const { decode } = he;
|
|
37
47
|
function decode(val) {
|
|
38
48
|
// clone string
|
|
@@ -319,7 +329,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
319
329
|
},
|
|
320
330
|
set: function (content) {
|
|
321
331
|
//const r = parse(content, global.options); // TODO global.options ?
|
|
322
|
-
var r =
|
|
332
|
+
var r = parse(content);
|
|
323
333
|
this.childNodes = r.childNodes.length ? r.childNodes : [new text_1.default(content, this)];
|
|
324
334
|
},
|
|
325
335
|
enumerable: false,
|
|
@@ -331,7 +341,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
331
341
|
content = [content];
|
|
332
342
|
}
|
|
333
343
|
else if (typeof content == 'string') {
|
|
334
|
-
var r =
|
|
344
|
+
var r = parse(content, options);
|
|
335
345
|
content = r.childNodes.length ? r.childNodes : [new text_1.default(content, this)];
|
|
336
346
|
}
|
|
337
347
|
this.childNodes = content;
|
|
@@ -348,7 +358,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
348
358
|
}
|
|
349
359
|
else if (typeof node == 'string') {
|
|
350
360
|
// const r = parse(content, global.options); // TODO global.options ?
|
|
351
|
-
var r =
|
|
361
|
+
var r = parse(node);
|
|
352
362
|
return r.childNodes.length ? r.childNodes : [new text_1.default(node, _this)];
|
|
353
363
|
}
|
|
354
364
|
return [];
|
|
@@ -559,6 +569,58 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
559
569
|
// }
|
|
560
570
|
// return null;
|
|
561
571
|
};
|
|
572
|
+
/**
|
|
573
|
+
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
|
|
574
|
+
* @param selector a DOMString containing a selector list
|
|
575
|
+
*/
|
|
576
|
+
HTMLElement.prototype.closest = function (selector) {
|
|
577
|
+
var mapChild = new Map();
|
|
578
|
+
var el = this;
|
|
579
|
+
var old = null;
|
|
580
|
+
function findOne(test, elems) {
|
|
581
|
+
var elem = null;
|
|
582
|
+
for (var i = 0, l = elems.length; i < l && !elem; i++) {
|
|
583
|
+
var el_1 = elems[i];
|
|
584
|
+
if (test(el_1)) {
|
|
585
|
+
elem = el_1;
|
|
586
|
+
}
|
|
587
|
+
else {
|
|
588
|
+
var child = mapChild.get(el_1);
|
|
589
|
+
if (child) {
|
|
590
|
+
elem = findOne(test, [child]);
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
return elem;
|
|
595
|
+
}
|
|
596
|
+
while (el) {
|
|
597
|
+
mapChild.set(el, old);
|
|
598
|
+
old = el;
|
|
599
|
+
el = el.parentNode;
|
|
600
|
+
}
|
|
601
|
+
el = this;
|
|
602
|
+
while (el) {
|
|
603
|
+
var e = css_select_1.selectOne(selector, el, {
|
|
604
|
+
xmlMode: true,
|
|
605
|
+
adapter: __assign(__assign({}, matcher_1.default), { getChildren: function (node) {
|
|
606
|
+
var child = mapChild.get(node);
|
|
607
|
+
return child && [child];
|
|
608
|
+
},
|
|
609
|
+
getSiblings: function (node) {
|
|
610
|
+
return [node];
|
|
611
|
+
},
|
|
612
|
+
findOne: findOne,
|
|
613
|
+
findAll: function () {
|
|
614
|
+
return [];
|
|
615
|
+
} })
|
|
616
|
+
});
|
|
617
|
+
if (e) {
|
|
618
|
+
return e;
|
|
619
|
+
}
|
|
620
|
+
el = el.parentNode;
|
|
621
|
+
}
|
|
622
|
+
return null;
|
|
623
|
+
};
|
|
562
624
|
/**
|
|
563
625
|
* Append a child node to childNodes
|
|
564
626
|
* @param {Node} node node to append
|
|
@@ -637,7 +699,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
637
699
|
}
|
|
638
700
|
var attrs = {};
|
|
639
701
|
if (this.rawAttrs) {
|
|
640
|
-
var re = /\b([a-z][a-z0-9-_]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
702
|
+
var re = /\b([a-z][a-z0-9-_:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
|
|
641
703
|
var match = void 0;
|
|
642
704
|
while ((match = re.exec(this.rawAttrs))) {
|
|
643
705
|
attrs[match[1]] = match[2] || match[3] || match[4] || null;
|
|
@@ -742,7 +804,7 @@ var HTMLElement = /** @class */ (function (_super) {
|
|
|
742
804
|
if (arguments.length < 2) {
|
|
743
805
|
throw new Error('2 arguments required');
|
|
744
806
|
}
|
|
745
|
-
var p =
|
|
807
|
+
var p = parse(html);
|
|
746
808
|
if (where === 'afterend') {
|
|
747
809
|
var idx = this.parentNode.childNodes.findIndex(function (child) {
|
|
748
810
|
return child === _this;
|
|
@@ -1050,3 +1112,47 @@ function base_parse(data, options) {
|
|
|
1050
1112
|
return stack;
|
|
1051
1113
|
}
|
|
1052
1114
|
exports.base_parse = base_parse;
|
|
1115
|
+
/**
|
|
1116
|
+
* Parses HTML and returns a root element
|
|
1117
|
+
* Parse a chuck of HTML source.
|
|
1118
|
+
*/
|
|
1119
|
+
function parse(data, options) {
|
|
1120
|
+
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
1121
|
+
var stack = base_parse(data, options);
|
|
1122
|
+
var root = stack[0];
|
|
1123
|
+
var _loop_2 = function () {
|
|
1124
|
+
// Handle each error elements.
|
|
1125
|
+
var last = stack.pop();
|
|
1126
|
+
var oneBefore = back_1.default(stack);
|
|
1127
|
+
if (last.parentNode && last.parentNode.parentNode) {
|
|
1128
|
+
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
1129
|
+
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
1130
|
+
oneBefore.removeChild(last);
|
|
1131
|
+
last.childNodes.forEach(function (child) {
|
|
1132
|
+
oneBefore.parentNode.appendChild(child);
|
|
1133
|
+
});
|
|
1134
|
+
stack.pop();
|
|
1135
|
+
}
|
|
1136
|
+
else {
|
|
1137
|
+
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
1138
|
+
oneBefore.removeChild(last);
|
|
1139
|
+
last.childNodes.forEach(function (child) {
|
|
1140
|
+
oneBefore.appendChild(child);
|
|
1141
|
+
});
|
|
1142
|
+
}
|
|
1143
|
+
}
|
|
1144
|
+
else {
|
|
1145
|
+
// If it's final element just skip.
|
|
1146
|
+
}
|
|
1147
|
+
};
|
|
1148
|
+
while (stack.length > 1) {
|
|
1149
|
+
_loop_2();
|
|
1150
|
+
}
|
|
1151
|
+
// response.childNodes.forEach((node) => {
|
|
1152
|
+
// if (node instanceof HTMLElement) {
|
|
1153
|
+
// node.parentNode = null;
|
|
1154
|
+
// }
|
|
1155
|
+
// });
|
|
1156
|
+
return root;
|
|
1157
|
+
}
|
|
1158
|
+
exports.parse = parse;
|
package/dist/parse.d.ts
CHANGED
|
@@ -1,6 +1 @@
|
|
|
1
|
-
import { Options } from './nodes/html';
|
2
|
-
/**
|
|
3
|
-
* Parses HTML and returns a root element
|
|
4
|
-
* Parse a chuck of HTML source.
|
|
5
|
-
*/
|
|
6
|
-
export default function parse(data: string, options?: Partial<Options>): import("./nodes/html").default;
|
|
1
|
+
export { parse as default } from './nodes/html';
|
package/dist/parse.js
CHANGED
|
@@ -1,51 +1,5 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
var back_1 = __importDefault(require("./back"));
|
3
|
+
exports.default = void 0;
|
|
7
4
|
var html_1 = require("./nodes/html");
|
|
8
|
-
/**
|
9
|
-
* Parses HTML and returns a root element
|
|
10
|
-
* Parse a chuck of HTML source.
|
|
11
|
-
*/
|
|
12
|
-
function parse(data, options) {
|
|
13
|
-
if (options === void 0) { options = { lowerCaseTagName: false, comment: false }; }
|
|
14
|
-
var stack = html_1.base_parse(data, options);
|
|
15
|
-
var root = stack[0];
|
|
16
|
-
var _loop_1 = function () {
|
|
17
|
-
// Handle each error elements.
|
|
18
|
-
var last = stack.pop();
|
|
19
|
-
var oneBefore = back_1.default(stack);
|
|
20
|
-
if (last.parentNode && last.parentNode.parentNode) {
|
|
21
|
-
if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
|
|
22
|
-
// Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
|
|
23
|
-
oneBefore.removeChild(last);
|
|
24
|
-
last.childNodes.forEach(function (child) {
|
|
25
|
-
oneBefore.parentNode.appendChild(child);
|
|
26
|
-
});
|
|
27
|
-
stack.pop();
|
|
28
|
-
}
|
|
29
|
-
else {
|
|
30
|
-
// Single error <div> <h3> </div> handle: Just removes <h3>
|
|
31
|
-
oneBefore.removeChild(last);
|
|
32
|
-
last.childNodes.forEach(function (child) {
|
|
33
|
-
oneBefore.appendChild(child);
|
|
34
|
-
});
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
else {
|
|
38
|
-
// If it's final element just skip.
|
|
39
|
-
}
|
|
40
|
-
};
|
|
41
|
-
while (stack.length > 1) {
|
|
42
|
-
_loop_1();
|
|
43
|
-
}
|
|
44
|
-
// response.childNodes.forEach((node) => {
|
|
45
|
-
// if (node instanceof HTMLElement) {
|
|
46
|
-
// node.parentNode = null;
|
|
47
|
-
// }
|
|
48
|
-
// });
|
|
49
|
-
return root;
|
|
50
|
-
}
|
|
51
|
-
exports.default = parse;
|
|
5
|
+
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return html_1.parse; } });
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-html-parser",
|
|
3
|
-
"version": "3.1.4",
|
|
3
|
+
"version": "3.3.0",
|
|
4
4
|
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/esm/index.js",
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"build": "npm run lint && npm run clean && npm run ts:cjs && npm run ts:amd && npm run ts:esm",
|
|
16
16
|
"dev": "tsc -w & mocha -w ./test/*.js",
|
|
17
17
|
"pretest": "tsc -m commonjs",
|
|
18
|
-
"release": "np"
|
|
18
|
+
"release": "yarn build && np"
|
|
19
19
|
},
|
|
20
20
|
"keywords": [
|
|
21
21
|
"parser",
|
|
@@ -81,5 +81,6 @@
|
|
|
81
81
|
"bugs": {
|
|
82
82
|
"url": "https://github.com/taoqf/node-fast-html-parser/issues"
|
|
83
83
|
},
|
|
84
|
-
"homepage": "https://github.com/taoqf/node-fast-html-parser"
|
|
84
|
+
"homepage": "https://github.com/taoqf/node-fast-html-parser",
|
|
85
|
+
"sideEffects": false
|
|
85
86
|
}
|