node-html-parser 4.1.4 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1102 +0,0 @@
1
- import he from 'he';
2
- import { selectAll, selectOne } from 'css-select';
3
- import Node from './node';
4
- import NodeType from './type';
5
- import TextNode from './text';
6
- import Matcher from '../matcher';
7
- import arr_back from '../back';
8
- import CommentNode from './comment';
9
- // const { decode } = he;
10
/**
 * Decode the HTML entities contained in `val`.
 *
 * The decoded text is serialised to JSON and parsed back, so the caller
 * receives a string rebuilt from scratch ("clone string") instead of the
 * value handed back by `he.decode`.
 * @param {string} val text that may contain HTML entities
 * @return {string} decoded text
 */
function decode(val) {
    const decoded = he.decode(val);
    const serialised = JSON.stringify(decoded);
    return JSON.parse(serialised);
}
14
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
// Names of block-level elements. Each name is registered in lower case and in
// upper case (lower-case form first), generated from a single list so the two
// spellings cannot drift apart.
const kBlockElements = new Set([
    'address',
    'article',
    'aside',
    'blockquote',
    'br',
    'details',
    'dialog',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'main',
    'nav',
    'ol',
    'p',
    'pre',
    'section',
    'table',
    'td',
    'tr',
    'ul',
].flatMap((tag) => [tag, tag.toUpperCase()]));
88
/**
 * Small stand-in for the DOM `DOMTokenList`, backed by a `Set`.
 *
 * Tokens keep their insertion order. After every mutation the `afterUpdate`
 * callback supplied to the constructor is invoked with this list.
 */
class DOMTokenList {
    /**
     * @param {Iterable<string>} [valuesInit] initial tokens
     * @param {Function} [afterUpdate] called with this list after each change
     */
    constructor(valuesInit = [], afterUpdate = (() => null)) {
        this._set = new Set(valuesInit);
        this._afterUpdate = afterUpdate;
    }
    /**
     * Reject tokens that contain whitespace.
     * @param {string} c token to check
     * @throws {Error} when the token contains a whitespace character
     */
    _validate(c) {
        if (/\s/.test(c)) {
            throw new Error(`DOMException in DOMTokenList.add: The token '${c}' contains HTML space characters, which are not valid in tokens.`);
        }
    }
    /**
     * Add a token and notify the owner.
     * @param {string} c token to add
     */
    add(c) {
        this._validate(c);
        this._set.add(c);
        this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
    }
    /**
     * Drop `c1` (if present), add `c2`, and notify the owner.
     * @param {string} c1 token to remove
     * @param {string} c2 token to add
     * @return {boolean} true when `c1` was in the list
     */
    replace(c1, c2) {
        this._validate(c2);
        const hadOld = this._set.delete(c1);
        this._set.add(c2);
        this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
        return hadOld;
    }
    /**
     * Remove a token; the owner is only notified when something was removed.
     * @param {string} c token to remove
     */
    remove(c) {
        if (this._set.delete(c)) {
            this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
        }
    }
    /**
     * Remove the token when present, add it otherwise, then notify the owner.
     * @param {string} c token to flip
     * @return {boolean} true when the token is in the list afterwards
     */
    toggle(c) {
        this._validate(c);
        const nowPresent = !this._set.has(c);
        if (nowPresent) {
            this._set.add(c);
        }
        else {
            this._set.delete(c);
        }
        this._afterUpdate(this); // eslint-disable-line @typescript-eslint/no-unsafe-call
        return nowPresent;
    }
    /**
     * @param {string} c token to look up
     * @return {boolean} true when the token is in the list
     */
    contains(c) {
        return this._set.has(c);
    }
    /** Number of tokens in the list. */
    get length() {
        return this._set.size;
    }
    /**
     * @return {Iterator<string>} iterator over the tokens in insertion order
     */
    values() {
        return this._set.values();
    }
    /** The tokens as a fresh array (insertion order). */
    get value() {
        return [...this._set];
    }
    /**
     * @return {string} the tokens joined by single spaces
     */
    toString() {
        return [...this._set].join(' ');
    }
}
137
/**
 * Make `parent` the parentNode of every node object in `nodes`.
 * @param {Array} nodes nodes that are being inserted below `parent`
 * @param {Node} parent their new parent
 * @return {Array} the same `nodes` array, for chaining
 */
function adoptNodes(nodes, parent) {
    for (const node of nodes) {
        // Only objects can carry a parent link; anything else is left alone.
        if (node !== null && typeof node === 'object') {
            node.parentNode = parent;
        }
    }
    return nodes;
}
/**
 * HTMLElement, which contains a set of children.
 *
 * Note: this is a minimalist implementation; only part of the DOM tree API
 * is available on this class (parentNode, firstChild, lastChild,
 * nextSibling and nextElementSibling).
 * @class HTMLElement
 * @extends {Node}
 */
export default class HTMLElement extends Node {
    /**
     * Creates an instance of HTMLElement.
     * @param {string} tagName tag name as written in the source
     * @param {Object} keyAttrs id and class attribute
     * @param {string} [rawAttrs] attributes in string
     * @param {Node} [parentNode] parent node, forwarded to the Node constructor
     * @param {Array} [range] source range, forwarded to the Node constructor
     *
     * @memberof HTMLElement
     */
    constructor(tagName, keyAttrs, rawAttrs = '', parentNode, range) {
        super(parentNode, range);
        /**
         * Node Type declaration.
         */
        this.nodeType = NodeType.ELEMENT_NODE;
        this.rawTagName = tagName;
        this.rawAttrs = rawAttrs || '';
        this.id = keyAttrs.id || '';
        this.childNodes = [];
        // Leading/trailing whitespace in the class attribute would otherwise
        // produce empty tokens.
        const classNames = keyAttrs.class ? keyAttrs.class.split(/\s+/).filter(Boolean) : [];
        // Every classList change is written back to the class attribute.
        this.classList = new DOMTokenList(classNames, (classList) => (this.setAttribute('class', classList.toString()) // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
        ));
        if (!rawAttrs) {
            // No attribute string was supplied: synthesise one from the key attributes.
            const synthesised = [];
            if (keyAttrs.id) {
                synthesised.push(`id="${keyAttrs.id}"`);
            }
            if (keyAttrs.class) {
                synthesised.push(`class="${this.classList.toString()}"`);
            }
            this.rawAttrs = synthesised.join(' ');
        }
    }
    /**
     * Quote attribute values
     * @param {string|null} attr attribute value
     * @returns {string} quoted value; the literal text `null` for a null value
     */
    quoteAttribute(attr) {
        if (attr === null) {
            return 'null';
        }
        // A double quote inside the value must be written as an entity,
        // otherwise it would terminate the quoted attribute value.
        return JSON.stringify(attr.replace(/"/g, '&quot;'));
    }
    /**
     * Remove current element from its parent's childNodes.
     * The element's own parentNode reference is left untouched.
     */
    remove() {
        if (this.parentNode) {
            const children = this.parentNode.childNodes;
            this.parentNode.childNodes = children.filter((child) => {
                return this !== child;
            });
        }
    }
    /**
     * Remove Child element from childNodes array
     * @param {HTMLElement} node node to remove
     */
    removeChild(node) {
        this.childNodes = this.childNodes.filter((child) => {
            return (child !== node);
        });
    }
    /**
     * Exchanges given child with new child
     * @param {HTMLElement} oldNode node to exchange
     * @param {HTMLElement} newNode new node
     */
    exchangeChild(oldNode, newNode) {
        const children = this.childNodes;
        this.childNodes = children.map((child) => {
            if (child === oldNode) {
                return newNode;
            }
            return child;
        });
    }
    /** Upper-cased tag name; a falsy rawTagName is returned unchanged. */
    get tagName() {
        return this.rawTagName ? this.rawTagName.toUpperCase() : this.rawTagName;
    }
    /** Lower-cased tag name. */
    get localName() {
        return this.rawTagName.toLowerCase();
    }
    /**
     * Get escaped (as-is) text value of current node and its children.
     * @return {string} text content
     */
    get rawText() {
        return this.childNodes.reduce((pre, cur) => {
            return (pre += cur.rawText);
        }, '');
    }
    /** Entity-decoded text of this node and its children. */
    get textContent() {
        return decode(this.rawText);
    }
    /** Replace all children with a single text node holding `val`. */
    set textContent(val) {
        const content = [new TextNode(val, this)];
        this.childNodes = content;
    }
    /**
     * Get unescaped text value of current node and its children.
     * @return {string} text content
     */
    get text() {
        return decode(this.rawText);
    }
    /**
     * Get structured Text (with '\n' etc.)
     * @return {string} structured text
     */
    get structuredText() {
        let currentBlock = [];
        const blocks = [currentBlock];
        function dfs(node) {
            if (node.nodeType === NodeType.ELEMENT_NODE) {
                if (kBlockElements.has(node.rawTagName)) {
                    // A block element starts a new line before and after its content.
                    if (currentBlock.length > 0) {
                        blocks.push(currentBlock = []);
                    }
                    node.childNodes.forEach(dfs);
                    if (currentBlock.length > 0) {
                        blocks.push(currentBlock = []);
                    }
                }
                else {
                    node.childNodes.forEach(dfs);
                }
            }
            else if (node.nodeType === NodeType.TEXT_NODE) {
                if (node.isWhitespace) {
                    // Whitespace node, postponed output
                    currentBlock.prependWhitespace = true;
                }
                else {
                    let text = node.trimmedText;
                    if (currentBlock.prependWhitespace) {
                        text = ` ${text}`;
                        currentBlock.prependWhitespace = false;
                    }
                    currentBlock.push(text);
                }
            }
        }
        dfs(this);
        return blocks.map((block) => {
            // Normalize each line's whitespace
            return block.join('').replace(/\s{2,}/g, ' ');
        })
            .join('\n').replace(/\s+$/, ''); // trimRight;
    }
    /**
     * Serialise this element and its subtree to HTML.
     * An element without a tag name serialises to its children only.
     * @return {string} html
     */
    toString() {
        const tag = this.rawTagName;
        if (tag) {
            const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
            const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
            if (is_void) {
                return `<${tag}${attrs}>`;
            }
            return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
        }
        return this.innerHTML;
    }
    /** HTML of the children, concatenated. */
    get innerHTML() {
        return this.childNodes.map((child) => {
            return child.toString();
        }).join('');
    }
    /**
     * Replace the children with the result of parsing `content`; when parsing
     * yields no nodes the content is kept as a single text node.
     */
    set innerHTML(content) {
        //const r = parse(content, global.options); // TODO global.options ?
        const r = parse(content);
        const children = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
        // The parsed nodes still point at the temporary parse root.
        this.childNodes = adoptNodes(children, this);
    }
    /**
     * Replace the children of this element.
     * @param {Node|Node[]|string} content a node, a list of nodes, or HTML to parse
     * @param {Object} [options] options handed to `parse` for string content
     */
    set_content(content, options = {}) {
        let children = content;
        if (content instanceof Node) {
            children = [content];
        }
        else if (typeof content == 'string') {
            const r = parse(content, options);
            children = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
        }
        if (Array.isArray(children)) {
            adoptNodes(children, this);
        }
        this.childNodes = children;
    }
    /**
     * Replace this element, inside its parent, with the given nodes.
     * Strings are parsed as HTML. Nothing happens when this element has no
     * parent or is not among its parent's children.
     * @param {...(Node|string)} nodes replacement nodes or HTML strings
     */
    replaceWith(...nodes) {
        const parent = this.parentNode;
        if (!parent) {
            return;
        }
        const idx = parent.childNodes.findIndex((child) => {
            return child === this;
        });
        if (idx === -1) {
            // Slicing around index -1 would duplicate the parent's children.
            return;
        }
        const content = nodes.map((node) => {
            if (node instanceof Node) {
                return [node];
            }
            else if (typeof node == 'string') {
                // const r = parse(content, global.options); // TODO global.options ?
                const r = parse(node);
                return r.childNodes.length ? r.childNodes : [new TextNode(node, this)];
            }
            return [];
        }).flat();
        parent.childNodes = [
            ...parent.childNodes.slice(0, idx),
            ...adoptNodes(content, parent),
            ...parent.childNodes.slice(idx + 1),
        ];
    }
    /** Same as toString(). */
    get outerHTML() {
        return this.toString();
    }
    /**
     * Trim element from right (in block) after seeing pattern in a TextNode.
     * @param {RegExp} pattern pattern to find
     * @return {HTMLElement} reference to current node
     */
    trimRight(pattern) {
        for (let i = 0; i < this.childNodes.length; i++) {
            const childNode = this.childNodes[i];
            if (childNode.nodeType === NodeType.ELEMENT_NODE) {
                childNode.trimRight(pattern);
            }
            else {
                const index = childNode.rawText.search(pattern);
                if (index > -1) {
                    childNode.rawText = childNode.rawText.slice(0, index);
                    // trim all following nodes.
                    this.childNodes.length = i + 1;
                }
            }
        }
        return this;
    }
    /**
     * Get DOM structure
     * @return {string} structure, one line per element or non-blank text node
     */
    get structure() {
        const res = [];
        let indention = 0;
        function write(str) {
            res.push(' '.repeat(indention) + str);
        }
        function dfs(node) {
            const idStr = node.id ? (`#${node.id}`) : '';
            const classStr = node.classList.length ? (`.${node.classList.value.join('.')}`) : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
            write(`${node.rawTagName}${idStr}${classStr}`);
            indention++;
            node.childNodes.forEach((childNode) => {
                if (childNode.nodeType === NodeType.ELEMENT_NODE) {
                    dfs(childNode);
                }
                else if (childNode.nodeType === NodeType.TEXT_NODE) {
                    if (!childNode.isWhitespace) {
                        write('#text');
                    }
                }
            });
            indention--;
        }
        dfs(this);
        return res.join('\n');
    }
    /**
     * Remove whitespaces in this sub tree.
     * @return {HTMLElement} pointer to this
     */
    removeWhitespace() {
        // Compact childNodes in place: `o` is the next free slot.
        let o = 0;
        this.childNodes.forEach((node) => {
            if (node.nodeType === NodeType.TEXT_NODE) {
                if (node.isWhitespace) {
                    return;
                }
                node.rawText = node.trimmedRawText;
            }
            else if (node.nodeType === NodeType.ELEMENT_NODE) {
                node.removeWhitespace();
            }
            this.childNodes[o++] = node;
        });
        this.childNodes.length = o;
        return this;
    }
    /**
     * Query CSS selector to find matching nodes.
     * @param {string} selector CSS selector, evaluated by css-select
     * @return {HTMLElement[]} matching elements
     */
    querySelectorAll(selector) {
        return selectAll(selector, this, {
            xmlMode: true,
            adapter: Matcher
        });
    }
    /**
     * Query CSS Selector to find matching node.
     * @param {string} selector CSS selector, evaluated by css-select
     * @return {HTMLElement} matching node
     */
    querySelector(selector) {
        return selectOne(selector, this, {
            xmlMode: true,
            adapter: Matcher
        });
    }
    /**
     * traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
     * @param selector a DOMString containing a selector list
     */
    closest(selector) {
        // Maps every ancestor to the child through which this element is reached,
        // so the adapter below only ever walks the ancestor chain.
        const mapChild = new Map();
        let el = this;
        let old = null;
        function findOne(test, elems) {
            let elem = null;
            for (let i = 0, l = elems.length; i < l && !elem; i++) {
                const el = elems[i];
                if (test(el)) {
                    elem = el;
                }
                else {
                    const child = mapChild.get(el);
                    if (child) {
                        elem = findOne(test, [child]);
                    }
                }
            }
            return elem;
        }
        while (el) {
            mapChild.set(el, old);
            old = el;
            el = el.parentNode;
        }
        el = this;
        while (el) {
            const e = selectOne(selector, el, {
                xmlMode: true,
                adapter: {
                    ...Matcher,
                    getChildren(node) {
                        const child = mapChild.get(node);
                        return child && [child];
                    },
                    getSiblings(node) {
                        return [node];
                    },
                    findOne,
                    findAll() {
                        return [];
                    }
                }
            });
            if (e) {
                return e;
            }
            el = el.parentNode;
        }
        return null;
    }
    /**
     * Append a child node to childNodes
     * @param {Node} node node to append
     * @return {Node} node appended
     */
    appendChild(node) {
        this.childNodes.push(node);
        node.parentNode = this;
        return node;
    }
    /**
     * Get first child node
     * @return {Node} first child node
     */
    get firstChild() {
        return this.childNodes[0];
    }
    /**
     * Get last child node
     * @return {Node} last child node
     */
    get lastChild() {
        return arr_back(this.childNodes);
    }
    /**
     * Get attributes
     * @access private
     * @return {Object} parsed and unescaped attributes, keyed by lower-cased name (cached)
     */
    get attrs() {
        if (this._attrs) {
            return this._attrs;
        }
        this._attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            this._attrs[key.toLowerCase()] = decode(val);
        }
        return this._attrs;
    }
    /**
     * Get unescaped attributes, keyed by the name as written in the source.
     * @return {Object} a fresh object on every access
     */
    get attributes() {
        const ret_attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            ret_attrs[key] = decode(val);
        }
        return ret_attrs;
    }
    /**
     * Get escaped (as-is) attributes
     * @return {Object} parsed attributes (cached); an attribute without a value maps to null
     */
    get rawAttributes() {
        if (this._rawAttrs) {
            return this._rawAttrs;
        }
        const attrs = {};
        if (this.rawAttrs) {
            const re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
            let match;
            while ((match = re.exec(this.rawAttrs))) {
                attrs[match[1]] = match[2] || match[3] || match[4] || null;
            }
        }
        this._rawAttrs = attrs;
        return attrs;
    }
    /**
     * Remove an attribute. The name is matched case-insensitively, like
     * getAttribute / hasAttribute / setAttribute do.
     * @param {string} key The attribute name
     */
    removeAttribute(key) {
        const attrs = this.rawAttributes;
        const lowerKey = key.toLowerCase();
        for (const name of Object.keys(attrs)) {
            if (name.toLowerCase() === lowerKey) {
                delete attrs[name];
            }
        }
        // Update this.attribute (its keys are always lower case)
        if (this._attrs) {
            delete this._attrs[lowerKey];
        }
        // Update rawString
        this.rawAttrs = Object.keys(attrs).map((name) => {
            const val = attrs[name];
            if (val === undefined || val === null) {
                return name;
            }
            return `${name}=${this.quoteAttribute(String(val))}`;
        }).join(' ');
        // Update this.id
        if (lowerKey === 'id') {
            this.id = '';
        }
    }
    /**
     * @param {string} key The attribute name (case-insensitive)
     * @return {boolean} true when the attribute is present
     */
    hasAttribute(key) {
        return key.toLowerCase() in this.attrs;
    }
    /**
     * Get an attribute
     * @param {string} key The attribute name (case-insensitive)
     * @return {string} value of the attribute
     */
    getAttribute(key) {
        return this.attrs[key.toLowerCase()];
    }
    /**
     * Set an attribute value to the HTMLElement
     * @param {string} key The attribute name; an existing attribute is matched case-insensitively
     * @param {string} value The value to set; it is converted with String(), and an empty value serialises as a bare attribute name
     */
    setAttribute(key, value) {
        if (arguments.length < 2) {
            throw new Error('Failed to execute \'setAttribute\' on \'Element\'');
        }
        const k2 = key.toLowerCase();
        const attrs = this.rawAttributes;
        // Reuse the spelling of an attribute that is already present.
        for (const k in attrs) {
            if (k.toLowerCase() === k2) {
                key = k;
                break;
            }
        }
        attrs[key] = String(value);
        // update this.attrs
        if (this._attrs) {
            this._attrs[k2] = decode(attrs[key]);
        }
        // Update rawString
        this.rawAttrs = Object.keys(attrs).map((name) => {
            const val = this.quoteAttribute(attrs[name]);
            if (val === 'null' || val === '""') {
                return name;
            }
            return `${name}=${val}`;
        }).join(' ');
        // Update this.id (the stored key may be spelled e.g. "ID")
        if (k2 === 'id') {
            this.id = attrs[key];
        }
    }
    /**
     * Replace all the attributes of the HTMLElement by the provided attributes
     * @param {Attributes} attributes the new attribute set
     */
    setAttributes(attributes) {
        // Invalidate current this.attributes
        if (this._attrs) {
            delete this._attrs;
        }
        // Invalidate current this.rawAttributes
        if (this._rawAttrs) {
            delete this._rawAttrs;
        }
        // Update rawString
        this.rawAttrs = Object.keys(attributes).map((name) => {
            const val = attributes[name];
            if (val === 'null' || val === '""') {
                return name;
            }
            return `${name}=${this.quoteAttribute(String(val))}`;
        }).join(' ');
    }
    /**
     * Parse `html` and insert the resulting nodes relative to this element.
     * @param {string} where one of 'beforebegin', 'afterbegin', 'beforeend', 'afterend'
     * @param {string} html markup to parse and insert
     * @throws {Error} when fewer than 2 arguments are given or `where` is not one of the four positions
     */
    insertAdjacentHTML(where, html) {
        if (arguments.length < 2) {
            throw new Error('2 arguments required');
        }
        const p = parse(html);
        if (where === 'afterend') {
            const idx = this.parentNode.childNodes.findIndex((child) => {
                return child === this;
            });
            this.parentNode.childNodes.splice(idx + 1, 0, ...adoptNodes(p.childNodes, this.parentNode));
        }
        else if (where === 'afterbegin') {
            this.childNodes.unshift(...adoptNodes(p.childNodes, this));
        }
        else if (where === 'beforeend') {
            p.childNodes.forEach((n) => {
                this.appendChild(n);
            });
        }
        else if (where === 'beforebegin') {
            const idx = this.parentNode.childNodes.findIndex((child) => {
                return child === this;
            });
            this.parentNode.childNodes.splice(idx, 0, ...adoptNodes(p.childNodes, this.parentNode));
        }
        else {
            throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
        }
    }
    /**
     * The node following this element in its parent's childNodes, or null
     * when there is none (including when this element has no parent).
     */
    get nextSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const idx = children.indexOf(this);
        if (idx === -1) {
            return null;
        }
        return children[idx + 1] || null;
    }
    /**
     * The first HTMLElement following this element in its parent's
     * childNodes, or null when there is none.
     */
    get nextElementSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const idx = children.indexOf(this);
        if (idx === -1) {
            return null;
        }
        for (let i = idx + 1; i < children.length; i++) {
            if (children[i] instanceof HTMLElement) {
                return children[i];
            }
        }
        return null;
    }
    /** The class names joined by single spaces. */
    get classNames() {
        return this.classList.toString();
    }
}
839
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
// Matches a whole comment, or one opening/closing tag. Groups for a tag:
// 1 = "/" of a closing tag, 2 = tag name, 3 = attribute text, 4 = "/" of a self-closed tag.
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*([^>]*?)(\/?)>/ig;
// Picks the id and class attributes out of an attribute string. Group 2 is the
// name; the value is in group 4 (double-quoted), 5 (single-quoted) or 6 (unquoted).
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/ig;
/**
 * Build a `{ name: true }` lookup from a list of names.
 * @param {string[]} names keys of the lookup
 * @return {Object} lookup object
 */
function tagFlags(names) {
    return Object.fromEntries(names.map((name) => [name, true]));
}
/**
 * Expand lower-case rules into a table that is keyed by the lower-case and the
 * upper-case tag name, each mapping to a lookup of the related tag names in
 * both spellings (all lower-case names first, then the upper-case ones).
 * @param {Object} rules lower-case tag name -> list of lower-case tag names
 * @return {Object} expanded table
 */
function tagRuleTable(rules) {
    const table = {};
    for (const [tag, related] of Object.entries(rules)) {
        const names = [...related, ...related.map((name) => name.toUpperCase())];
        table[tag] = tagFlags(names);
        table[tag.toUpperCase()] = tagFlags(names);
    }
    return table;
}
// Elements that never have content or a closing tag (both spellings).
const kSelfClosingElements = tagFlags([
    'area',
    'base',
    'br',
    'col',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'source',
    'embed',
    'param',
    'track',
    'wbr',
].flatMap((tag) => [tag, tag.toUpperCase()]));
// open element -> tags whose opening implicitly closes it
const kElementsClosedByOpening = tagRuleTable({
    li: ['li'],
    p: ['p', 'div'],
    b: ['div'],
    td: ['td', 'th'],
    th: ['td', 'th'],
    h1: ['h1'],
    h2: ['h2'],
    h3: ['h3'],
    h4: ['h4'],
    h5: ['h5'],
    h6: ['h6'],
});
// open element -> tags whose closing implicitly closes it
const kElementsClosedByClosing = tagRuleTable({
    li: ['ul', 'ol'],
    a: ['div'],
    b: ['div'],
    i: ['div'],
    p: ['div'],
    td: ['tr', 'table'],
    th: ['tr', 'table'],
});
// Name of the synthetic element the parser wraps around its input.
const frameflag = 'documentfragmentcontainer';
917
/**
 * Parse a chunk of HTML source into a node tree.
 *
 * The input is wrapped in a synthetic container element before scanning, and
 * all recorded ranges are shifted back so they refer to the caller's string.
 * @param {string} data html
 * @param {Object} [options] parse options
 * @param {boolean} [options.lowerCaseTagName] lower-case every tag name
 * @param {boolean} [options.comment] keep comments as CommentNode children
 * @param {Object} [options.blockTextElements] tag name -> flag; the content of
 *   these elements is not scanned for markup, and is kept as one text node only
 *   when the flag is truthy
 * @return {HTMLElement[]} the stack of elements that were still open when the
 *   input ended; its first entry is the root element
 */
export function base_parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const elements = options.blockTextElements || {
        script: true,
        noscript: true,
        style: true,
        pre: true
    };
    const element_names = Object.keys(elements);
    // Anchored, so that a configured name such as "pre" only matches the tag
    // <pre> and not every tag name that merely contains "pre".
    const toTagPattern = (it) => new RegExp(`^(?:${it})$`, 'i');
    const kBlockTextElements = element_names.map(toTagPattern);
    const kIgnoreElements = element_names.filter((it) => elements[it]).map(toTagPattern);
    // True for block-text elements whose flag is truthy, i.e. whose raw
    // content is kept as a text node.
    function element_should_be_ignore(tag) {
        return kIgnoreElements.some((it) => it.test(tag));
    }
    function is_block_text_element(tag) {
        return kBlockTextElements.some((it) => it.test(tag));
    }
    // Length of the opening wrapper tag that is prepended to the input below.
    const frameFlagOffset = frameflag.length + 2;
    // Translate positions in the wrapped string to positions in the input.
    const createRange = (startPos, endPos) => [startPos - frameFlagOffset, endPos - frameFlagOffset];
    const root = new HTMLElement(null, {}, '', null, [0, data.length]);
    let currentParent = root;
    const stack = [root];
    let lastTextPos = -1;
    let match;
    // Lower-cased copy of the wrapped input, built on first use only.
    let lowerCasedData;
    // https://github.com/taoqf/node-html-parser/issues/38
    data = `<${frameflag}>${data}</${frameflag}>`;
    const dataEndPos = data.length - (frameflag.length + 2);
    // The patterns are shared and stateful (/g): always start from the beginning.
    kMarkupPattern.lastIndex = 0;
    while ((match = kMarkupPattern.exec(data))) {
        const tagEndPos = kMarkupPattern.lastIndex;
        const tagStartPos = tagEndPos - match[0].length;
        // Add TextNode if content
        if (lastTextPos > -1 && lastTextPos < tagStartPos) {
            const text = data.substring(lastTextPos, tagStartPos);
            currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
        }
        lastTextPos = kMarkupPattern.lastIndex;
        // https://github.com/taoqf/node-html-parser/issues/38
        // Skip frameflag node
        if (match[2] === frameflag) {
            continue;
        }
        // Handle comments
        if (match[0][1] === '!') {
            if (options.comment) {
                // Only keep what is in between <!-- and -->
                const text = data.substring(tagStartPos + 4, tagEndPos - 3);
                currentParent.appendChild(new CommentNode(text, currentParent, createRange(tagStartPos, tagEndPos)));
            }
            continue;
        }
        /* -- Handle tag matching -- */
        // Fix tag casing if necessary
        if (options.lowerCaseTagName) {
            match[2] = match[2].toLowerCase();
        }
        // Handle opening tags (ie. <this> not </that>)
        if (!match[1]) {
            /* Populate attributes */
            const attrs = {};
            kAttributePattern.lastIndex = 0;
            for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
                attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
            }
            // Some opening tags implicitly close the element that is currently open.
            const tagName = currentParent.rawTagName;
            if (!match[4] && kElementsClosedByOpening[tagName]) {
                if (kElementsClosedByOpening[tagName][match[2]]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                }
            }
            // An <a> directly inside an <a> closes the outer one.
            if (currentParent.rawTagName === 'a' && match[2] === 'a') {
                stack.pop();
                currentParent = arr_back(stack);
            }
            currentParent = currentParent.appendChild(
            // Initialize range (end position updated later for closed tags)
            new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos)));
            stack.push(currentParent);
            if (is_block_text_element(match[2])) {
                // Find closing tag
                const closeMarkup = `</${match[2]}>`;
                let haystack = data;
                if (options.lowerCaseTagName) {
                    if (lowerCasedData === undefined) {
                        lowerCasedData = data.toLocaleLowerCase();
                    }
                    haystack = lowerCasedData;
                }
                const closeIndex = haystack.indexOf(closeMarkup, kMarkupPattern.lastIndex);
                const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
                if (element_should_be_ignore(match[2])) {
                    const text = data.substring(tagEndPos, textEndPos);
                    if (text.length > 0 && /\S/.test(text)) {
                        currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
                    }
                }
                if (closeIndex === -1) {
                    // No closing tag: everything up to the end was consumed, stop scanning.
                    lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
                }
                else {
                    // Resume scanning right after the closing tag ...
                    lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
                    // ... and flag this match as a closing tag, so that the
                    // code below pops the element that was just pushed.
                    match[1] = 'true';
                }
            }
        }
        // Handle closing tags or self-closed elements (ie </tag> or <br>)
        if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
            while (true) {
                if (currentParent.rawTagName === match[2]) {
                    // Update range end for closed tag
                    currentParent.range[1] = Math.max(lastTextPos, tagEndPos) - frameFlagOffset;
                    stack.pop();
                    currentParent = arr_back(stack);
                    break;
                }
                // Trying to close current tag, and move on
                const closedBy = kElementsClosedByClosing[currentParent.tagName];
                if (closedBy && closedBy[match[2]]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                    continue;
                }
                // Use aggressive strategy to handle unmatching markups.
                break;
            }
        }
    }
    return stack;
}
1064
/**
 * Parse a chunk of HTML source and return the root element.
 *
 * Elements that `base_parse` reports as still open at the end of the input
 * are dissolved afterwards: the element is removed and its children are
 * moved to an ancestor.
 * @param {string} data html
 * @param {Object} [options] options forwarded to `base_parse`
 * @return {HTMLElement} root element
 */
export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const stack = base_parse(data, options);
    const root = stack[0];
    while (stack.length > 1) {
        // Handle each error elements.
        const last = stack.pop();
        const oneBefore = arr_back(stack);
        const hasGrandParent = last.parentNode && last.parentNode.parentNode;
        if (!hasGrandParent) {
            // If it's final element just skip.
            continue;
        }
        // Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
        // Otherwise single error <div> <h3> </div> handle: Just removes <h3>
        const isPairError = last.parentNode === oneBefore && last.tagName === oneBefore.tagName;
        oneBefore.removeChild(last);
        const newParent = isPairError ? oneBefore.parentNode : oneBefore;
        last.childNodes.forEach((child) => {
            newParent.appendChild(child);
        });
        if (isPairError) {
            stack.pop();
        }
    }
    return root;
}