node-html-parser 4.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1048 @@
1
+ import he from 'he';
2
+ import { selectAll, selectOne } from 'css-select';
3
+ import Node from './node';
4
+ import NodeType from './type';
5
+ import TextNode from './text';
6
+ import Matcher from '../matcher';
7
+ import arr_back from '../back';
8
+ import CommentNode from './comment';
/**
 * Decode the HTML entities in a string.
 *
 * The decoded value is passed through a JSON stringify/parse round trip so
 * that the caller receives a cloned string.
 * @param {string} val text that may contain HTML entities
 * @returns {string} decoded text
 */
function decode(val) {
    const decoded = he.decode(val);
    // clone string
    const serialized = JSON.stringify(decoded);
    return JSON.parse(serialized);
}
// Block-level tag names, grouped by first letter. See
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
const Htags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup'];
const Dtags = ['details', 'dialog', 'dd', 'div', 'dt'];
const Ftags = ['fieldset', 'figcaption', 'figure', 'footer', 'form'];
const tableTags = ['table', 'td', 'tr'];
const htmlTags = ['address', 'article', 'aside', 'blockquote', 'br', 'hr', 'li', 'main', 'nav', 'ol', 'p', 'pre', 'section', 'ul'];
// Lookup of block-level tag names; every name is stored as written and upper-cased.
const kBlockElements = new Set();
/**
 * Register tag names as block-level elements.
 * @param {...string[]} args one or more lists of tag names
 */
function addToKBlockElement(...args) {
    for (const tagList of args) {
        for (const tag of tagList) {
            kBlockElements.add(tag).add(tag.toUpperCase());
        }
    }
}
addToKBlockElement(Htags, Dtags, Ftags, tableTags, htmlTags);
/**
 * Minimal stand-in for the DOM `DOMTokenList`, used for `HTMLElement.classList`.
 *
 * Tokens are kept in insertion order in a Set. After a mutation the
 * `afterUpdate` callback receives the list itself, so the owner can
 * serialize the new token set.
 */
class DOMTokenList {
    /**
     * @param {Iterable<string>} [valuesInit] initial tokens
     * @param {Function} [afterUpdate] called with this list after a mutation
     */
    constructor(valuesInit = [], afterUpdate = () => null) {
        // Splitting a whitespace-padded attribute value yields '' entries.
        // An empty string is never a token, so drop it here.
        this._set = new Set(Array.from(valuesInit).filter((token) => token !== ''));
        this._afterUpdate = afterUpdate;
    }
    /**
     * Reject tokens that contain whitespace.
     * @param {string} c token to check
     * @param {string} [method] name of the calling method, used in the message
     * @throws {Error} when the token contains a whitespace character
     */
    _validate(c, method = 'add') {
        if (/\s/.test(c)) {
            throw new Error(`DOMException in DOMTokenList.${method}: The token '${c}' contains HTML space characters, which are not valid in tokens.`);
        }
    }
    /**
     * Add a token and notify the owner.
     * @param {string} c token to add
     */
    add(c) {
        this._validate(c, 'add');
        this._set.add(c);
        this._afterUpdate(this);
    }
    /**
     * Remove `c1` (if present), add `c2`, and notify the owner.
     * @param {string} c1 token to remove
     * @param {string} c2 token to add
     */
    replace(c1, c2) {
        this._validate(c2, 'replace');
        this._set.delete(c1);
        this._set.add(c2);
        this._afterUpdate(this);
    }
    /**
     * Remove a token. The owner is only notified when the token was present.
     * @param {string} c token to remove
     */
    remove(c) {
        if (this._set.delete(c)) {
            this._afterUpdate(this);
        }
    }
    /**
     * Add the token when absent, remove it when present, then notify the owner.
     * @param {string} c token to toggle
     * @returns {boolean} true when the token is in the list after the call
     */
    toggle(c) {
        this._validate(c, 'toggle');
        const wasPresent = this._set.has(c);
        if (wasPresent) {
            this._set.delete(c);
        }
        else {
            this._set.add(c);
        }
        this._afterUpdate(this);
        return !wasPresent;
    }
    /**
     * @param {string} c token to look for
     * @returns {boolean} whether the token is in the list
     */
    contains(c) {
        return this._set.has(c);
    }
    /** Number of tokens in the list. */
    get length() {
        return this._set.size;
    }
    /** Iterator over the tokens in insertion order. */
    values() {
        return this._set.values();
    }
    /** The tokens as an array (insertion order). */
    get value() {
        return Array.from(this._set.values());
    }
    /** The tokens joined with single spaces. */
    toString() {
        return Array.from(this._set.values()).join(' ');
    }
}
/**
 * Point every node in `nodes` at `parent` and return the same array.
 * @param {Node[]} nodes nodes being adopted
 * @param {HTMLElement|null} parent their new parent
 * @returns {Node[]} `nodes`
 */
function resetParent(nodes, parent) {
    nodes.forEach((node) => {
        node.parentNode = parent;
    });
    return nodes;
}
/**
 * Serialize a raw attribute map into the attribute part of a start tag.
 * Attributes whose value is null/undefined or the empty string are written
 * as a bare name.
 * @param {HTMLElement} element element whose quoteAttribute() does the quoting
 * @param {Object} attrs attribute name -> raw value
 * @returns {string} e.g. `id="a" disabled`
 */
function serializeRawAttributes(element, attrs) {
    return Object.keys(attrs)
        .map((name) => {
            const val = element.quoteAttribute(attrs[name]);
            if (val === 'null' || val === '""') {
                return name;
            }
            return `${name}=${val}`;
        })
        .join(' ');
}
/**
 * HTMLElement, which contains a set of children.
 *
 * Note: this is a minimalist implementation. Only part of the DOM tree API
 * is provided (parentNode, firstChild, lastChild, nextSibling,
 * nextElementSibling); there is no previousSibling.
 * @class HTMLElement
 * @extends {Node}
 */
export default class HTMLElement extends Node {
    /**
     * Creates an instance of HTMLElement.
     * @param tagName tag name as written in the source (null for the root)
     * @param keyAttrs id and class attribute
     * @param [rawAttrs] attributes in string
     * @param parentNode parent node, forwarded to Node
     * @param range source range, forwarded to Node
     *
     * @memberof HTMLElement
     */
    constructor(tagName, keyAttrs, rawAttrs = '', parentNode, range) {
        super(parentNode, range);
        this.rawAttrs = rawAttrs || '';
        /**
         * Node Type declaration.
         */
        this.nodeType = NodeType.ELEMENT_NODE;
        this.rawTagName = tagName;
        this.id = keyAttrs.id || '';
        this.childNodes = [];
        // A padded value such as " a b " splits into empty strings; they are not class names.
        const classTokens = keyAttrs.class ? keyAttrs.class.split(/\s+/).filter(Boolean) : [];
        // Every classList mutation is written back to the class attribute.
        this.classList = new DOMTokenList(classTokens, (classList) => this.setAttribute('class', classList.toString()));
        if (!rawAttrs) {
            // No attribute string was given: synthesize one from the key attributes.
            const synthesized = [];
            if (keyAttrs.id) {
                synthesized.push(`id="${keyAttrs.id}"`);
            }
            if (keyAttrs.class) {
                synthesized.push(`class="${this.classList.toString()}"`);
            }
            this.rawAttrs = synthesized.join(' ');
        }
    }
    /**
     * Quote attribute values
     *
     * The value is wrapped in double quotes and embedded double quotes become
     * `&quot;`. All other characters (backslashes, newlines, ...) are written
     * literally, so the value survives a serialize/parse round trip.
     * @param attr attribute value
     * @returns {string} quoted value, or the text `null` for null/undefined
     */
    quoteAttribute(attr) {
        if (attr == null) {
            return 'null';
        }
        return `"${String(attr).replace(/"/g, '&quot;')}"`;
    }
    /**
     * Remove current element from its parent and clear the parent link.
     */
    remove() {
        const parent = this.parentNode;
        if (parent) {
            parent.childNodes = parent.childNodes.filter((child) => {
                return this !== child;
            });
            // Do not keep pointing at a tree this element is no longer part of.
            this.parentNode = null;
        }
    }
    /**
     * Remove Child element from childNodes array
     * @param {HTMLElement} node node to remove
     */
    removeChild(node) {
        this.childNodes = this.childNodes.filter((child) => {
            return child !== node;
        });
    }
    /**
     * Exchanges given child with new child
     * @param {HTMLElement} oldNode node to exchange
     * @param {HTMLElement} newNode new node
     */
    exchangeChild(oldNode, newNode) {
        const children = this.childNodes;
        this.childNodes = children.map((child) => {
            if (child === oldNode) {
                return newNode;
            }
            return child;
        });
    }
    /**
     * Upper-cased tag name; the raw (falsy) value when there is no tag name.
     */
    get tagName() {
        return this.rawTagName ? this.rawTagName.toUpperCase() : this.rawTagName;
    }
    /**
     * Lower-cased tag name; the raw (falsy) value when there is no tag name,
     * mirroring tagName.
     */
    get localName() {
        return this.rawTagName ? this.rawTagName.toLowerCase() : this.rawTagName;
    }
    /**
     * Get escaped (as-is) text value of current node and its children.
     * @return {string} text content
     */
    get rawText() {
        return this.childNodes.reduce((pre, cur) => {
            return pre + cur.rawText;
        }, '');
    }
    /**
     * Entity-decoded text of this node and its children. Assigning replaces
     * all children with a single text node.
     */
    get textContent() {
        return decode(this.rawText);
    }
    set textContent(val) {
        const content = [new TextNode(val, this)];
        this.childNodes = content;
    }
    /**
     * Get unescaped text value of current node and its children.
     * @return {string} text content
     */
    get text() {
        return decode(this.rawText);
    }
    /**
     * Get structured Text (with '\n' etc.)
     * @return {string} structured text
     */
    get structuredText() {
        let currentBlock = [];
        const blocks = [currentBlock];
        function dfs(node) {
            if (node.nodeType === NodeType.ELEMENT_NODE) {
                if (kBlockElements.has(node.rawTagName)) {
                    // A block element starts a new line before and after its content.
                    if (currentBlock.length > 0) {
                        blocks.push((currentBlock = []));
                    }
                    node.childNodes.forEach(dfs);
                    if (currentBlock.length > 0) {
                        blocks.push((currentBlock = []));
                    }
                }
                else {
                    node.childNodes.forEach(dfs);
                }
            }
            else if (node.nodeType === NodeType.TEXT_NODE) {
                if (node.isWhitespace) {
                    // Whitespace node, postponed output
                    currentBlock.prependWhitespace = true;
                }
                else {
                    let text = node.trimmedText;
                    if (currentBlock.prependWhitespace) {
                        text = ` ${text}`;
                        currentBlock.prependWhitespace = false;
                    }
                    currentBlock.push(text);
                }
            }
        }
        dfs(this);
        return blocks
            .map((block) => {
                return block.join('').replace(/\s{2,}/g, ' '); // Normalize each line's whitespace
            })
            .join('\n')
            .replace(/\s+$/, ''); // trimRight;
    }
    /**
     * Serialize this element and its subtree to HTML. Void elements are
     * written without content or end tag; an element without a tag name
     * serializes to its inner HTML only.
     * @return {string} HTML markup
     */
    toString() {
        const tag = this.rawTagName;
        if (tag) {
            const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
            const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
            if (is_void) {
                return `<${tag}${attrs}>`;
            }
            return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
        }
        return this.innerHTML;
    }
    /**
     * Serialized children. Assigning parses the markup (default parse
     * options) and replaces all children; markup that yields no nodes is
     * stored as a single text node.
     */
    get innerHTML() {
        return this.childNodes
            .map((child) => {
                return child.toString();
            })
            .join('');
    }
    set innerHTML(content) {
        const r = parse(content);
        const nodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
        // Parsed nodes still point at the temporary parse root; adopt them.
        this.childNodes = resetParent(nodes, this);
    }
    /**
     * Replace all children.
     * @param {string|Node|Node[]} content markup to parse, a node, or a list of nodes
     * @param {Object} [options] parse options, used when content is a string
     */
    set_content(content, options = {}) {
        let nodes = content;
        if (content instanceof Node) {
            nodes = [content];
        }
        else if (typeof content == 'string') {
            const r = parse(content, options);
            nodes = r.childNodes.length ? r.childNodes : [new TextNode(content, this)];
        }
        if (Array.isArray(nodes)) {
            resetParent(nodes, this);
        }
        this.childNodes = nodes;
    }
    /**
     * Replace this element in its parent's children with the given nodes.
     * Strings are parsed as HTML (default parse options). Does nothing when
     * this element is not attached to a parent.
     * @param {...(string|Node)} nodes replacements
     */
    replaceWith(...nodes) {
        const parent = this.parentNode;
        if (!parent) {
            return;
        }
        const idx = parent.childNodes.indexOf(this);
        if (idx === -1) {
            // Stale parent link: there is no position to replace at.
            return;
        }
        const content = nodes
            .map((node) => {
                if (node instanceof Node) {
                    return [node];
                }
                else if (typeof node == 'string') {
                    const r = parse(node);
                    return r.childNodes.length ? r.childNodes : [new TextNode(node, parent)];
                }
                return [];
            })
            .flat();
        resetParent(content, parent);
        parent.childNodes = [
            ...parent.childNodes.slice(0, idx),
            ...content,
            ...parent.childNodes.slice(idx + 1),
        ];
        this.parentNode = null;
    }
    /** Same as toString(). */
    get outerHTML() {
        return this.toString();
    }
    /**
     * Trim element from right (in block) after seeing pattern in a TextNode.
     * @param {RegExp} pattern pattern to find
     * @return {HTMLElement} reference to current node
     */
    trimRight(pattern) {
        for (let i = 0; i < this.childNodes.length; i++) {
            const childNode = this.childNodes[i];
            if (childNode.nodeType === NodeType.ELEMENT_NODE) {
                childNode.trimRight(pattern);
            }
            else {
                const index = childNode.rawText.search(pattern);
                if (index > -1) {
                    childNode.rawText = childNode.rawText.slice(0, index);
                    // trim all following nodes.
                    this.childNodes.length = i + 1;
                }
            }
        }
        return this;
    }
    /**
     * Get DOM structure
     * @return {string} structure, one node per line, indented by depth
     */
    get structure() {
        const res = [];
        let indention = 0;
        function write(str) {
            res.push(' '.repeat(indention) + str);
        }
        function dfs(node) {
            const idStr = node.id ? `#${node.id}` : '';
            const classStr = node.classList.length ? `.${node.classList.value.join('.')}` : '';
            write(`${node.rawTagName}${idStr}${classStr}`);
            indention++;
            node.childNodes.forEach((childNode) => {
                if (childNode.nodeType === NodeType.ELEMENT_NODE) {
                    dfs(childNode);
                }
                else if (childNode.nodeType === NodeType.TEXT_NODE) {
                    if (!childNode.isWhitespace) {
                        write('#text');
                    }
                }
            });
            indention--;
        }
        dfs(this);
        return res.join('\n');
    }
    /**
     * Remove whitespaces in this sub tree.
     * @return {HTMLElement} pointer to this
     */
    removeWhitespace() {
        // Compact childNodes in place; `o` is the next write position.
        let o = 0;
        this.childNodes.forEach((node) => {
            if (node.nodeType === NodeType.TEXT_NODE) {
                if (node.isWhitespace) {
                    return;
                }
                node.rawText = node.trimmedRawText;
            }
            else if (node.nodeType === NodeType.ELEMENT_NODE) {
                node.removeWhitespace();
            }
            this.childNodes[o++] = node;
        });
        this.childNodes.length = o;
        return this;
    }
    /**
     * Query CSS selector to find matching nodes.
     * @param {string} selector CSS selector, evaluated by css-select
     * @return {HTMLElement[]} matching elements
     */
    querySelectorAll(selector) {
        return selectAll(selector, this, {
            xmlMode: true,
            adapter: Matcher,
        });
    }
    /**
     * Query CSS Selector to find matching node.
     * @param {string} selector CSS selector, evaluated by css-select
     * @return {HTMLElement} matching node
     */
    querySelector(selector) {
        return selectOne(selector, this, {
            xmlMode: true,
            adapter: Matcher,
        });
    }
    /**
     * traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
     * @param selector a DOMString containing a selector list
     */
    closest(selector) {
        // For every ancestor, remember the single child that lies on the path
        // down to this element; the adapter below exposes only that path.
        const mapChild = new Map();
        let el = this;
        let old = null;
        function findOne(test, elems) {
            let elem = null;
            for (let i = 0, l = elems.length; i < l && !elem; i++) {
                const el = elems[i];
                if (test(el)) {
                    elem = el;
                }
                else {
                    const child = mapChild.get(el);
                    if (child) {
                        elem = findOne(test, [child]);
                    }
                }
            }
            return elem;
        }
        while (el) {
            mapChild.set(el, old);
            old = el;
            el = el.parentNode;
        }
        el = this;
        while (el) {
            const e = selectOne(selector, el, {
                xmlMode: true,
                adapter: {
                    ...Matcher,
                    getChildren(node) {
                        const child = mapChild.get(node);
                        return child && [child];
                    },
                    getSiblings(node) {
                        return [node];
                    },
                    findOne,
                    findAll() {
                        return [];
                    },
                },
            });
            if (e) {
                return e;
            }
            el = el.parentNode;
        }
        return null;
    }
    /**
     * Append a child node to childNodes
     * @param {Node} node node to append
     * @return {Node} node appended
     */
    appendChild(node) {
        this.childNodes.push(node);
        node.parentNode = this;
        return node;
    }
    /**
     * Get first child node
     * @return {Node} first child node
     */
    get firstChild() {
        return this.childNodes[0];
    }
    /**
     * Get last child node
     * @return {Node} last child node
     */
    get lastChild() {
        return arr_back(this.childNodes);
    }
    /**
     * Get attributes
     * @access private
     * @return {Object} parsed and unescaped attributes, keyed by lower-cased
     * name; the object is cached on the element
     */
    get attrs() {
        if (this._attrs) {
            return this._attrs;
        }
        this._attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            this._attrs[key.toLowerCase()] = decode(val);
        }
        return this._attrs;
    }
    /**
     * Get unescaped attributes keyed by their names as written.
     * @return {Object} a new object on every access
     */
    get attributes() {
        const ret_attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            ret_attrs[key] = decode(val);
        }
        return ret_attrs;
    }
    /**
     * Get escaped (as-is) attributes
     * @return {Object} parsed attributes; the object is cached on the element.
     * Attributes without a value (or with an empty value) map to null.
     */
    get rawAttributes() {
        if (this._rawAttrs) {
            return this._rawAttrs;
        }
        const attrs = {};
        if (this.rawAttrs) {
            const re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi;
            let match;
            while ((match = re.exec(this.rawAttrs))) {
                attrs[match[1]] = match[2] || match[3] || match[4] || null;
            }
        }
        this._rawAttrs = attrs;
        return attrs;
    }
    /**
     * Remove an attribute. The name is matched case-insensitively, like
     * getAttribute/hasAttribute/setAttribute do.
     * @param {string} key The attribute name
     */
    removeAttribute(key) {
        const attrs = this.rawAttributes;
        const lowerKey = key.toLowerCase();
        for (const name of Object.keys(attrs)) {
            if (name.toLowerCase() === lowerKey) {
                delete attrs[name];
            }
        }
        // The decoded cache is keyed by lower-cased names.
        if (this._attrs) {
            delete this._attrs[lowerKey];
        }
        // Update rawString
        this.rawAttrs = serializeRawAttributes(this, attrs);
        // Update this.id
        if (lowerKey === 'id') {
            this.id = '';
        }
    }
    /**
     * @param {string} key The attribute name (case-insensitive)
     * @return {boolean} whether the attribute is present
     */
    hasAttribute(key) {
        return key.toLowerCase() in this.attrs;
    }
    /**
     * Get an attribute
     * @param {string} key The attribute name (case-insensitive)
     * @return {string} value of the attribute
     */
    getAttribute(key) {
        return this.attrs[key.toLowerCase()];
    }
    /**
     * Set an attribute value to the HTMLElement
     * @param {string} key The attribute name; an existing attribute is matched case-insensitively and keeps its original spelling
     * @param {string} value The value to set. It is converted with String(), so null / undefined are stored as the text "null" / "undefined"; use removeAttribute to remove an attribute
     * @throws {Error} when called with fewer than two arguments
     */
    setAttribute(key, value) {
        if (arguments.length < 2) {
            throw new Error("Failed to execute 'setAttribute' on 'Element'");
        }
        const k2 = key.toLowerCase();
        const attrs = this.rawAttributes;
        let rawKey = key;
        for (const k in attrs) {
            if (k.toLowerCase() === k2) {
                rawKey = k;
                break;
            }
        }
        attrs[rawKey] = String(value);
        // update this.attrs
        if (this._attrs) {
            this._attrs[k2] = decode(attrs[rawKey]);
        }
        // Update rawString
        this.rawAttrs = serializeRawAttributes(this, attrs);
        // Update this.id (compare case-insensitively: the raw key may be e.g. "ID")
        if (k2 === 'id') {
            this.id = attrs[rawKey];
        }
    }
    /**
     * Replace all the attributes of the HTMLElement by the provided attributes
     * @param {Attributes} attributes the new attribute set
     */
    setAttributes(attributes) {
        // Invalidate current this.attributes
        if (this._attrs) {
            delete this._attrs;
        }
        // Invalidate current this.rawAttributes
        if (this._rawAttrs) {
            delete this._rawAttrs;
        }
        // Update rawString
        this.rawAttrs = Object.keys(attributes)
            .map((name) => {
                const val = attributes[name];
                if (val === 'null' || val === '""')
                    return name;
                return `${name}=${this.quoteAttribute(String(val))}`;
            })
            .join(' ');
    }
    /**
     * Parse `html` and insert the resulting nodes relative to this element.
     * @param {string} where one of 'beforebegin', 'afterbegin', 'beforeend', 'afterend'
     * @param {string} html markup to parse (default parse options)
     * @throws {Error} on a missing argument or an unknown position
     */
    insertAdjacentHTML(where, html) {
        if (arguments.length < 2) {
            throw new Error('2 arguments required');
        }
        const p = parse(html);
        if (where === 'afterend') {
            const idx = this.parentNode.childNodes.findIndex((child) => {
                return child === this;
            });
            // Adopt every inserted node, text nodes included.
            resetParent(p.childNodes, this.parentNode);
            this.parentNode.childNodes.splice(idx + 1, 0, ...p.childNodes);
        }
        else if (where === 'afterbegin') {
            resetParent(p.childNodes, this);
            this.childNodes.unshift(...p.childNodes);
        }
        else if (where === 'beforeend') {
            p.childNodes.forEach((n) => {
                this.appendChild(n);
            });
        }
        else if (where === 'beforebegin') {
            const idx = this.parentNode.childNodes.findIndex((child) => {
                return child === this;
            });
            resetParent(p.childNodes, this.parentNode);
            this.parentNode.childNodes.splice(idx, 0, ...p.childNodes);
        }
        else {
            throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
        }
    }
    /**
     * The node following this one in the parent's children, or null.
     */
    get nextSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const idx = children.indexOf(this);
        if (idx === -1) {
            return null;
        }
        return children[idx + 1] || null;
    }
    /**
     * The next HTMLElement following this one in the parent's children, or null.
     */
    get nextElementSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const start = children.indexOf(this);
        if (start === -1) {
            return null;
        }
        for (let i = start + 1; i < children.length; i++) {
            const child = children[i];
            if (child instanceof HTMLElement) {
                return child;
            }
        }
        return null;
    }
    /** The class names joined with single spaces. */
    get classNames() {
        return this.classList.toString();
    }
}
// Matches an HTML comment, or an opening/closing tag with its attribute text.
// Tag-name characters follow
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi;
// Matches an id or class attribute inside a tag's attribute text.
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi;
/**
 * Build a lookup object that maps each name, lower- and upper-cased, to true.
 * @param {string[]} names lower-case tag names
 * @returns {Object} flag table
 */
function caseInsensitiveFlags(names) {
    const flags = {};
    names.forEach((name) => {
        flags[name] = true;
    });
    names.forEach((name) => {
        flags[name.toUpperCase()] = true;
    });
    return flags;
}
/**
 * Build a two-level lookup: tag name (both cases) -> flag table of related tag names (both cases).
 * @param {Array} rules list of [tag, relatedTags] pairs, all lower-case
 * @returns {Object} rule table
 */
function closingRules(rules) {
    const table = {};
    for (const [tag, related] of rules) {
        table[tag] = caseInsensitiveFlags(related);
        table[tag.toUpperCase()] = caseInsensitiveFlags(related);
    }
    return table;
}
// Elements that never have content or an end tag.
const kSelfClosingElements = caseInsensitiveFlags([
    'area', 'base', 'br', 'col', 'hr', 'img', 'input', 'link', 'meta', 'source', 'embed', 'param', 'track', 'wbr',
]);
// Open element -> tags whose opening implicitly closes it.
const kElementsClosedByOpening = closingRules([
    ['li', ['li']],
    ['p', ['p', 'div']],
    ['b', ['div']],
    ['td', ['td', 'th']],
    ['th', ['td', 'th']],
    ['h1', ['h1']],
    ['h2', ['h2']],
    ['h3', ['h3']],
    ['h4', ['h4']],
    ['h5', ['h5']],
    ['h6', ['h6']],
]);
// Open element -> tags whose closing implicitly closes it.
const kElementsClosedByClosing = closingRules([
    ['li', ['ul', 'ol']],
    ['a', ['div']],
    ['b', ['div']],
    ['i', ['div']],
    ['p', ['div']],
    ['td', ['tr', 'table']],
    ['th', ['tr', 'table']],
]);
// Name of the wrapper tag placed around the input before scanning.
const frameflag = 'documentfragmentcontainer';
/**
 * Scan a chunk of HTML source and build the element tree.
 *
 * @param {string} data html
 * @param {Object} [options] parse options
 * @param {boolean} [options.lowerCaseTagName] lower-case every tag name
 * @param {boolean} [options.comment] keep comments as CommentNode children
 * @param {Object} [options.blockTextElements] tag name -> flag. The content of
 * these elements is not scanned for markup; when the flag is truthy the
 * content is kept as a single text node. Defaults to script, noscript,
 * style and pre, all kept.
 * @return {HTMLElement[]} the stack of elements still open at the end of the
 * input; the root element is always at index 0
 */
export function base_parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const elements = options.blockTextElements || {
        script: true,
        noscript: true,
        style: true,
        pre: true,
    };
    const element_names = Object.keys(elements);
    // Anchor the patterns so a name only matches the whole tag name:
    // "pre" must not match "prefix-el", nor "script" match "my-script".
    const exactTagPattern = (name) => new RegExp(`^(?:${name})$`, 'i');
    const kBlockTextElements = element_names.map((it) => exactTagPattern(it));
    const kIgnoreElements = element_names.filter((it) => elements[it]).map((it) => exactTagPattern(it));
    function element_should_be_ignore(tag) {
        return kIgnoreElements.some((it) => it.test(tag));
    }
    function is_block_text_element(tag) {
        return kBlockTextElements.some((it) => it.test(tag));
    }
    // Positions are measured in the wrapped string; ranges are reported relative to the original input.
    const createRange = (startPos, endPos) => [startPos - frameFlagOffset, endPos - frameFlagOffset];
    const root = new HTMLElement(null, {}, '', null, [0, data.length]);
    let currentParent = root;
    const stack = [root];
    let lastTextPos = -1;
    let noNestedTagIndex = undefined;
    let match;
    // https://github.com/taoqf/node-html-parser/issues/38
    data = `<${frameflag}>${data}</${frameflag}>`;
    // Index where the original input ends: the closing wrapper "</frameflag>" is frameflag.length + 3 characters long.
    const dataEndPos = data.length - (frameflag.length + 3);
    // Length of the opening wrapper "<frameflag>".
    const frameFlagOffset = frameflag.length + 2;
    // The pattern is a shared global regex; start from a clean position even
    // if an earlier scan was interrupted by an exception.
    kMarkupPattern.lastIndex = 0;
    while ((match = kMarkupPattern.exec(data))) {
        const tagStartPos = kMarkupPattern.lastIndex - match[0].length;
        const tagEndPos = kMarkupPattern.lastIndex;
        // Add TextNode if content
        if (lastTextPos > -1) {
            if (lastTextPos < tagStartPos) {
                const text = data.substring(lastTextPos, tagStartPos);
                currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
            }
        }
        lastTextPos = kMarkupPattern.lastIndex;
        // https://github.com/taoqf/node-html-parser/issues/38
        // Skip frameflag node
        if (match[2] === frameflag)
            continue;
        // Handle comments
        if (match[0][1] === '!') {
            if (options.comment) {
                // Only keep what is in between <!-- and -->
                const text = data.substring(tagStartPos + 4, tagEndPos - 3);
                currentParent.appendChild(new CommentNode(text, currentParent, createRange(tagStartPos, tagEndPos)));
            }
            continue;
        }
        /* -- Handle tag matching -- */
        // Fix tag casing if necessary
        if (options.lowerCaseTagName)
            match[2] = match[2].toLowerCase();
        // Handle opening tags (ie. <this> not </that>)
        if (!match[1]) {
            /* Populate attributes */
            const attrs = {};
            for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
                attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
            }
            const tagName = currentParent.rawTagName;
            if (!match[4] && kElementsClosedByOpening[tagName]) {
                if (kElementsClosedByOpening[tagName][match[2]]) {
                    stack.pop();
                    currentParent = arr_back(stack);
                }
            }
            // Prevent nested A tags by terminating the last A and starting a new one : see issue #144
            if (match[2] === 'a' || match[2] === 'A') {
                if (noNestedTagIndex !== undefined) {
                    stack.splice(noNestedTagIndex);
                    currentParent = arr_back(stack);
                }
                noNestedTagIndex = stack.length;
            }
            currentParent = currentParent.appendChild(
            // Initialize range (end position updated later for closed tags)
            new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos)));
            stack.push(currentParent);
            if (is_block_text_element(match[2])) {
                // Find closing tag
                const closeMarkup = `</${match[2]}>`;
                const closeIndex = options.lowerCaseTagName
                    ? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex)
                    : data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
                const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
                if (element_should_be_ignore(match[2])) {
                    const text = data.substring(tagEndPos, textEndPos);
                    if (text.length > 0 && /\S/.test(text)) {
                        currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
                    }
                }
                if (closeIndex === -1) {
                    // No closing tag: stop scanning, the rest of the input belonged to this element.
                    lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
                }
                else {
                    // Resume scanning after the closing tag, and flag the match
                    // so the closing logic below pops this element.
                    lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
                    match[1] = 'true';
                }
            }
        }
        // Handle closing tags or self-closed elements (ie </tag> or <br>)
        if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
            while (true) {
                if (match[2] === 'a' || match[2] === 'A')
                    noNestedTagIndex = undefined;
                if (currentParent.rawTagName === match[2]) {
                    // Update range end for closed tag
                    currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
                    stack.pop();
                    currentParent = arr_back(stack);
                    break;
                }
                else {
                    const tagName = currentParent.tagName;
                    // Trying to close current tag, and move on
                    if (kElementsClosedByClosing[tagName]) {
                        if (kElementsClosedByClosing[tagName][match[2]]) {
                            stack.pop();
                            currentParent = arr_back(stack);
                            continue;
                        }
                    }
                    // Use aggressive strategy to handle unmatching markups.
                    break;
                }
            }
        }
    }
    return stack;
}
/**
 * Parses HTML and returns a root element
 * Parse a chunk of HTML source.
 *
 * Elements that base_parse left open are removed from the tree and their
 * children are moved up to an enclosing element.
 * @param {string} data html
 * @param {Object} [options] parse options, forwarded to base_parse
 * @return {HTMLElement} root element
 */
export function parse(data, options = { lowerCaseTagName: false, comment: false }) {
    const stack = base_parse(data, options);
    const root = stack[0];
    while (stack.length > 1) {
        // Handle each error elements.
        const unclosed = stack.pop();
        const enclosing = arr_back(stack);
        // If it's final element just skip.
        if (!(unclosed.parentNode && unclosed.parentNode.parentNode)) {
            continue;
        }
        // Pair error case <h3> <h3> handle : Fixes to <h3> </h3>
        // Otherwise, single error <div> <h3> </div> handle: Just removes <h3>
        const isPairError = unclosed.parentNode === enclosing && unclosed.tagName === enclosing.tagName;
        enclosing.removeChild(unclosed);
        const adopter = isPairError ? enclosing.parentNode : enclosing;
        for (const child of unclosed.childNodes) {
            adopter.appendChild(child);
        }
        if (isPairError) {
            stack.pop();
        }
    }
    return root;
}