@tkeron/html-parser 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.3.0",
3
+ "version": "1.4.0",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -0,0 +1,48 @@
1
+ export const mergeAdjacentTextNodes = (
2
+ parent: any,
3
+ insertIndex: number,
4
+ ): void => {
5
+ if (!parent.childNodes || parent.childNodes.length < 2) {
6
+ return;
7
+ }
8
+
9
+ const node = parent.childNodes[insertIndex];
10
+ if (!node || node.nodeType !== 3) {
11
+ return;
12
+ }
13
+
14
+ if (insertIndex > 0) {
15
+ const prevNode = parent.childNodes[insertIndex - 1];
16
+ if (prevNode && prevNode.nodeType === 3) {
17
+ prevNode.textContent =
18
+ (prevNode.textContent || "") + (node.textContent || "");
19
+ prevNode.nodeValue = prevNode.textContent;
20
+ parent.childNodes.splice(insertIndex, 1);
21
+ return;
22
+ }
23
+ }
24
+
25
+ if (insertIndex < parent.childNodes.length - 1) {
26
+ const nextNode = parent.childNodes[insertIndex + 1];
27
+ if (nextNode && nextNode.nodeType === 3) {
28
+ node.textContent =
29
+ (node.textContent || "") + (nextNode.textContent || "");
30
+ node.nodeValue = node.textContent;
31
+ parent.childNodes.splice(insertIndex + 1, 1);
32
+ }
33
+ }
34
+ };
35
+
36
+ export const insertNodeBeforeTable = (
37
+ parent: any,
38
+ tableElement: any,
39
+ node: any,
40
+ ): number => {
41
+ const idx = parent.childNodes.indexOf(tableElement);
42
+ if (idx !== -1) {
43
+ node.parentNode = parent;
44
+ parent.childNodes.splice(idx, 0, node);
45
+ return idx;
46
+ }
47
+ return -1;
48
+ };
@@ -0,0 +1,65 @@
1
+ import { createElement, appendChild } from "../dom-simulator/index.js";
2
+
3
+ export const CELL_ELEMENTS = new Set(["td", "th"]);
4
+
5
+ export const TABLE_SECTION_ELEMENTS = new Set(["tbody", "thead", "tfoot"]);
6
+
7
+ export const shouldCreateImplicitTableStructure = (
8
+ parentTagName: string,
9
+ childTagName: string,
10
+ ): boolean => {
11
+ const parent = parentTagName.toLowerCase();
12
+ const child = childTagName.toLowerCase();
13
+
14
+ if (CELL_ELEMENTS.has(child)) {
15
+ return parent === "table" || TABLE_SECTION_ELEMENTS.has(parent);
16
+ }
17
+
18
+ if (child === "tr") {
19
+ return parent === "table";
20
+ }
21
+
22
+ return false;
23
+ };
24
+
25
+ export const createImplicitTableStructure = (
26
+ stack: any[],
27
+ parentTagName: string,
28
+ childTagName: string,
29
+ ): any => {
30
+ const parent = parentTagName.toLowerCase();
31
+ const child = childTagName.toLowerCase();
32
+ const currentParent = stack[stack.length - 1];
33
+
34
+ if (CELL_ELEMENTS.has(child)) {
35
+ if (parent === "table") {
36
+ const tbody = createElement("tbody", {});
37
+ appendChild(currentParent, tbody);
38
+ stack.push(tbody);
39
+
40
+ const tr = createElement("tr", {});
41
+ appendChild(tbody, tr);
42
+ stack.push(tr);
43
+
44
+ return tr;
45
+ }
46
+
47
+ if (TABLE_SECTION_ELEMENTS.has(parent)) {
48
+ const tr = createElement("tr", {});
49
+ appendChild(currentParent, tr);
50
+ stack.push(tr);
51
+
52
+ return tr;
53
+ }
54
+ }
55
+
56
+ if (child === "tr" && parent === "table") {
57
+ const tbody = createElement("tbody", {});
58
+ appendChild(currentParent, tbody);
59
+ stack.push(tbody);
60
+
61
+ return tbody;
62
+ }
63
+
64
+ return currentParent;
65
+ };
@@ -29,6 +29,12 @@ import {
29
29
  cloneFormattingElement,
30
30
  reparentChildren,
31
31
  } from "./adoption-agency-helpers.js";
32
+ import {
33
+ shouldCreateImplicitTableStructure,
34
+ createImplicitTableStructure,
35
+ CELL_ELEMENTS,
36
+ } from "./implicit-table-structure.js";
37
+ import { mergeAdjacentTextNodes } from "./foster-parenting-helpers.js";
32
38
 
33
39
  export const parse = (tokens: Token[]): any => {
34
40
  const state = createParserState(tokens);
@@ -329,9 +335,30 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
329
335
 
330
336
  handleAutoClosing(state, tagName);
331
337
 
332
- reconstructActiveFormattingElements(state);
338
+ const inTableContext = isInTableContext(state);
339
+ const isTableStructureElement =
340
+ CELL_ELEMENTS.has(tagName) ||
341
+ tagName === "tr" ||
342
+ tagName === "tbody" ||
343
+ tagName === "thead" ||
344
+ tagName === "tfoot";
345
+ const currentStackParent = getCurrentParent(state);
346
+ const currentStackParentTag =
347
+ currentStackParent.tagName?.toLowerCase() || "";
348
+ const parentIsTableContext = TABLE_CONTEXT_ELEMENTS.has(
349
+ currentStackParentTag,
350
+ );
333
351
 
334
- const currentParent = getCurrentParent(state);
352
+ if (inTableContext && isTableStructureElement) {
353
+ const tableParent = findTableContextParent(state);
354
+ if (tableParent) {
355
+ popStackUntilTableContext(state);
356
+ }
357
+ } else if (!parentIsTableContext) {
358
+ reconstructActiveFormattingElements(state);
359
+ }
360
+
361
+ let currentParent = getCurrentParent(state);
335
362
 
336
363
  let namespaceURI: string | undefined;
337
364
  if (tagName === "svg") {
@@ -346,8 +373,8 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
346
373
  namespaceURI,
347
374
  );
348
375
 
349
- const inTableContext = isInTableContext(state);
350
- const parentTagName = currentParent.tagName || "";
376
+ let parentTagName = currentParent.tagName || "";
377
+
351
378
  const isValidForParent = isValidChildForTableParent(parentTagName, tagName);
352
379
  const isHiddenInput =
353
380
  tagName === "input" &&
@@ -355,25 +382,37 @@ const parseTokenInInBodyMode = (state: ParserState, token: Token): void => {
355
382
  token.attributes.type &&
356
383
  token.attributes.type.toLowerCase() === "hidden";
357
384
  const isFormInTable = tagName === "form" && inTableContext;
385
+
386
+ const needsImplicitStructure =
387
+ inTableContext &&
388
+ shouldCreateImplicitTableStructure(parentTagName, tagName);
389
+
358
390
  const needsFosterParenting =
359
391
  inTableContext &&
360
392
  TABLE_CONTEXT_ELEMENTS.has(parentTagName.toLowerCase()) &&
361
393
  !isValidForParent &&
362
394
  !isHiddenInput &&
363
- !isFormInTable;
395
+ !isFormInTable &&
396
+ !needsImplicitStructure;
364
397
 
365
- if (needsFosterParenting) {
398
+ if (needsImplicitStructure) {
399
+ createImplicitTableStructure(state.stack, parentTagName, tagName);
400
+ appendChild(getCurrentParent(state), element);
401
+ } else if (needsFosterParenting) {
366
402
  insertWithFosterParenting(state, element);
367
403
  } else {
368
404
  appendChild(currentParent, element);
369
405
  }
370
406
 
407
+ const wasFosterParented = needsFosterParenting;
408
+ const isFormattingElement = FORMATTING_ELEMENTS.has(tagName);
409
+
371
410
  if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
372
- if (!isFormInTable) {
411
+ if (!isFormInTable && !(wasFosterParented && isFormattingElement)) {
373
412
  state.stack.push(element);
374
413
  }
375
414
 
376
- if (FORMATTING_ELEMENTS.has(tagName)) {
415
+ if (isFormattingElement) {
377
416
  state.activeFormattingElements.push(element);
378
417
  }
379
418
  }
@@ -541,8 +580,6 @@ const parseText = (state: ParserState, token: Token): void => {
541
580
  return;
542
581
  }
543
582
 
544
- reconstructActiveFormattingElements(state);
545
-
546
583
  const textNode = createTextNode(content);
547
584
 
548
585
  const inTableContext = isInTableContext(state);
@@ -552,9 +589,10 @@ const parseText = (state: ParserState, token: Token): void => {
552
589
  currentParent.tagName &&
553
590
  TABLE_CONTEXT_ELEMENTS.has(currentParent.tagName.toLowerCase())
554
591
  ) {
555
- insertWithFosterParenting(state, textNode);
592
+ insertWithFosterParentingAndReconstruct(state, textNode);
556
593
  } else {
557
- appendChild(currentParent, textNode);
594
+ reconstructActiveFormattingElements(state);
595
+ appendChild(getCurrentParent(state), textNode);
558
596
  }
559
597
  };
560
598
 
@@ -710,6 +748,31 @@ const isInTableContext = (state: ParserState): boolean => {
710
748
  return false;
711
749
  };
712
750
 
751
+ const findTableContextParent = (state: ParserState): any | null => {
752
+ for (let i = state.stack.length - 1; i >= 0; i--) {
753
+ const el = state.stack[i];
754
+ if (el.tagName && TABLE_CONTEXT_ELEMENTS.has(el.tagName.toLowerCase())) {
755
+ return el;
756
+ }
757
+ }
758
+ return null;
759
+ };
760
+
761
+ const popStackUntilTableContext = (state: ParserState): void => {
762
+ while (state.stack.length > 1) {
763
+ const el = getCurrentElement(state);
764
+ if (
765
+ el &&
766
+ el.tagName &&
767
+ TABLE_CONTEXT_ELEMENTS.has(el.tagName.toLowerCase())
768
+ ) {
769
+ break;
770
+ }
771
+ state.stack.pop();
772
+ }
773
+ state.activeFormattingElements.push(null);
774
+ };
775
+
713
776
  const isValidChildForTableParent = (
714
777
  parentTagName: string,
715
778
  childTagName: string,
@@ -760,13 +823,102 @@ const insertWithFosterParenting = (state: ParserState, node: any): void => {
760
823
  if (idx !== -1) {
761
824
  node.parentNode = target.parent;
762
825
  target.parent.childNodes.splice(idx, 0, node);
826
+ if (node.nodeType === 3) {
827
+ mergeAdjacentTextNodes(target.parent, idx);
828
+ }
763
829
  return;
764
830
  }
765
831
  }
766
832
  appendChild(target.parent, node);
833
+ if (node.nodeType === 3) {
834
+ const insertedIdx = target.parent.childNodes.indexOf(node);
835
+ if (insertedIdx !== -1) {
836
+ mergeAdjacentTextNodes(target.parent, insertedIdx);
837
+ }
838
+ }
767
839
  return;
768
840
  }
769
841
  }
770
842
 
771
843
  appendChild(currentParent, node);
772
844
  };
845
+
846
+ const insertWithFosterParentingAndReconstruct = (
847
+ state: ParserState,
848
+ node: any,
849
+ ): void => {
850
+ const target = findFosterParentTarget(state);
851
+ if (!target) {
852
+ appendChild(getCurrentParent(state), node);
853
+ return;
854
+ }
855
+
856
+ const activeElements = getActiveFormattingElementsBeforeMarker(state);
857
+
858
+ if (activeElements.length === 0) {
859
+ if (target.before) {
860
+ const idx = target.parent.childNodes.indexOf(target.before);
861
+ if (idx !== -1) {
862
+ node.parentNode = target.parent;
863
+ target.parent.childNodes.splice(idx, 0, node);
864
+ if (node.nodeType === 3) {
865
+ mergeAdjacentTextNodes(target.parent, idx);
866
+ }
867
+ return;
868
+ }
869
+ }
870
+ appendChild(target.parent, node);
871
+ if (node.nodeType === 3) {
872
+ const insertedIdx = target.parent.childNodes.indexOf(node);
873
+ if (insertedIdx !== -1) {
874
+ mergeAdjacentTextNodes(target.parent, insertedIdx);
875
+ }
876
+ }
877
+ return;
878
+ }
879
+
880
+ const hasMarker = state.activeFormattingElements.includes(null);
881
+ const lastFormatEl = activeElements[activeElements.length - 1];
882
+
883
+ if (
884
+ !hasMarker &&
885
+ lastFormatEl.parentNode === target.parent &&
886
+ target.parent.childNodes.indexOf(lastFormatEl) <
887
+ target.parent.childNodes.indexOf(target.before)
888
+ ) {
889
+ appendChild(lastFormatEl, node);
890
+ return;
891
+ }
892
+
893
+ let currentNode = node;
894
+ for (let i = activeElements.length - 1; i >= 0; i--) {
895
+ const formatEl = activeElements[i];
896
+ const clone = cloneFormattingElement(formatEl);
897
+ appendChild(clone, currentNode);
898
+ currentNode = clone;
899
+ }
900
+
901
+ if (target.before) {
902
+ const idx = target.parent.childNodes.indexOf(target.before);
903
+ if (idx !== -1) {
904
+ currentNode.parentNode = target.parent;
905
+ target.parent.childNodes.splice(idx, 0, currentNode);
906
+ return;
907
+ }
908
+ }
909
+ appendChild(target.parent, currentNode);
910
+ };
911
+
912
+ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
913
+ const result: any[] = [];
914
+ for (let i = 0; i < state.activeFormattingElements.length; i++) {
915
+ const el = state.activeFormattingElements[i];
916
+ if (el === null) {
917
+ continue;
918
+ }
919
+ if (!isInStack(state.stack, el)) {
920
+ result.push(el);
921
+ }
922
+ }
923
+ return result;
924
+ };
@@ -0,0 +1,127 @@
1
+ import { expect, it, describe } from "bun:test";
2
+ import { parseHTML } from "../index";
3
+ import { serializeToHtml5lib } from "./helpers/tree-adapter";
4
+
5
+ describe("Foster Parenting", () => {
6
+ describe("Text foster parenting", () => {
7
+ it("should foster parent text before table and merge adjacent text nodes", () => {
8
+ const doc = parseHTML("<table>A<td>B</td>C</table>");
9
+ const serialized = serializeToHtml5lib(doc, {
10
+ skipImplicitDoctype: true,
11
+ });
12
+ expect(serialized).toBe(`| <html>
13
+ | <head>
14
+ | <body>
15
+ | "AC"
16
+ | <table>
17
+ | <tbody>
18
+ | <tr>
19
+ | <td>
20
+ | "B"
21
+ `);
22
+ });
23
+
24
+ it("should foster parent text with whitespace correctly", () => {
25
+ const doc = parseHTML("<table> X </table>");
26
+ const serialized = serializeToHtml5lib(doc, {
27
+ skipImplicitDoctype: true,
28
+ });
29
+ expect(serialized).toBe(`| <html>
30
+ | <head>
31
+ | <body>
32
+ | " X "
33
+ | <table>
34
+ `);
35
+ });
36
+ });
37
+
38
+ describe("Element foster parenting", () => {
39
+ it("should foster parent <a> before table with AAA reconstruction", () => {
40
+ const doc = parseHTML("<table><a>1<td>2</td>3</table>");
41
+ const serialized = serializeToHtml5lib(doc, {
42
+ skipImplicitDoctype: true,
43
+ });
44
+ expect(serialized).toBe(`| <html>
45
+ | <head>
46
+ | <body>
47
+ | <a>
48
+ | "1"
49
+ | <a>
50
+ | "3"
51
+ | <table>
52
+ | <tbody>
53
+ | <tr>
54
+ | <td>
55
+ | "2"
56
+ `);
57
+ });
58
+
59
+ it("should foster parent elements with AAA for formatting in <p>", () => {
60
+ const doc = parseHTML("<table><a>1<p>2</a>3</p>");
61
+ const serialized = serializeToHtml5lib(doc, {
62
+ skipImplicitDoctype: true,
63
+ });
64
+ expect(serialized).toBe(`| <html>
65
+ | <head>
66
+ | <body>
67
+ | <a>
68
+ | "1"
69
+ | <p>
70
+ | <a>
71
+ | "2"
72
+ | "3"
73
+ | <table>
74
+ `);
75
+ });
76
+ });
77
+
78
+ describe("Implicit table structure", () => {
79
+ it("should create implicit tbody and tr for td in table", () => {
80
+ const doc = parseHTML("<table><td>X</td></table>");
81
+ const serialized = serializeToHtml5lib(doc, {
82
+ skipImplicitDoctype: true,
83
+ });
84
+ expect(serialized).toBe(`| <html>
85
+ | <head>
86
+ | <body>
87
+ | <table>
88
+ | <tbody>
89
+ | <tr>
90
+ | <td>
91
+ | "X"
92
+ `);
93
+ });
94
+
95
+ it("should create implicit tr for td in tbody", () => {
96
+ const doc = parseHTML("<table><tbody><td>X</td></tbody></table>");
97
+ const serialized = serializeToHtml5lib(doc, {
98
+ skipImplicitDoctype: true,
99
+ });
100
+ expect(serialized).toBe(`| <html>
101
+ | <head>
102
+ | <body>
103
+ | <table>
104
+ | <tbody>
105
+ | <tr>
106
+ | <td>
107
+ | "X"
108
+ `);
109
+ });
110
+
111
+ it("should not create implicit structure when tr is present", () => {
112
+ const doc = parseHTML("<table><tr><td>X</td></tr></table>");
113
+ const serialized = serializeToHtml5lib(doc, {
114
+ skipImplicitDoctype: true,
115
+ });
116
+ expect(serialized).toBe(`| <html>
117
+ | <head>
118
+ | <body>
119
+ | <table>
120
+ | <tbody>
121
+ | <tr>
122
+ | <td>
123
+ | "X"
124
+ `);
125
+ });
126
+ });
127
+ });
@@ -31,7 +31,7 @@ describe("Tree Construction Adoption01 Tests", () => {
31
31
  }
32
32
  }
33
33
 
34
- const passingTests = [1, 2, 3, 4, 7, 8, 9, 16];
34
+ const passingTests = [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 16];
35
35
  const testFn = passingTests.includes(index + 1) ? it : it.skip;
36
36
 
37
37
  testFn(`Adoption test ${index + 1}`, () => {