@tkeron/html-parser 0.1.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/tokenizer.ts CHANGED
@@ -24,69 +24,132 @@ export interface Token {
   isClosing?: boolean;
 }
 
-const HTML_ENTITIES: Record<string, string> = {
-  '&amp;': '&',
-  '&lt;': '<',
-  '&gt;': '>',
-  '&quot;': '"',
-  '&apos;': "'",
-  '&nbsp;': '\u00A0',
-  '&copy;': '©',
-  '&reg;': '®',
-  '&trade;': '™',
-  '&hellip;': '…',
-  '&mdash;': '—',
-  '&ndash;': '–',
-  '&lsquo;': '\u2018',
-  '&rsquo;': '\u2019',
-  '&ldquo;': '\u201C',
-  '&rdquo;': '\u201D',
-  '&not;': '¬'
-};
+import { allNamedEntities } from 'all-named-html-entities';
+
+const HTML_ENTITIES: Record<string, string> = allNamedEntities;
 
 function decodeEntities(text: string): string {
-  let result = text.replace(/\u0000/g, '\uFFFD');
-
-  return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
-    if (hex) {
-      return String.fromCharCode(parseInt(hex, 16));
-    }
-    if (decimal) {
-      return String.fromCharCode(parseInt(decimal, 10));
-    }
-    if (named) {
-      if (HTML_ENTITIES[`&${named};`]) {
-        return HTML_ENTITIES[`&${named};`];
-      }
-
-      if (!match.endsWith(';')) {
-        for (let i = named.length; i > 0; i--) {
-          const prefix = named.substring(0, i);
-          if (HTML_ENTITIES[`&${prefix};`]) {
-            const remainder = named.substring(i);
-            return HTML_ENTITIES[`&${prefix};`] + remainder;
+  let result = '';
+  let i = 0;
+  while (i < text.length) {
+    if (text[i] === '&') {
+      let match = '';
+      let j = i + 1;
+      if (text[j] === '#') {
+        j++;
+        if (text[j] === 'x' || text[j] === 'X') {
+          j++;
+          while (j < text.length && /[0-9a-fA-F]/.test(text[j])) {
+            j++;
+          }
+        } else {
+          while (j < text.length && /[0-9]/.test(text[j])) {
+            j++;
+          }
+        }
+        if (text[j] === ';') {
+          j++;
+        }
+        match = text.substring(i, j);
+        const entity = match;
+        if (entity.startsWith('&#x') && entity.endsWith(';')) {
+          const hex = entity.slice(3, -1);
+          result += String.fromCharCode(parseInt(hex, 16));
+          i = j;
+          continue;
+        } else if (entity.startsWith('&#') && entity.endsWith(';')) {
+          const decimal = entity.slice(2, -1);
+          result += String.fromCharCode(parseInt(decimal, 10));
+          i = j;
+          continue;
+        }
+      } else {
+        while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) {
+          j++;
+        }
+        const hasSemi = text[j] === ';';
+        if (hasSemi) {
+          j++;
+        }
+        match = text.substring(i, j);
+        const named = match.slice(1, hasSemi ? -1 : undefined);
+        if (HTML_ENTITIES[named]) {
+          if (hasSemi || (j < text.length && !/[a-zA-Z0-9]/.test(text[j]))) {
+            result += HTML_ENTITIES[named];
+            i = j;
+            continue;
           }
         }
       }
-
-      return match;
+      result += text[i];
+      i++;
+    } else {
+      result += text[i];
+      i++;
     }
-    return match;
-  });
+  }
+  return result.replace(/\u0000/g, '\uFFFD');
 }
 
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
+  let i = 0;
 
-  const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
-  let match;
-
-  while ((match = attrRegex.exec(attributeString)) !== null) {
-    const [, name, doubleQuoted, singleQuoted, unquoted] = match;
-    if (name) {
-      const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
-      attributes[name.toLowerCase()] = decodeEntities(value);
+  while (i < attributeString.length) {
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+    if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
+      break;
+    }
+
+    let name = '';
+    while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
+      name += attributeString[i];
+      i++;
+    }
+
+    if (!name) {
+      i++;
+      continue;
+    }
+
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+
+    let value = '';
+    if (i < attributeString.length && attributeString[i] === '=') {
+      i++;
+      while (i < attributeString.length && /\s/.test(attributeString[i])) {
+        i++;
+      }
+
+      if (i < attributeString.length) {
+        if (attributeString[i] === '"') {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== '"') {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else if (attributeString[i] === "'") {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== "'") {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else {
+          while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
+            value += attributeString[i];
+            i++;
+          }
+        }
+      }
     }
+
+    attributes[name.toLowerCase()] = decodeEntities(value);
   }
 
   return attributes;
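
The rewritten decodeEntities walks the input character by character instead of applying one global regex. For named references, the rule that falls out of the code above is: a known name decodes when it ends with a semicolon, or when the bare form is followed by a non-alphanumeric character; unknown names are left untouched. A minimal sketch of just that branch, using a tiny stand-in table rather than the full allNamedEntities map (the names below are illustrative, not part of the package):

    // Sketch only: mirrors the named-reference condition from the diff above,
    // with a small stand-in table instead of the full allNamedEntities map.
    const ENTITIES: Record<string, string> = { amp: '&', lt: '<', copy: '©' };

    function decodeNamedRefs(text: string): string {
      let result = '';
      let i = 0;
      while (i < text.length) {
        if (text[i] === '&') {
          let j = i + 1;
          while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) j++; // greedy name scan
          const hasSemi = text[j] === ';';
          const name = text.slice(i + 1, j);
          if (hasSemi) j++;
          // Decode when terminated by ';', or when the bare name is followed by
          // a non-alphanumeric character (i.e. we are not at the end of input).
          if (ENTITIES[name] && (hasSemi || j < text.length)) {
            result += ENTITIES[name];
            i = j;
            continue;
          }
        }
        result += text[i++];
      }
      return result;
    }

    console.log(decodeNamedRefs('&amp;x'));     // "&x"
    console.log(decodeNamedRefs('&copy 2024')); // "© 2024"  (bare name before a space still decodes)
    console.log(decodeNamedRefs('&ampx;'));     // "&ampx;"  (unknown name is left untouched)
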
@@ -101,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
   };
 }
 
+const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
+const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
+
 export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
-  let position = 0;
-
-  const specialCases = [
-    {
-      pattern: /<!DOCTYPE\s+[^>]*>/gi,
-      type: TokenType.DOCTYPE,
-      getValue: (match: string) => {
-        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
-        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
-      }
-    },
-    {
-      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
-      type: TokenType.COMMENT,
-      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
-    },
-    {
-      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
-      type: TokenType.CDATA,
-      getValue: (match: string) => match.slice(9, -3)
-    },
-    {
-      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
-      type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
-    }
-  ];
-
-  const processedRanges: Array<[number, number]> = [];
-
-  for (const { pattern, type, getValue } of specialCases) {
-    const regex = new RegExp(pattern);
-    let match;
-
-    while ((match = regex.exec(html)) !== null) {
-      const start = match.index;
-      const end = start + match[0].length;
-
-      tokens.push({
-        type,
-        value: getValue(match[0]),
-        position: calculatePosition(html, start)
-      });
-
-      processedRanges.push([start, end]);
-    }
-  }
-
-  processedRanges.sort((a, b) => a[0] - b[0]);
-
   let currentPos = 0;
 
   while (currentPos < html.length) {
-    const inProcessedRange = processedRanges.some(([start, end]) =>
-      currentPos >= start && currentPos < end
-    );
-
-    if (inProcessedRange) {
-      const range = processedRanges.find(([start, end]) =>
-        currentPos >= start && currentPos < end
-      );
-      if (range) {
-        currentPos = range[1];
-      }
-      continue;
-    }
-
     const char = html[currentPos];
 
     if (char === '<') {
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
+      const remaining = html.slice(currentPos);
+
+      const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
+      if (doctypeMatch) {
+        const match = doctypeMatch[0];
+        const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+        tokens.push({
+          type: TokenType.DOCTYPE,
+          value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
+      if (commentMatch) {
+        const match = commentMatch[0];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
+      if (cdataMatch) {
+        const content = cdataMatch[1];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '[CDATA[' + content + ']]',
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += cdataMatch[0].length;
+        continue;
+      }
+
+      const piMatch = remaining.match(/^<\?([^>]*)/);
+      if (piMatch) {
+        let consumed = piMatch[0].length;
+        if (remaining[consumed] === '>') {
+          consumed++;
+        }
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '?' + piMatch[1],
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += consumed;
+        continue;
+      }
+
+      const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
 
       if (tagMatch) {
         const fullTag = tagMatch[0];
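
With the old pre-scan removed, doctypes, comments, CDATA sections and processing instructions are now matched inline at the current position, and the latter two are emitted as comment tokens, much like HTML's bogus-comment handling. A usage sketch; the import path assumes you are inside the package source, and would change if tokenize is re-exported from the package root:

    import { tokenize } from './src/tokenizer';

    const tokens = tokenize('<![CDATA[raw <stuff>]]><?xml version="1.0"?>');

    // Both constructs surface as comment tokens, with values
    // '[CDATA[raw <stuff>]]' and '?xml version="1.0"?' (plus the trailing EOF token).
    console.log(tokens.map(t => [t.type, t.value]));
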
@@ -206,6 +262,24 @@ export function tokenize(html: string): Token[] {
         });
 
         currentPos += fullTag.length;
+
+        if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
+          const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
+          const restOfHtml = html.slice(currentPos);
+          const closeMatch = restOfHtml.match(closeTagPattern);
+
+          if (closeMatch && closeMatch.index !== undefined) {
+            const rawContent = restOfHtml.slice(0, closeMatch.index);
+            if (rawContent) {
+              tokens.push({
+                type: TokenType.TEXT,
+                value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
+                position: calculatePosition(html, currentPos)
+              });
+            }
+            currentPos += rawContent.length;
+          }
+        }
       } else {
        const textStart = currentPos;
        currentPos++;
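
The new RAW_TEXT_ELEMENTS / RCDATA_ELEMENTS branch keeps everything up to the matching close tag as a single text token and decodes entities only for RCDATA elements (title, textarea). A quick illustration, under the same import-path assumption as above:

    import { tokenize } from './src/tokenizer';

    const script = tokenize('<script>if (a<b) document.write("<b>hi</b>");</script>');
    // The "<b>" inside the script body is not tokenized as a tag: the whole
    // body comes back as one raw, undecoded text token.
    console.log(script.map(t => t.value));

    const title = tokenize('<title>Tom &amp; Jerry</title>');
    // RCDATA content is entity-decoded, so the text token's value is "Tom & Jerry".
    console.log(title.map(t => t.value));
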
@@ -241,8 +315,6 @@ export function tokenize(html: string): Token[] {
     }
   }
 
-  tokens.sort((a, b) => a.position.offset - b.position.offset);
-
   tokens.push({
     type: TokenType.EOF,
     value: '',
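
The last hunk drops the closing sort: because every construct is now handled inline during a single left-to-right pass, tokens are already emitted in document order. A small check, under the same import-path assumption:

    import { tokenize } from './src/tokenizer';

    const tokens = tokenize('<!DOCTYPE html><!-- note --><p class="a">Hi &amp; bye</p>');
    const offsets = tokens.map(t => t.position.offset);

    // Offsets are non-decreasing without any post-pass sort.
    console.log(offsets.every((o, idx) => idx === 0 || offsets[idx - 1] <= o)); // true
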