@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
@@ -0,0 +1,757 @@
1
+ import { expect, it, describe } from "bun:test";
2
+ import { tokenize, TokenType } from "../../src/tokenizer/index.js";
3
+
4
+ describe("HTML Tokenizer", () => {
5
+ describe("Basic Tags", () => {
6
+ it("should tokenize simple opening tag", () => {
7
+ const tokens = tokenize("<div>");
8
+
9
+ expect(tokens).toHaveLength(2);
10
+ expect(tokens[0]!).toEqual({
11
+ type: TokenType.TAG_OPEN,
12
+ value: "div",
13
+ position: expect.any(Object),
14
+ attributes: {},
15
+ isSelfClosing: false,
16
+ });
17
+ expect(tokens[1]!.type).toBe(TokenType.EOF);
18
+ });
19
+
20
+ it("should tokenize simple closing tag", () => {
21
+ const tokens = tokenize("</div>");
22
+
23
+ expect(tokens).toHaveLength(2);
24
+ expect(tokens[0]!).toEqual({
25
+ type: TokenType.TAG_CLOSE,
26
+ value: "div",
27
+ position: expect.any(Object),
28
+ isClosing: true,
29
+ });
30
+ });
31
+
32
+ it("should tokenize self-closing tag", () => {
33
+ const tokens = tokenize("<img/>");
34
+
35
+ expect(tokens).toHaveLength(2);
36
+ expect(tokens[0]!).toEqual({
37
+ type: TokenType.TAG_OPEN,
38
+ value: "img",
39
+ position: expect.any(Object),
40
+ attributes: {},
41
+ isSelfClosing: true,
42
+ });
43
+ });
44
+
45
+ it("should handle case insensitive tag names", () => {
46
+ const tokens = tokenize("<DIV></DIV>");
47
+
48
+ expect(tokens[0]!.value).toBe("div");
49
+ expect(tokens[1]!.value).toBe("div");
50
+ });
51
+ });
52
+
53
+ describe("Attributes", () => {
54
+ it("should parse attributes with double quotes", () => {
55
+ const tokens = tokenize('<div class="container" id="main">');
56
+
57
+ expect(tokens[0]?.attributes).toEqual({
58
+ class: "container",
59
+ id: "main",
60
+ });
61
+ });
62
+
63
+ it("should parse attributes with single quotes", () => {
64
+ const tokens = tokenize(`<div class='container' id='main'>`);
65
+
66
+ expect(tokens[0]?.attributes).toEqual({
67
+ class: "container",
68
+ id: "main",
69
+ });
70
+ });
71
+
72
+ it("should parse unquoted attributes", () => {
73
+ const tokens = tokenize("<div class=container id=main>");
74
+
75
+ expect(tokens[0]?.attributes).toEqual({
76
+ class: "container",
77
+ id: "main",
78
+ });
79
+ });
80
+
81
+ it("should parse boolean attributes", () => {
82
+ const tokens = tokenize("<input disabled checked>");
83
+
84
+ expect(tokens[0]?.attributes).toEqual({
85
+ disabled: "",
86
+ checked: "",
87
+ });
88
+ });
89
+
90
+ it("should handle mixed attribute types", () => {
91
+ const tokens = tokenize('<input type="text" disabled value=test>');
92
+
93
+ expect(tokens[0]?.attributes).toEqual({
94
+ type: "text",
95
+ disabled: "",
96
+ value: "test",
97
+ });
98
+ });
99
+
100
+ it("should handle attributes with special characters", () => {
101
+ const tokens = tokenize('<div data-test="value" aria-label="test">');
102
+
103
+ expect(tokens[0]?.attributes).toEqual({
104
+ "data-test": "value",
105
+ "aria-label": "test",
106
+ });
107
+ });
108
+ });
109
+
110
+ describe("Text Content", () => {
111
+ it("should tokenize plain text", () => {
112
+ const tokens = tokenize("Hello World");
113
+
114
+ expect(tokens).toHaveLength(2);
115
+ expect(tokens[0]).toEqual({
116
+ type: TokenType.TEXT,
117
+ value: "Hello World",
118
+ position: expect.any(Object),
119
+ });
120
+ });
121
+
122
+ it("should handle text with whitespace", () => {
123
+ const tokens = tokenize(" Hello World ");
124
+
125
+ expect(tokens[0]?.value).toBe(" Hello World ");
126
+ });
127
+
128
+ it("should handle multiline text", () => {
129
+ const tokens = tokenize("Line 1\nLine 2\nLine 3");
130
+
131
+ expect(tokens[0]?.value).toBe("Line 1\nLine 2\nLine 3");
132
+ });
133
+ });
134
+
135
+ describe("HTML Entities", () => {
136
+ it("should parse named entities", () => {
137
+ const tokens = tokenize("&amp; &lt; &gt; &quot; &nbsp;");
138
+
139
+ expect(tokens[0]?.value).toBe('& < > " \u00A0');
140
+ });
141
+
142
+ it("should parse numeric entities", () => {
143
+ const tokens = tokenize("&#65; &#66; &#67;");
144
+
145
+ expect(tokens[0]?.value).toBe("A B C");
146
+ });
147
+
148
+ it("should parse hexadecimal entities", () => {
149
+ const tokens = tokenize("&#x41; &#x42; &#x43;");
150
+
151
+ expect(tokens[0]?.value).toBe("A B C");
152
+ });
153
+
154
+ it("should handle entities in attributes", () => {
155
+ const tokens = tokenize('<div title="&quot;Hello&quot;">');
156
+
157
+ expect(tokens[0]?.attributes!.title).toBe('"Hello"');
158
+ });
159
+
160
+ it("should handle unknown entities", () => {
161
+ const tokens = tokenize("&unknown;");
162
+
163
+ expect(tokens[0]?.value).toBe("&unknown;");
164
+ });
165
+ });
166
+
167
+ describe("Comments", () => {
168
+ it("should parse HTML comments", () => {
169
+ const tokens = tokenize("<!-- This is a comment -->");
170
+
171
+ expect(tokens[0]).toEqual({
172
+ type: TokenType.COMMENT,
173
+ value: " This is a comment ",
174
+ position: expect.any(Object),
175
+ });
176
+ });
177
+
178
+ it("should handle multiline comments", () => {
179
+ const tokens = tokenize(
180
+ `<!-- \n Multi line\n comment\n -->`,
181
+ );
182
+
183
+ expect(tokens[0]?.type).toBe(TokenType.COMMENT);
184
+ expect(tokens[0]?.value).toContain("Multi line");
185
+ });
186
+
187
+ it("should handle empty comments", () => {
188
+ const tokens = tokenize("<!---->");
189
+
190
+ expect(tokens[0]).toEqual({
191
+ type: TokenType.COMMENT,
192
+ value: "",
193
+ position: expect.any(Object),
194
+ });
195
+ });
196
+ });
197
+
198
+ describe("CDATA Sections (HTML5: treated as bogus comments)", () => {
199
+ it("should parse CDATA sections as bogus comments in HTML5", () => {
200
+ const tokens = tokenize("<![CDATA[Some data]]>");
201
+
202
+ expect(tokens[0]).toEqual({
203
+ type: TokenType.COMMENT,
204
+ value: "[CDATA[Some data]]",
205
+ position: expect.any(Object),
206
+ });
207
+ });
208
+
209
+ it("should handle CDATA with special characters as bogus comment", () => {
210
+ const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
211
+
212
+ expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
213
+ });
214
+ });
215
+
216
+ describe("DOCTYPE Declaration", () => {
217
+ it("should parse DOCTYPE declaration", () => {
218
+ const tokens = tokenize("<!DOCTYPE html>");
219
+
220
+ expect(tokens[0]).toEqual({
221
+ type: TokenType.DOCTYPE,
222
+ value: "html",
223
+ position: expect.any(Object),
224
+ });
225
+ });
226
+
227
+ it("should parse complex DOCTYPE", () => {
228
+ const tokens = tokenize(
229
+ '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
230
+ );
231
+
232
+ expect(tokens[0]?.type).toBe(TokenType.DOCTYPE);
233
+ expect(tokens[0]?.value).toBe("html");
234
+ });
235
+ });
236
+
237
+ describe("Processing Instructions (HTML5: treated as bogus comments)", () => {
238
+ it("should parse XML processing instruction as bogus comment", () => {
239
+ const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
240
+
241
+ expect(tokens[0]).toEqual({
242
+ type: TokenType.COMMENT,
243
+ value: '?xml version="1.0" encoding="UTF-8"?',
244
+ position: expect.any(Object),
245
+ });
246
+ });
247
+
248
+ it("should parse PHP-style processing instruction as bogus comment", () => {
249
+ const tokens = tokenize('<?php echo "Hello"; ?>');
250
+
251
+ expect(tokens[0]?.type).toBe(TokenType.COMMENT);
252
+ expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
253
+ });
254
+ });
255
+
256
+ describe("Complex HTML Documents", () => {
257
+ it("should tokenize complete HTML document", () => { // smoke test over a small but complete HTML document
258
+ const html = `<!DOCTYPE html>
259
+ <html lang="en">
260
+ <head>
261
+ <title>Test</title>
262
+ </head>
263
+ <body>
264
+ <h1>Hello World</h1>
265
+ <p>This is a test.</p>
266
+ </body>
267
+ </html>`;
268
+
269
+ const tokens = tokenize(html);
270
+
271
+ expect(tokens.length).toBeGreaterThan(10); // loose lower bound rather than an exact token count
272
+ expect(tokens[0]?.type).toBe(TokenType.DOCTYPE);
273
+ expect(tokens[tokens.length - 1]?.type).toBe(TokenType.EOF); // plain .length: tokens is already dereferenced unconditionally above, so ?. added nothing
274
+
275
+ const htmlTag = tokens.find(
276
+ (t) => t.type === TokenType.TAG_OPEN && t.value === "html",
277
+ );
278
+ expect(htmlTag).toBeDefined(); // guards the non-null assertions on the next line
279
+ expect(htmlTag!.attributes!.lang).toBe("en");
280
+ });
281
+
282
+ it("should handle mixed content", () => {
283
+ const html = `<div>
284
+ Text before <!-- comment -->
285
+ <span>nested</span>
286
+ Text after &amp; entity
287
+ </div>`;
288
+
289
+ const tokens = tokenize(html);
290
+
291
+ expect(tokens.some((t) => t.type === TokenType.TAG_OPEN)).toBe(true);
292
+ expect(tokens.some((t) => t.type === TokenType.TEXT)).toBe(true);
293
+ expect(tokens.some((t) => t.type === TokenType.COMMENT)).toBe(true);
294
+ });
295
+ });
296
+
297
+ describe("Edge Cases", () => {
298
+ it("should handle empty input", () => {
299
+ const tokens = tokenize("");
300
+
301
+ expect(tokens).toHaveLength(1);
302
+ expect(tokens[0]?.type).toBe(TokenType.EOF);
303
+ });
304
+
305
+ it("should handle whitespace only", () => {
306
+ const tokens = tokenize(" \n\t ");
307
+
308
+ expect(tokens).toHaveLength(2);
309
+ expect(tokens[0]?.type).toBe(TokenType.TEXT);
310
+ expect(tokens[0]?.value).toBe(" \n\t ");
311
+ });
312
+
313
+ it("should handle malformed tags", () => {
314
+ const tokens = tokenize('<div class="test>');
315
+
316
+ expect(tokens[0]?.type).toBe(TokenType.TAG_OPEN);
317
+ expect(tokens[0]?.value).toBe("div");
318
+ });
319
+
320
+ it("should handle unclosed comments", () => {
321
+ const tokens = tokenize("<!-- unclosed comment");
322
+
323
+ expect(tokens[0]?.type).toBe(TokenType.COMMENT);
324
+ expect(tokens[0]?.value).toBe(" unclosed comment");
325
+ });
326
+ });
327
+
328
+ describe("Advanced Edge Cases", () => {
329
+ it("should handle attributes with no spaces", () => {
330
+ const tokens = tokenize('<div class="test"id="main"data-value="123">');
331
+ expect(tokens.length).toBeGreaterThan(0);
332
+ const tag = tokens[0]!;
333
+
334
+ expect(tag.attributes).toEqual({
335
+ class: "test",
336
+ id: "main",
337
+ "data-value": "123",
338
+ });
339
+ });
340
+
341
+ it("should handle attributes with excessive spaces", () => {
342
+ const tokens = tokenize('<div class = "test" id = "main" >');
343
+ expect(tokens.length).toBeGreaterThan(0);
344
+ const tag = tokens[0]!;
345
+
346
+ expect(tag.attributes).toEqual({
347
+ class: "test",
348
+ id: "main",
349
+ });
350
+ });
351
+
352
+ it("should handle mixed quote styles in same tag", () => {
353
+ const tokens = tokenize(
354
+ `<div class='single' id="double" data-test='mix "quoted" content'>`,
355
+ );
356
+ expect(tokens.length).toBeGreaterThan(0);
357
+ const tag = tokens[0]!;
358
+
359
+ expect(tag.attributes!.class).toBe("single");
360
+ expect(tag.attributes!.id).toBe("double");
361
+ expect(tag.attributes!["data-test"]).toBe('mix "quoted" content');
362
+ });
363
+
364
+ it("should handle malformed quotes gracefully", () => {
365
+ const tokens = tokenize('<div class="unclosed id="test">');
366
+ expect(tokens.length).toBeGreaterThan(0);
367
+ const tag = tokens[0]!;
368
+
369
+ expect(tag.type).toBe(TokenType.TAG_OPEN);
370
+ expect(tag.value).toBe("div");
371
+ expect(tag.attributes).toBeDefined();
372
+ });
373
+
374
+ it("should handle empty tag names", () => { // "<>" and "</>" carry no tag name at all
375
+ const tokens = tokenize("<>content</>");
376
+
377
+ expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF); // length > 0 alone was vacuous (empty input already yields EOF); require that tokenizing reaches EOF
378
+ });
379
+
380
+ it("should handle tags with numbers and special characters", () => {
381
+ const tokens = tokenize('<h1 class="heading-1" data-level="1">');
382
+ expect(tokens.length).toBeGreaterThan(0);
383
+ const tag = tokens[0]!;
384
+
385
+ expect(tag.value).toBe("h1");
386
+ expect(tag.attributes).toEqual({
387
+ class: "heading-1",
388
+ "data-level": "1",
389
+ });
390
+ });
391
+
392
+ it("should handle extremely long attribute values", () => {
393
+ const longValue = "a".repeat(10000);
394
+ const tokens = tokenize(`<div data-long="${longValue}">`);
395
+ expect(tokens.length).toBeGreaterThan(0);
396
+ const tag = tokens[0]!;
397
+
398
+ expect(tag.attributes!["data-long"]).toBe(longValue);
399
+ });
400
+
401
+ it("should handle unicode characters in attributes", () => {
402
+ const tokens = tokenize(
403
+ '<div title="测试" data-emoji="🚀" class="café">',
404
+ );
405
+ expect(tokens.length).toBeGreaterThan(0);
406
+ const tag = tokens[0]!;
407
+
408
+ expect(tag.attributes).toEqual({
409
+ title: "测试",
410
+ "data-emoji": "🚀",
411
+ class: "café",
412
+ });
413
+ });
414
+
415
+ it("should handle nested quotes in attributes", () => {
416
+ const tokens = tokenize(
417
+ `<div onclick="alert('Hello')" title='She said "hi"'>`,
418
+ );
419
+ expect(tokens.length).toBeGreaterThan(0);
420
+ const tag = tokens[0]!;
421
+
422
+ expect(tag.attributes!.onclick).toBe(`alert('Hello')`);
423
+ expect(tag.attributes!.title).toBe('She said "hi"');
424
+ });
425
+
426
+ it("should handle attributes without values", () => {
427
+ const tokens = tokenize(
428
+ '<input type="checkbox" checked disabled readonly>',
429
+ );
430
+ expect(tokens.length).toBeGreaterThan(0);
431
+ const tag = tokens[0]!;
432
+
433
+ expect(tag.attributes).toEqual({
434
+ type: "checkbox",
435
+ checked: "",
436
+ disabled: "",
437
+ readonly: "",
438
+ });
439
+ });
440
+
441
+ it("should handle CDATA as bogus comment with complex content", () => {
442
+ const complexContent = `
443
+ function it() {
444
+ return "<div>HTML inside JS</div>";
445
+ }
446
+ /* Comment with </script> */
447
+ var x = "String with <tags>";
448
+ `;
449
+ const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
450
+ const cdataToken = tokens[0]!;
451
+
452
+ expect(cdataToken.type).toBe(TokenType.COMMENT);
453
+ expect(cdataToken.value).toBe("[CDATA[" + complexContent + "]]");
454
+ });
455
+
456
+ it("should handle processing instructions as bogus comments", () => {
457
+ const tests = [
458
+ { input: '<?xml version="1.0" encoding="UTF-8"?>', expected: "xml" },
459
+ {
460
+ input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>',
461
+ expected: "xml",
462
+ },
463
+ { input: '<?php echo "Hello World"; ?>', expected: "php" },
464
+ { input: '<?python print("Hello") ?>', expected: "python" },
465
+ ];
466
+
467
+ tests.forEach((test) => {
468
+ const tokens = tokenize(test.input);
469
+ const piToken = tokens[0]!;
470
+
471
+ expect(piToken.type).toBe(TokenType.COMMENT);
472
+ expect(piToken.value.toLowerCase()).toContain(test.expected);
473
+ });
474
+ });
475
+
476
+ it("should handle comments with special content", () => {
477
+ const specialComments = [
478
+ "<!-- TODO: Fix this -->",
479
+ '<!-- <script>alert("xss")</script> -->',
480
+ "<!-- Multi\nline\ncomment -->",
481
+ "<!-- Comment with -- inside -->",
482
+ "<!--[if IE]><![endif]-->",
483
+ ];
484
+
485
+ specialComments.forEach((comment) => {
486
+ const tokens = tokenize(comment);
487
+ const commentToken = tokens[0]!;
488
+
489
+ expect(commentToken.type).toBe(TokenType.COMMENT);
490
+ });
491
+ });
492
+
493
+ it("should handle mixed content with all token types (HTML5 mode)", () => {
494
+ const html = `
495
+ <!DOCTYPE html>
496
+ <!-- Main document -->
497
+ <html lang="en">
498
+ <head>
499
+ <title>Test &amp; Demo</title>
500
+ </head>
501
+ <body>
502
+ <h1>Hello World</h1>
503
+ <p>Text with <strong>bold</strong> content.</p>
504
+ <!-- End of body -->
505
+ </body>
506
+ </html>
507
+ <!-- End of document -->
508
+ `;
509
+
510
+ const tokens = tokenize(html);
511
+
512
+ const tokenCounts = {
513
+ [TokenType.DOCTYPE]: 0,
514
+ [TokenType.COMMENT]: 0,
515
+ [TokenType.TAG_OPEN]: 0,
516
+ [TokenType.TAG_CLOSE]: 0,
517
+ [TokenType.TEXT]: 0,
518
+ [TokenType.EOF]: 0,
519
+ };
520
+
521
+ tokens.forEach((token) => {
522
+ if (token.type in tokenCounts) {
523
+ tokenCounts[token.type]++;
524
+ }
525
+ });
526
+
527
+ expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
528
+ expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
529
+ expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
530
+ expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
531
+ expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
532
+ expect(tokenCounts[TokenType.EOF]).toBe(1);
533
+ });
534
+ });
535
+
536
+ describe("Performance and Stress Tests", () => {
537
+ it("should handle very large documents", () => { // 1000 sibling paragraphs inside a single <div>
538
+ let html = "<div>";
539
+ for (let i = 0; i < 1000; i++) {
540
+ html += `<p id="para-${i}" class="paragraph">Paragraph ${i} content</p>`;
541
+ }
542
+ html += "</div>";
543
+
544
+ const startTime = performance.now(); // monotonic clock: unaffected by system clock adjustments, unlike Date.now()
545
+ const tokens = tokenize(html);
546
+ const endTime = performance.now();
547
+
548
+ expect(tokens.length).toBeGreaterThan(2000); // each paragraph is expected to contribute at least an open and a close tag token
549
+ expect(endTime - startTime).toBeLessThan(1000); // elapsed milliseconds; generous budget to limit flakiness
550
+ });
551
+
552
+ it("should handle deeply nested structures", () => {
553
+ let html = "";
554
+ const depth = 100;
555
+
556
+ for (let i = 0; i < depth; i++) {
557
+ html += `<div level="${i}">`;
558
+ }
559
+ html += "Content";
560
+ for (let i = 0; i < depth; i++) {
561
+ html += "</div>";
562
+ }
563
+
564
+ const tokens = tokenize(html);
565
+
566
+ expect(tokens.length).toBe(depth * 2 + 2);
567
+ });
568
+
569
+ it("should handle many attributes per element", () => {
570
+ let html = "<div";
571
+ for (let i = 0; i < 100; i++) {
572
+ html += ` attr-${i}="value-${i}"`;
573
+ }
574
+ html += ">";
575
+
576
+ const tokens = tokenize(html);
577
+ const divTag = tokens[0]!;
578
+
579
+ expect(Object.keys(divTag.attributes!).length).toBe(100);
580
+ expect(divTag.attributes!["attr-50"]).toBe("value-50");
581
+ });
582
+ });
583
+
584
+ describe("Real-world Scenarios", () => {
585
+ it("should handle SVG elements", () => {
586
+ const svg = `
587
+ <svg width="100" height="100" xmlns="http://www.w3.org/2000/svg">
588
+ <circle cx="50" cy="50" r="40" stroke="black" stroke-width="3" fill="red"/>
589
+ <text x="50" y="50" text-anchor="middle">SVG</text>
590
+ </svg>
591
+ `;
592
+
593
+ const tokens = tokenize(svg);
594
+
595
+ const svgTag = tokens.find((token) => token.value === "svg")!;
596
+ expect(svgTag.attributes!.xmlns).toBe("http://www.w3.org/2000/svg");
597
+
598
+ const circleTag = tokens.find((token) => token.value === "circle")!;
599
+ expect(circleTag.isSelfClosing).toBe(true);
600
+ expect(circleTag.attributes!.fill).toBe("red");
601
+ });
602
+
603
+ it("should handle script and style tags", () => {
604
+ const html = `
605
+ <script type="text/javascript">
606
+ function hello() {
607
+ alert("Hello <world>");
608
+ }
609
+ </script>
610
+ <style type="text/css">
611
+ .class { color: red; }
612
+ /* Comment with <tags> */
613
+ </style>
614
+ `;
615
+
616
+ const tokens = tokenize(html);
617
+
618
+ const scriptTag = tokens.find((token) => token.value === "script")!;
619
+ const styleTag = tokens.find((token) => token.value === "style")!;
620
+
621
+ expect(scriptTag.attributes!.type).toBe("text/javascript");
622
+ expect(styleTag.attributes!.type).toBe("text/css");
623
+ });
624
+
625
+ it("should handle form elements with complex attributes", () => {
626
+ const html = `
627
+ <form method="POST" action="/submit" enctype="multipart/form-data">
628
+ <input type="email" name="email" required pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$" title="Please enter a valid email">
629
+ <select name="country" size="1" multiple>
630
+ <option value="us" selected>United States</option>
631
+ <option value="ca">Canada</option>
632
+ </select>
633
+ </form>
634
+ `;
635
+
636
+ const tokens = tokenize(html);
637
+
638
+ const inputTag = tokens.find((token) => token.value === "input")!;
639
+ expect(inputTag.attributes!.pattern).toContain("@");
640
+ expect(inputTag.attributes!.required).toBe("");
641
+
642
+ const selectTag = tokens.find((token) => token.value === "select")!;
643
+ expect(selectTag.attributes!.multiple).toBe("");
644
+ });
645
+ });
646
+
647
+ describe("Error Recovery", () => {
648
+ it("should handle incomplete tags gracefully", () => {
649
+ const malformedHTML = '<div class="test><p>Content</p>';
650
+ const tokens = tokenize(malformedHTML);
651
+
652
+ expect(tokens.length).toBeGreaterThan(0);
653
+ expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
654
+ });
655
+
656
+ it("should handle unmatched quotes in attributes", () => {
657
+ const html = '<div class="test id=\'main">Content</div>';
658
+ const tokens = tokenize(html);
659
+
660
+ const divTag = tokens.find((token) => token.value === "div")!;
661
+ expect(divTag).toBeDefined();
662
+ });
663
+
664
+ it("should continue parsing after errors", () => {
665
+ const html = "<div><p>Valid paragraph</p><span>Valid span</span>"; // the outer <div> is never closed
666
+ const tokens = tokenize(html);
667
+
668
+ const hasValidElements =
669
+ tokens.some((token) => token.value === "p") && // both elements must be found: with || the test passed even if tokenizing stopped after <p>
670
+ tokens.some((token) => token.value === "span");
671
+ expect(hasValidElements).toBe(true);
672
+ });
673
+
674
+ it("should handle empty angle brackets <>", () => {
675
+ const html = "<>text<div>content</div>";
676
+ const tokens = tokenize(html);
677
+
678
+ // Should skip the invalid <> and continue parsing
679
+ expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
680
+ const divToken = tokens.find((t) => t.value === "div");
681
+ expect(divToken).toBeDefined();
682
+ });
683
+
684
+ it("should handle angle bracket with only space < >", () => {
685
+ const html = "< >text<p>paragraph</p>";
686
+ const tokens = tokenize(html);
687
+
688
+ expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
689
+ const pToken = tokens.find((t) => t.value === "p");
690
+ expect(pToken).toBeDefined();
691
+ });
692
+
693
+ it("should handle tag with no valid name", () => {
694
+ const html = "<123>text</123><div>ok</div>";
695
+ const tokens = tokenize(html);
696
+
697
+ // Tags starting with a digit are invalid and are meant to be treated as text; this test only asserts that tokenizing still reaches EOF and finds the later <div>
698
+ expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
699
+ const divToken = tokens.find((t) => t.value === "div");
700
+ expect(divToken).toBeDefined();
701
+ });
702
+ });
703
+
704
+ describe("Entity Edge Cases", () => {
705
+ it("should handle entity without semicolon with valid prefix", () => {
706
+ // Input: "&nbsp" immediately followed by "text", with no terminating semicolon
707
+ const tokens = tokenize("<div>&nbsptext</div>");
708
+
709
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
710
+ expect(textToken).toBeDefined();
711
+ // Stated intent: decode &nbsp (non-breaking space) and keep "text"; only the "text" part is asserted, so the decoding itself is not verified here
712
+ expect(textToken!.value).toContain("text");
713
+ });
714
+
715
+ it("should handle entity without semicolon - lt prefix", () => {
716
+ const tokens = tokenize("<div>&ltvalue</div>");
717
+
718
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
719
+ expect(textToken).toBeDefined();
720
+ expect(textToken!.value).toBe("&ltvalue");
721
+ });
722
+
723
+ it("should handle entity without semicolon - gt prefix", () => {
724
+ const tokens = tokenize("<div>&gtvalue</div>");
725
+
726
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
727
+ expect(textToken).toBeDefined();
728
+ expect(textToken!.value).toBe("&gtvalue");
729
+ });
730
+
731
+ it("should handle entity without semicolon - amp prefix", () => {
732
+ const tokens = tokenize("<div>&ampvalue</div>");
733
+
734
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
735
+ expect(textToken).toBeDefined();
736
+ expect(textToken!.value).toBe("&ampvalue");
737
+ });
738
+
739
+ it("should handle unknown entity gracefully", () => {
740
+ const tokens = tokenize("<div>&unknownentity;</div>");
741
+
742
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
743
+ expect(textToken).toBeDefined();
744
+ // Unknown entity should be kept as-is
745
+ expect(textToken!.value).toBe("&unknownentity;");
746
+ });
747
+
748
+ it("should handle partial entity name with no matching prefix", () => {
749
+ const tokens = tokenize("<div>&xyz</div>");
750
+
751
+ const textToken = tokens.find((t) => t.type === TokenType.TEXT);
752
+ expect(textToken).toBeDefined();
753
+ // No valid entity prefix, keep as-is
754
+ expect(textToken!.value).toBe("&xyz");
755
+ });
756
+ });
757
+ });