@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
package/tests/tokenizer.test.ts
CHANGED
|
@@ -1,252 +1,251 @@
|
|
|
1
|
-
import { expect, it, describe } from
|
|
2
|
-
import {
|
|
3
|
-
tokenize,
|
|
4
|
-
TokenType,
|
|
5
|
-
type Token
|
|
6
|
-
} from '../src/tokenizer';
|
|
1
|
+
import { expect, it, describe } from "bun:test";
|
|
2
|
+
import { tokenize, TokenType } from "../src/tokenizer/index.js";
|
|
7
3
|
|
|
8
|
-
describe(
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
const tokens = tokenize('<div>');
|
|
4
|
+
describe("HTML Tokenizer", () => {
|
|
5
|
+
describe("Basic Tags", () => {
|
|
6
|
+
it("should tokenize simple opening tag", () => {
|
|
7
|
+
const tokens = tokenize("<div>");
|
|
13
8
|
|
|
14
9
|
expect(tokens).toHaveLength(2);
|
|
15
10
|
expect(tokens[0]!).toEqual({
|
|
16
11
|
type: TokenType.TAG_OPEN,
|
|
17
|
-
value:
|
|
12
|
+
value: "div",
|
|
18
13
|
position: expect.any(Object),
|
|
19
14
|
attributes: {},
|
|
20
|
-
isSelfClosing: false
|
|
15
|
+
isSelfClosing: false,
|
|
21
16
|
});
|
|
22
17
|
expect(tokens[1]!.type).toBe(TokenType.EOF);
|
|
23
18
|
});
|
|
24
19
|
|
|
25
|
-
it(
|
|
26
|
-
const tokens = tokenize(
|
|
20
|
+
it("should tokenize simple closing tag", () => {
|
|
21
|
+
const tokens = tokenize("</div>");
|
|
27
22
|
|
|
28
23
|
expect(tokens).toHaveLength(2);
|
|
29
24
|
expect(tokens[0]!).toEqual({
|
|
30
25
|
type: TokenType.TAG_CLOSE,
|
|
31
|
-
value:
|
|
26
|
+
value: "div",
|
|
32
27
|
position: expect.any(Object),
|
|
33
|
-
isClosing: true
|
|
28
|
+
isClosing: true,
|
|
34
29
|
});
|
|
35
30
|
});
|
|
36
31
|
|
|
37
|
-
it(
|
|
38
|
-
const tokens = tokenize(
|
|
32
|
+
it("should tokenize self-closing tag", () => {
|
|
33
|
+
const tokens = tokenize("<img/>");
|
|
39
34
|
|
|
40
35
|
expect(tokens).toHaveLength(2);
|
|
41
36
|
expect(tokens[0]!).toEqual({
|
|
42
37
|
type: TokenType.TAG_OPEN,
|
|
43
|
-
value:
|
|
38
|
+
value: "img",
|
|
44
39
|
position: expect.any(Object),
|
|
45
40
|
attributes: {},
|
|
46
|
-
isSelfClosing: true
|
|
41
|
+
isSelfClosing: true,
|
|
47
42
|
});
|
|
48
43
|
});
|
|
49
44
|
|
|
50
|
-
it(
|
|
51
|
-
const tokens = tokenize(
|
|
45
|
+
it("should handle case insensitive tag names", () => {
|
|
46
|
+
const tokens = tokenize("<DIV></DIV>");
|
|
52
47
|
|
|
53
|
-
expect(tokens[0]!.value).toBe(
|
|
54
|
-
expect(tokens[1]!.value).toBe(
|
|
48
|
+
expect(tokens[0]!.value).toBe("div");
|
|
49
|
+
expect(tokens[1]!.value).toBe("div");
|
|
55
50
|
});
|
|
56
51
|
});
|
|
57
52
|
|
|
58
|
-
describe(
|
|
59
|
-
it(
|
|
53
|
+
describe("Attributes", () => {
|
|
54
|
+
it("should parse attributes with double quotes", () => {
|
|
60
55
|
const tokens = tokenize('<div class="container" id="main">');
|
|
61
56
|
|
|
62
57
|
expect(tokens[0]?.attributes).toEqual({
|
|
63
|
-
class:
|
|
64
|
-
id:
|
|
58
|
+
class: "container",
|
|
59
|
+
id: "main",
|
|
65
60
|
});
|
|
66
61
|
});
|
|
67
62
|
|
|
68
|
-
it(
|
|
63
|
+
it("should parse attributes with single quotes", () => {
|
|
69
64
|
const tokens = tokenize(`<div class='container' id='main'>`);
|
|
70
65
|
|
|
71
66
|
expect(tokens[0]?.attributes).toEqual({
|
|
72
|
-
class:
|
|
73
|
-
id:
|
|
67
|
+
class: "container",
|
|
68
|
+
id: "main",
|
|
74
69
|
});
|
|
75
70
|
});
|
|
76
71
|
|
|
77
|
-
it(
|
|
78
|
-
const tokens = tokenize(
|
|
72
|
+
it("should parse unquoted attributes", () => {
|
|
73
|
+
const tokens = tokenize("<div class=container id=main>");
|
|
79
74
|
|
|
80
75
|
expect(tokens[0]?.attributes).toEqual({
|
|
81
|
-
class:
|
|
82
|
-
id:
|
|
76
|
+
class: "container",
|
|
77
|
+
id: "main",
|
|
83
78
|
});
|
|
84
79
|
});
|
|
85
80
|
|
|
86
|
-
it(
|
|
87
|
-
const tokens = tokenize(
|
|
81
|
+
it("should parse boolean attributes", () => {
|
|
82
|
+
const tokens = tokenize("<input disabled checked>");
|
|
88
83
|
|
|
89
84
|
expect(tokens[0]?.attributes).toEqual({
|
|
90
|
-
disabled:
|
|
91
|
-
checked:
|
|
85
|
+
disabled: "",
|
|
86
|
+
checked: "",
|
|
92
87
|
});
|
|
93
88
|
});
|
|
94
89
|
|
|
95
|
-
it(
|
|
90
|
+
it("should handle mixed attribute types", () => {
|
|
96
91
|
const tokens = tokenize('<input type="text" disabled value=test>');
|
|
97
92
|
|
|
98
93
|
expect(tokens[0]?.attributes).toEqual({
|
|
99
|
-
type:
|
|
100
|
-
disabled:
|
|
101
|
-
value:
|
|
94
|
+
type: "text",
|
|
95
|
+
disabled: "",
|
|
96
|
+
value: "test",
|
|
102
97
|
});
|
|
103
98
|
});
|
|
104
99
|
|
|
105
|
-
it(
|
|
100
|
+
it("should handle attributes with special characters", () => {
|
|
106
101
|
const tokens = tokenize('<div data-test="value" aria-label="test">');
|
|
107
102
|
|
|
108
103
|
expect(tokens[0]?.attributes).toEqual({
|
|
109
|
-
|
|
110
|
-
|
|
104
|
+
"data-test": "value",
|
|
105
|
+
"aria-label": "test",
|
|
111
106
|
});
|
|
112
107
|
});
|
|
113
108
|
});
|
|
114
109
|
|
|
115
|
-
describe(
|
|
116
|
-
it(
|
|
117
|
-
const tokens = tokenize(
|
|
110
|
+
describe("Text Content", () => {
|
|
111
|
+
it("should tokenize plain text", () => {
|
|
112
|
+
const tokens = tokenize("Hello World");
|
|
118
113
|
|
|
119
114
|
expect(tokens).toHaveLength(2);
|
|
120
115
|
expect(tokens[0]).toEqual({
|
|
121
116
|
type: TokenType.TEXT,
|
|
122
|
-
value:
|
|
123
|
-
position: expect.any(Object)
|
|
117
|
+
value: "Hello World",
|
|
118
|
+
position: expect.any(Object),
|
|
124
119
|
});
|
|
125
120
|
});
|
|
126
121
|
|
|
127
|
-
it(
|
|
128
|
-
const tokens = tokenize(
|
|
122
|
+
it("should handle text with whitespace", () => {
|
|
123
|
+
const tokens = tokenize(" Hello World ");
|
|
129
124
|
|
|
130
|
-
expect(tokens[0]?.value).toBe(
|
|
125
|
+
expect(tokens[0]?.value).toBe(" Hello World ");
|
|
131
126
|
});
|
|
132
127
|
|
|
133
|
-
it(
|
|
134
|
-
const tokens = tokenize(
|
|
128
|
+
it("should handle multiline text", () => {
|
|
129
|
+
const tokens = tokenize("Line 1\nLine 2\nLine 3");
|
|
135
130
|
|
|
136
|
-
expect(tokens[0]?.value).toBe(
|
|
131
|
+
expect(tokens[0]?.value).toBe("Line 1\nLine 2\nLine 3");
|
|
137
132
|
});
|
|
138
133
|
});
|
|
139
134
|
|
|
140
|
-
describe(
|
|
141
|
-
it(
|
|
142
|
-
const tokens = tokenize(
|
|
135
|
+
describe("HTML Entities", () => {
|
|
136
|
+
it("should parse named entities", () => {
|
|
137
|
+
const tokens = tokenize("&amp; &lt; &gt; &quot; &nbsp;");
|
|
143
138
|
|
|
144
139
|
expect(tokens[0]?.value).toBe('& < > " \u00A0');
|
|
145
140
|
});
|
|
146
141
|
|
|
147
|
-
it(
|
|
148
|
-
const tokens = tokenize(
|
|
142
|
+
it("should parse numeric entities", () => {
|
|
143
|
+
const tokens = tokenize("&#65; &#66; &#67;");
|
|
149
144
|
|
|
150
|
-
expect(tokens[0]?.value).toBe(
|
|
145
|
+
expect(tokens[0]?.value).toBe("A B C");
|
|
151
146
|
});
|
|
152
147
|
|
|
153
|
-
it(
|
|
154
|
-
const tokens = tokenize(
|
|
148
|
+
it("should parse hexadecimal entities", () => {
|
|
149
|
+
const tokens = tokenize("&#x41; &#x42; &#x43;");
|
|
155
150
|
|
|
156
|
-
expect(tokens[0]?.value).toBe(
|
|
151
|
+
expect(tokens[0]?.value).toBe("A B C");
|
|
157
152
|
});
|
|
158
153
|
|
|
159
|
-
it(
|
|
154
|
+
it("should handle entities in attributes", () => {
|
|
160
155
|
const tokens = tokenize('<div title="&quot;Hello&quot;">');
|
|
161
156
|
|
|
162
157
|
expect(tokens[0]?.attributes!.title).toBe('"Hello"');
|
|
163
158
|
});
|
|
164
159
|
|
|
165
|
-
it(
|
|
166
|
-
const tokens = tokenize(
|
|
160
|
+
it("should handle unknown entities", () => {
|
|
161
|
+
const tokens = tokenize("&unknown;");
|
|
167
162
|
|
|
168
|
-
expect(tokens[0]?.value).toBe(
|
|
163
|
+
expect(tokens[0]?.value).toBe("&unknown;");
|
|
169
164
|
});
|
|
170
165
|
});
|
|
171
166
|
|
|
172
|
-
describe(
|
|
173
|
-
it(
|
|
174
|
-
const tokens = tokenize(
|
|
167
|
+
describe("Comments", () => {
|
|
168
|
+
it("should parse HTML comments", () => {
|
|
169
|
+
const tokens = tokenize("<!-- This is a comment -->");
|
|
175
170
|
|
|
176
171
|
expect(tokens[0]).toEqual({
|
|
177
172
|
type: TokenType.COMMENT,
|
|
178
|
-
value:
|
|
179
|
-
position: expect.any(Object)
|
|
173
|
+
value: " This is a comment ",
|
|
174
|
+
position: expect.any(Object),
|
|
180
175
|
});
|
|
181
176
|
});
|
|
182
177
|
|
|
183
|
-
it(
|
|
184
|
-
const tokens = tokenize(
|
|
178
|
+
it("should handle multiline comments", () => {
|
|
179
|
+
const tokens = tokenize(
|
|
180
|
+
`<!-- \n Multi line\n comment\n -->`,
|
|
181
|
+
);
|
|
185
182
|
|
|
186
183
|
expect(tokens[0]?.type).toBe(TokenType.COMMENT);
|
|
187
|
-
expect(tokens[0]?.value).toContain(
|
|
184
|
+
expect(tokens[0]?.value).toContain("Multi line");
|
|
188
185
|
});
|
|
189
186
|
|
|
190
|
-
it(
|
|
191
|
-
const tokens = tokenize(
|
|
187
|
+
it("should handle empty comments", () => {
|
|
188
|
+
const tokens = tokenize("<!---->");
|
|
192
189
|
|
|
193
190
|
expect(tokens[0]).toEqual({
|
|
194
191
|
type: TokenType.COMMENT,
|
|
195
|
-
value:
|
|
196
|
-
position: expect.any(Object)
|
|
192
|
+
value: "",
|
|
193
|
+
position: expect.any(Object),
|
|
197
194
|
});
|
|
198
195
|
});
|
|
199
196
|
});
|
|
200
197
|
|
|
201
|
-
describe(
|
|
202
|
-
it(
|
|
203
|
-
const tokens = tokenize(
|
|
198
|
+
describe("CDATA Sections (HTML5: treated as bogus comments)", () => {
|
|
199
|
+
it("should parse CDATA sections as bogus comments in HTML5", () => {
|
|
200
|
+
const tokens = tokenize("<![CDATA[Some data]]>");
|
|
204
201
|
|
|
205
202
|
expect(tokens[0]).toEqual({
|
|
206
203
|
type: TokenType.COMMENT,
|
|
207
|
-
value:
|
|
208
|
-
position: expect.any(Object)
|
|
204
|
+
value: "[CDATA[Some data]]",
|
|
205
|
+
position: expect.any(Object),
|
|
209
206
|
});
|
|
210
207
|
});
|
|
211
208
|
|
|
212
|
-
it(
|
|
209
|
+
it("should handle CDATA with special characters as bogus comment", () => {
|
|
213
210
|
const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
|
|
214
211
|
|
|
215
212
|
expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
|
|
216
213
|
});
|
|
217
214
|
});
|
|
218
215
|
|
|
219
|
-
describe(
|
|
220
|
-
it(
|
|
221
|
-
const tokens = tokenize(
|
|
216
|
+
describe("DOCTYPE Declaration", () => {
|
|
217
|
+
it("should parse DOCTYPE declaration", () => {
|
|
218
|
+
const tokens = tokenize("<!DOCTYPE html>");
|
|
222
219
|
|
|
223
220
|
expect(tokens[0]).toEqual({
|
|
224
221
|
type: TokenType.DOCTYPE,
|
|
225
|
-
value:
|
|
226
|
-
position: expect.any(Object)
|
|
222
|
+
value: "html",
|
|
223
|
+
position: expect.any(Object),
|
|
227
224
|
});
|
|
228
225
|
});
|
|
229
226
|
|
|
230
|
-
it(
|
|
231
|
-
const tokens = tokenize(
|
|
227
|
+
it("should parse complex DOCTYPE", () => {
|
|
228
|
+
const tokens = tokenize(
|
|
229
|
+
'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
|
|
230
|
+
);
|
|
232
231
|
|
|
233
232
|
expect(tokens[0]?.type).toBe(TokenType.DOCTYPE);
|
|
234
|
-
expect(tokens[0]?.value).toBe(
|
|
233
|
+
expect(tokens[0]?.value).toBe("html");
|
|
235
234
|
});
|
|
236
235
|
});
|
|
237
236
|
|
|
238
|
-
describe(
|
|
239
|
-
it(
|
|
237
|
+
describe("Processing Instructions (HTML5: treated as bogus comments)", () => {
|
|
238
|
+
it("should parse XML processing instruction as bogus comment", () => {
|
|
240
239
|
const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
|
|
241
240
|
|
|
242
241
|
expect(tokens[0]).toEqual({
|
|
243
242
|
type: TokenType.COMMENT,
|
|
244
243
|
value: '?xml version="1.0" encoding="UTF-8"?',
|
|
245
|
-
position: expect.any(Object)
|
|
244
|
+
position: expect.any(Object),
|
|
246
245
|
});
|
|
247
246
|
});
|
|
248
247
|
|
|
249
|
-
it(
|
|
248
|
+
it("should parse PHP-style processing instruction as bogus comment", () => {
|
|
250
249
|
const tokens = tokenize('<?php echo "Hello"; ?>');
|
|
251
250
|
|
|
252
251
|
expect(tokens[0]?.type).toBe(TokenType.COMMENT);
|
|
@@ -254,8 +253,8 @@ describe('HTML Tokenizer', () => {
|
|
|
254
253
|
});
|
|
255
254
|
});
|
|
256
255
|
|
|
257
|
-
describe(
|
|
258
|
-
it(
|
|
256
|
+
describe("Complex HTML Documents", () => {
|
|
257
|
+
it("should tokenize complete HTML document", () => {
|
|
259
258
|
const html = `<!DOCTYPE html>
|
|
260
259
|
<html lang="en">
|
|
261
260
|
<head>
|
|
@@ -273,12 +272,14 @@ describe('HTML Tokenizer', () => {
|
|
|
273
272
|
expect(tokens[0]?.type).toBe(TokenType.DOCTYPE);
|
|
274
273
|
expect(tokens[tokens?.length - 1]?.type).toBe(TokenType.EOF);
|
|
275
274
|
|
|
276
|
-
const htmlTag = tokens.find(
|
|
275
|
+
const htmlTag = tokens.find(
|
|
276
|
+
(t) => t.type === TokenType.TAG_OPEN && t.value === "html",
|
|
277
|
+
);
|
|
277
278
|
expect(htmlTag).toBeDefined();
|
|
278
|
-
expect(htmlTag!.attributes!.lang).toBe(
|
|
279
|
+
expect(htmlTag!.attributes!.lang).toBe("en");
|
|
279
280
|
});
|
|
280
281
|
|
|
281
|
-
it(
|
|
282
|
+
it("should handle mixed content", () => {
|
|
282
283
|
const html = `<div>
|
|
283
284
|
Text before <!-- comment -->
|
|
284
285
|
<span>nested</span>
|
|
@@ -287,128 +288,134 @@ describe('HTML Tokenizer', () => {
|
|
|
287
288
|
|
|
288
289
|
const tokens = tokenize(html);
|
|
289
290
|
|
|
290
|
-
expect(tokens.some(t => t.type === TokenType.TAG_OPEN)).toBe(true);
|
|
291
|
-
expect(tokens.some(t => t.type === TokenType.TEXT)).toBe(true);
|
|
292
|
-
expect(tokens.some(t => t.type === TokenType.COMMENT)).toBe(true);
|
|
291
|
+
expect(tokens.some((t) => t.type === TokenType.TAG_OPEN)).toBe(true);
|
|
292
|
+
expect(tokens.some((t) => t.type === TokenType.TEXT)).toBe(true);
|
|
293
|
+
expect(tokens.some((t) => t.type === TokenType.COMMENT)).toBe(true);
|
|
293
294
|
});
|
|
294
295
|
});
|
|
295
296
|
|
|
296
|
-
describe(
|
|
297
|
-
it(
|
|
298
|
-
const tokens = tokenize(
|
|
297
|
+
describe("Edge Cases", () => {
|
|
298
|
+
it("should handle empty input", () => {
|
|
299
|
+
const tokens = tokenize("");
|
|
299
300
|
|
|
300
301
|
expect(tokens).toHaveLength(1);
|
|
301
302
|
expect(tokens[0]?.type).toBe(TokenType.EOF);
|
|
302
303
|
});
|
|
303
304
|
|
|
304
|
-
it(
|
|
305
|
-
const tokens = tokenize(
|
|
305
|
+
it("should handle whitespace only", () => {
|
|
306
|
+
const tokens = tokenize(" \n\t ");
|
|
306
307
|
|
|
307
308
|
expect(tokens).toHaveLength(2);
|
|
308
309
|
expect(tokens[0]?.type).toBe(TokenType.TEXT);
|
|
309
|
-
expect(tokens[0]?.value).toBe(
|
|
310
|
+
expect(tokens[0]?.value).toBe(" \n\t ");
|
|
310
311
|
});
|
|
311
312
|
|
|
312
|
-
it(
|
|
313
|
+
it("should handle malformed tags", () => {
|
|
313
314
|
const tokens = tokenize('<div class="test>');
|
|
314
315
|
|
|
315
316
|
expect(tokens[0]?.type).toBe(TokenType.TAG_OPEN);
|
|
316
|
-
expect(tokens[0]?.value).toBe(
|
|
317
|
+
expect(tokens[0]?.value).toBe("div");
|
|
317
318
|
});
|
|
318
319
|
|
|
319
|
-
it(
|
|
320
|
-
const tokens = tokenize(
|
|
320
|
+
it("should handle unclosed comments", () => {
|
|
321
|
+
const tokens = tokenize("<!-- unclosed comment");
|
|
321
322
|
|
|
322
323
|
expect(tokens[0]?.type).toBe(TokenType.COMMENT);
|
|
323
|
-
expect(tokens[0]?.value).toBe(
|
|
324
|
+
expect(tokens[0]?.value).toBe(" unclosed comment");
|
|
324
325
|
});
|
|
325
326
|
});
|
|
326
327
|
|
|
327
|
-
describe(
|
|
328
|
-
it(
|
|
328
|
+
describe("Advanced Edge Cases", () => {
|
|
329
|
+
it("should handle attributes with no spaces", () => {
|
|
329
330
|
const tokens = tokenize('<div class="test"id="main"data-value="123">');
|
|
330
331
|
expect(tokens.length).toBeGreaterThan(0);
|
|
331
332
|
const tag = tokens[0]!;
|
|
332
333
|
|
|
333
334
|
expect(tag.attributes).toEqual({
|
|
334
|
-
class:
|
|
335
|
-
id:
|
|
336
|
-
|
|
335
|
+
class: "test",
|
|
336
|
+
id: "main",
|
|
337
|
+
"data-value": "123",
|
|
337
338
|
});
|
|
338
339
|
});
|
|
339
340
|
|
|
340
|
-
it(
|
|
341
|
+
it("should handle attributes with excessive spaces", () => {
|
|
341
342
|
const tokens = tokenize('<div class = "test" id = "main" >');
|
|
342
343
|
expect(tokens.length).toBeGreaterThan(0);
|
|
343
344
|
const tag = tokens[0]!;
|
|
344
345
|
|
|
345
346
|
expect(tag.attributes).toEqual({
|
|
346
|
-
class:
|
|
347
|
-
id:
|
|
347
|
+
class: "test",
|
|
348
|
+
id: "main",
|
|
348
349
|
});
|
|
349
350
|
});
|
|
350
351
|
|
|
351
|
-
it(
|
|
352
|
-
const tokens = tokenize(
|
|
352
|
+
it("should handle mixed quote styles in same tag", () => {
|
|
353
|
+
const tokens = tokenize(
|
|
354
|
+
`<div class='single' id="double" data-test='mix "quoted" content'>`,
|
|
355
|
+
);
|
|
353
356
|
expect(tokens.length).toBeGreaterThan(0);
|
|
354
357
|
const tag = tokens[0]!;
|
|
355
358
|
|
|
356
|
-
expect(tag.attributes!.class).toBe(
|
|
357
|
-
expect(tag.attributes!.id).toBe(
|
|
358
|
-
expect(tag.attributes![
|
|
359
|
+
expect(tag.attributes!.class).toBe("single");
|
|
360
|
+
expect(tag.attributes!.id).toBe("double");
|
|
361
|
+
expect(tag.attributes!["data-test"]).toBe('mix "quoted" content');
|
|
359
362
|
});
|
|
360
363
|
|
|
361
|
-
it(
|
|
364
|
+
it("should handle malformed quotes gracefully", () => {
|
|
362
365
|
const tokens = tokenize('<div class="unclosed id="test">');
|
|
363
366
|
expect(tokens.length).toBeGreaterThan(0);
|
|
364
367
|
const tag = tokens[0]!;
|
|
365
368
|
|
|
366
369
|
expect(tag.type).toBe(TokenType.TAG_OPEN);
|
|
367
|
-
expect(tag.value).toBe(
|
|
370
|
+
expect(tag.value).toBe("div");
|
|
368
371
|
expect(tag.attributes).toBeDefined();
|
|
369
372
|
});
|
|
370
373
|
|
|
371
|
-
it(
|
|
372
|
-
const tokens = tokenize(
|
|
374
|
+
it("should handle empty tag names", () => {
|
|
375
|
+
const tokens = tokenize("<>content</>");
|
|
373
376
|
|
|
374
377
|
expect(tokens.length).toBeGreaterThan(0);
|
|
375
378
|
});
|
|
376
379
|
|
|
377
|
-
it(
|
|
380
|
+
it("should handle tags with numbers and special characters", () => {
|
|
378
381
|
const tokens = tokenize('<h1 class="heading-1" data-level="1">');
|
|
379
382
|
expect(tokens.length).toBeGreaterThan(0);
|
|
380
383
|
const tag = tokens[0]!;
|
|
381
384
|
|
|
382
|
-
expect(tag.value).toBe(
|
|
385
|
+
expect(tag.value).toBe("h1");
|
|
383
386
|
expect(tag.attributes).toEqual({
|
|
384
|
-
class:
|
|
385
|
-
|
|
387
|
+
class: "heading-1",
|
|
388
|
+
"data-level": "1",
|
|
386
389
|
});
|
|
387
390
|
});
|
|
388
391
|
|
|
389
|
-
it(
|
|
390
|
-
const longValue =
|
|
392
|
+
it("should handle extremely long attribute values", () => {
|
|
393
|
+
const longValue = "a".repeat(10000);
|
|
391
394
|
const tokens = tokenize(`<div data-long="${longValue}">`);
|
|
392
395
|
expect(tokens.length).toBeGreaterThan(0);
|
|
393
396
|
const tag = tokens[0]!;
|
|
394
397
|
|
|
395
|
-
expect(tag.attributes![
|
|
398
|
+
expect(tag.attributes!["data-long"]).toBe(longValue);
|
|
396
399
|
});
|
|
397
400
|
|
|
398
|
-
it(
|
|
399
|
-
const tokens = tokenize(
|
|
401
|
+
it("should handle unicode characters in attributes", () => {
|
|
402
|
+
const tokens = tokenize(
|
|
403
|
+
'<div title="测试" data-emoji="🚀" class="café">',
|
|
404
|
+
);
|
|
400
405
|
expect(tokens.length).toBeGreaterThan(0);
|
|
401
406
|
const tag = tokens[0]!;
|
|
402
407
|
|
|
403
408
|
expect(tag.attributes).toEqual({
|
|
404
|
-
title:
|
|
405
|
-
|
|
406
|
-
class:
|
|
409
|
+
title: "测试",
|
|
410
|
+
"data-emoji": "🚀",
|
|
411
|
+
class: "café",
|
|
407
412
|
});
|
|
408
413
|
});
|
|
409
414
|
|
|
410
|
-
it(
|
|
411
|
-
const tokens = tokenize(
|
|
415
|
+
it("should handle nested quotes in attributes", () => {
|
|
416
|
+
const tokens = tokenize(
|
|
417
|
+
`<div onclick="alert('Hello')" title='She said "hi"'>`,
|
|
418
|
+
);
|
|
412
419
|
expect(tokens.length).toBeGreaterThan(0);
|
|
413
420
|
const tag = tokens[0]!;
|
|
414
421
|
|
|
@@ -416,20 +423,22 @@ describe('HTML Tokenizer', () => {
|
|
|
416
423
|
expect(tag.attributes!.title).toBe('She said "hi"');
|
|
417
424
|
});
|
|
418
425
|
|
|
419
|
-
it(
|
|
420
|
-
const tokens = tokenize(
|
|
426
|
+
it("should handle attributes without values", () => {
|
|
427
|
+
const tokens = tokenize(
|
|
428
|
+
'<input type="checkbox" checked disabled readonly>',
|
|
429
|
+
);
|
|
421
430
|
expect(tokens.length).toBeGreaterThan(0);
|
|
422
431
|
const tag = tokens[0]!;
|
|
423
432
|
|
|
424
433
|
expect(tag.attributes).toEqual({
|
|
425
|
-
type:
|
|
426
|
-
checked:
|
|
427
|
-
disabled:
|
|
428
|
-
readonly:
|
|
434
|
+
type: "checkbox",
|
|
435
|
+
checked: "",
|
|
436
|
+
disabled: "",
|
|
437
|
+
readonly: "",
|
|
429
438
|
});
|
|
430
439
|
});
|
|
431
440
|
|
|
432
|
-
it(
|
|
441
|
+
it("should handle CDATA as bogus comment with complex content", () => {
|
|
433
442
|
const complexContent = `
|
|
434
443
|
function it() {
|
|
435
444
|
return "<div>HTML inside JS</div>";
|
|
@@ -441,18 +450,21 @@ describe('HTML Tokenizer', () => {
|
|
|
441
450
|
const cdataToken = tokens[0]!;
|
|
442
451
|
|
|
443
452
|
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
444
|
-
expect(cdataToken.value).toBe(
|
|
453
|
+
expect(cdataToken.value).toBe("[CDATA[" + complexContent + "]]");
|
|
445
454
|
});
|
|
446
455
|
|
|
447
|
-
it(
|
|
456
|
+
it("should handle processing instructions as bogus comments", () => {
|
|
448
457
|
const tests = [
|
|
449
|
-
{ input: '<?xml version="1.0" encoding="UTF-8"?>', expected:
|
|
450
|
-
{
|
|
451
|
-
|
|
452
|
-
|
|
458
|
+
{ input: '<?xml version="1.0" encoding="UTF-8"?>', expected: "xml" },
|
|
459
|
+
{
|
|
460
|
+
input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>',
|
|
461
|
+
expected: "xml",
|
|
462
|
+
},
|
|
463
|
+
{ input: '<?php echo "Hello World"; ?>', expected: "php" },
|
|
464
|
+
{ input: '<?python print("Hello") ?>', expected: "python" },
|
|
453
465
|
];
|
|
454
466
|
|
|
455
|
-
tests.forEach(test => {
|
|
467
|
+
tests.forEach((test) => {
|
|
456
468
|
const tokens = tokenize(test.input);
|
|
457
469
|
const piToken = tokens[0]!;
|
|
458
470
|
|
|
@@ -461,16 +473,16 @@ describe('HTML Tokenizer', () => {
|
|
|
461
473
|
});
|
|
462
474
|
});
|
|
463
475
|
|
|
464
|
-
it(
|
|
476
|
+
it("should handle comments with special content", () => {
|
|
465
477
|
const specialComments = [
|
|
466
|
-
|
|
478
|
+
"<!-- TODO: Fix this -->",
|
|
467
479
|
'<!-- <script>alert("xss")</script> -->',
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
480
|
+
"<!-- Multi\nline\ncomment -->",
|
|
481
|
+
"<!-- Comment with -- inside -->",
|
|
482
|
+
"<!--[if IE]><![endif]-->",
|
|
471
483
|
];
|
|
472
484
|
|
|
473
|
-
specialComments.forEach(comment => {
|
|
485
|
+
specialComments.forEach((comment) => {
|
|
474
486
|
const tokens = tokenize(comment);
|
|
475
487
|
const commentToken = tokens[0]!;
|
|
476
488
|
|
|
@@ -478,7 +490,7 @@ describe('HTML Tokenizer', () => {
|
|
|
478
490
|
});
|
|
479
491
|
});
|
|
480
492
|
|
|
481
|
-
it(
|
|
493
|
+
it("should handle mixed content with all token types (HTML5 mode)", () => {
|
|
482
494
|
const html = `
|
|
483
495
|
<!DOCTYPE html>
|
|
484
496
|
<!-- Main document -->
|
|
@@ -503,10 +515,10 @@ describe('HTML Tokenizer', () => {
|
|
|
503
515
|
[TokenType.TAG_OPEN]: 0,
|
|
504
516
|
[TokenType.TAG_CLOSE]: 0,
|
|
505
517
|
[TokenType.TEXT]: 0,
|
|
506
|
-
[TokenType.EOF]: 0
|
|
518
|
+
[TokenType.EOF]: 0,
|
|
507
519
|
};
|
|
508
520
|
|
|
509
|
-
tokens.forEach(token => {
|
|
521
|
+
tokens.forEach((token) => {
|
|
510
522
|
if (token.type in tokenCounts) {
|
|
511
523
|
tokenCounts[token.type]++;
|
|
512
524
|
}
|
|
@@ -519,16 +531,15 @@ describe('HTML Tokenizer', () => {
|
|
|
519
531
|
expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
|
|
520
532
|
expect(tokenCounts[TokenType.EOF]).toBe(1);
|
|
521
533
|
});
|
|
522
|
-
})
|
|
523
|
-
|
|
524
|
-
describe('Performance and Stress Tests', () => {
|
|
525
|
-
it('should handle very large documents', () => {
|
|
534
|
+
});
|
|
526
535
|
|
|
527
|
-
|
|
536
|
+
describe("Performance and Stress Tests", () => {
|
|
537
|
+
it("should handle very large documents", () => {
|
|
538
|
+
let html = "<div>";
|
|
528
539
|
for (let i = 0; i < 1000; i++) {
|
|
529
540
|
html += `<p id="para-${i}" class="paragraph">Paragraph ${i} content</p>`;
|
|
530
541
|
}
|
|
531
|
-
html +=
|
|
542
|
+
html += "</div>";
|
|
532
543
|
|
|
533
544
|
const startTime = Date.now();
|
|
534
545
|
const tokens = tokenize(html);
|
|
@@ -538,16 +549,16 @@ describe('HTML Tokenizer', () => {
|
|
|
538
549
|
expect(endTime - startTime).toBeLessThan(1000);
|
|
539
550
|
});
|
|
540
551
|
|
|
541
|
-
it(
|
|
542
|
-
let html =
|
|
552
|
+
it("should handle deeply nested structures", () => {
|
|
553
|
+
let html = "";
|
|
543
554
|
const depth = 100;
|
|
544
555
|
|
|
545
556
|
for (let i = 0; i < depth; i++) {
|
|
546
557
|
html += `<div level="${i}">`;
|
|
547
558
|
}
|
|
548
|
-
html +=
|
|
559
|
+
html += "Content";
|
|
549
560
|
for (let i = 0; i < depth; i++) {
|
|
550
|
-
html +=
|
|
561
|
+
html += "</div>";
|
|
551
562
|
}
|
|
552
563
|
|
|
553
564
|
const tokens = tokenize(html);
|
|
@@ -555,23 +566,23 @@ describe('HTML Tokenizer', () => {
|
|
|
555
566
|
expect(tokens.length).toBe(depth * 2 + 2);
|
|
556
567
|
});
|
|
557
568
|
|
|
558
|
-
it(
|
|
559
|
-
let html =
|
|
569
|
+
it("should handle many attributes per element", () => {
|
|
570
|
+
let html = "<div";
|
|
560
571
|
for (let i = 0; i < 100; i++) {
|
|
561
572
|
html += ` attr-${i}="value-${i}"`;
|
|
562
573
|
}
|
|
563
|
-
html +=
|
|
574
|
+
html += ">";
|
|
564
575
|
|
|
565
576
|
const tokens = tokenize(html);
|
|
566
577
|
const divTag = tokens[0]!;
|
|
567
578
|
|
|
568
579
|
expect(Object.keys(divTag.attributes!).length).toBe(100);
|
|
569
|
-
expect(divTag.attributes![
|
|
580
|
+
expect(divTag.attributes!["attr-50"]).toBe("value-50");
|
|
570
581
|
});
|
|
571
|
-
})
|
|
582
|
+
});
|
|
572
583
|
|
|
573
|
-
describe(
|
|
574
|
-
it(
|
|
584
|
+
describe("Real-world Scenarios", () => {
|
|
585
|
+
it("should handle SVG elements", () => {
|
|
575
586
|
const svg = `
|
|
576
587
|
<svg width="100" height="100" xmlns="http://www.w3.org/2000/svg">
|
|
577
588
|
<circle cx="50" cy="50" r="40" stroke="black" stroke-width="3" fill="red"/>
|
|
@@ -581,15 +592,15 @@ describe('HTML Tokenizer', () => {
|
|
|
581
592
|
|
|
582
593
|
const tokens = tokenize(svg);
|
|
583
594
|
|
|
584
|
-
const svgTag = tokens.find(token => token.value ===
|
|
585
|
-
expect(svgTag.attributes!.xmlns).toBe(
|
|
595
|
+
const svgTag = tokens.find((token) => token.value === "svg")!;
|
|
596
|
+
expect(svgTag.attributes!.xmlns).toBe("http://www.w3.org/2000/svg");
|
|
586
597
|
|
|
587
|
-
const circleTag = tokens.find(token => token.value ===
|
|
598
|
+
const circleTag = tokens.find((token) => token.value === "circle")!;
|
|
588
599
|
expect(circleTag.isSelfClosing).toBe(true);
|
|
589
|
-
expect(circleTag.attributes!.fill).toBe(
|
|
600
|
+
expect(circleTag.attributes!.fill).toBe("red");
|
|
590
601
|
});
|
|
591
602
|
|
|
592
|
-
it(
|
|
603
|
+
it("should handle script and style tags", () => {
|
|
593
604
|
const html = `
|
|
594
605
|
<script type="text/javascript">
|
|
595
606
|
function hello() {
|
|
@@ -604,14 +615,14 @@ describe('HTML Tokenizer', () => {
|
|
|
604
615
|
|
|
605
616
|
const tokens = tokenize(html);
|
|
606
617
|
|
|
607
|
-
const scriptTag = tokens.find(token => token.value ===
|
|
608
|
-
const styleTag = tokens.find(token => token.value ===
|
|
618
|
+
const scriptTag = tokens.find((token) => token.value === "script")!;
|
|
619
|
+
const styleTag = tokens.find((token) => token.value === "style")!;
|
|
609
620
|
|
|
610
|
-
expect(scriptTag.attributes!.type).toBe(
|
|
611
|
-
expect(styleTag.attributes!.type).toBe(
|
|
621
|
+
expect(scriptTag.attributes!.type).toBe("text/javascript");
|
|
622
|
+
expect(styleTag.attributes!.type).toBe("text/css");
|
|
612
623
|
});
|
|
613
624
|
|
|
614
|
-
it(
|
|
625
|
+
it("should handle form elements with complex attributes", () => {
|
|
615
626
|
const html = `
|
|
616
627
|
<form method="POST" action="/submit" enctype="multipart/form-data">
|
|
617
628
|
<input type="email" name="email" required pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$" title="Please enter a valid email">
|
|
@@ -624,17 +635,17 @@ describe('HTML Tokenizer', () => {
|
|
|
624
635
|
|
|
625
636
|
const tokens = tokenize(html);
|
|
626
637
|
|
|
627
|
-
const inputTag = tokens.find(token => token.value ===
|
|
628
|
-
expect(inputTag.attributes!.pattern).toContain(
|
|
629
|
-
expect(inputTag.attributes!.required).toBe(
|
|
638
|
+
const inputTag = tokens.find((token) => token.value === "input")!;
|
|
639
|
+
expect(inputTag.attributes!.pattern).toContain("@");
|
|
640
|
+
expect(inputTag.attributes!.required).toBe("");
|
|
630
641
|
|
|
631
|
-
const selectTag = tokens.find(token => token.value ===
|
|
632
|
-
expect(selectTag.attributes!.multiple).toBe(
|
|
642
|
+
const selectTag = tokens.find((token) => token.value === "select")!;
|
|
643
|
+
expect(selectTag.attributes!.multiple).toBe("");
|
|
633
644
|
});
|
|
634
|
-
})
|
|
645
|
+
});
|
|
635
646
|
|
|
636
|
-
describe(
|
|
637
|
-
it(
|
|
647
|
+
describe("Error Recovery", () => {
|
|
648
|
+
it("should handle incomplete tags gracefully", () => {
|
|
638
649
|
const malformedHTML = '<div class="test><p>Content</p>';
|
|
639
650
|
const tokens = tokenize(malformedHTML);
|
|
640
651
|
|
|
@@ -642,104 +653,105 @@ describe('HTML Tokenizer', () => {
|
|
|
642
653
|
expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
|
|
643
654
|
});
|
|
644
655
|
|
|
645
|
-
it(
|
|
656
|
+
it("should handle unmatched quotes in attributes", () => {
|
|
646
657
|
const html = '<div class="test id=\'main">Content</div>';
|
|
647
658
|
const tokens = tokenize(html);
|
|
648
659
|
|
|
649
|
-
const divTag = tokens.find(token => token.value ===
|
|
660
|
+
const divTag = tokens.find((token) => token.value === "div")!;
|
|
650
661
|
expect(divTag).toBeDefined();
|
|
651
662
|
});
|
|
652
663
|
|
|
653
|
-
it(
|
|
654
|
-
const html =
|
|
664
|
+
it("should continue parsing after errors", () => {
|
|
665
|
+
const html = "<div><p>Valid paragraph</p><span>Valid span</span>";
|
|
655
666
|
const tokens = tokenize(html);
|
|
656
667
|
|
|
657
|
-
const hasValidElements =
|
|
658
|
-
tokens.some(token => token.value ===
|
|
668
|
+
const hasValidElements =
|
|
669
|
+
tokens.some((token) => token.value === "p") ||
|
|
670
|
+
tokens.some((token) => token.value === "span");
|
|
659
671
|
expect(hasValidElements).toBe(true);
|
|
660
672
|
});
|
|
661
673
|
|
|
662
|
-
it(
|
|
663
|
-
const html =
|
|
674
|
+
it("should handle empty angle brackets <>", () => {
|
|
675
|
+
const html = "<>text<div>content</div>";
|
|
664
676
|
const tokens = tokenize(html);
|
|
665
677
|
|
|
666
678
|
// Should skip the invalid <> and continue parsing
|
|
667
679
|
expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
|
|
668
|
-
const divToken = tokens.find(t => t.value ===
|
|
680
|
+
const divToken = tokens.find((t) => t.value === "div");
|
|
669
681
|
expect(divToken).toBeDefined();
|
|
670
682
|
});
|
|
671
683
|
|
|
672
|
-
it(
|
|
673
|
-
const html =
|
|
684
|
+
it("should handle angle bracket with only space < >", () => {
|
|
685
|
+
const html = "< >text<p>paragraph</p>";
|
|
674
686
|
const tokens = tokenize(html);
|
|
675
687
|
|
|
676
688
|
expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
|
|
677
|
-
const pToken = tokens.find(t => t.value ===
|
|
689
|
+
const pToken = tokens.find((t) => t.value === "p");
|
|
678
690
|
expect(pToken).toBeDefined();
|
|
679
691
|
});
|
|
680
692
|
|
|
681
|
-
it(
|
|
682
|
-
const html =
|
|
693
|
+
it("should handle tag with no valid name", () => {
|
|
694
|
+
const html = "<123>text</123><div>ok</div>";
|
|
683
695
|
const tokens = tokenize(html);
|
|
684
696
|
|
|
685
697
|
// Tags starting with numbers are invalid, should be treated as text
|
|
686
698
|
expect(tokens[tokens.length - 1]!.type).toBe(TokenType.EOF);
|
|
687
|
-
const divToken = tokens.find(t => t.value ===
|
|
699
|
+
const divToken = tokens.find((t) => t.value === "div");
|
|
688
700
|
expect(divToken).toBeDefined();
|
|
689
701
|
});
|
|
690
702
|
});
|
|
691
703
|
|
|
692
|
-
describe(
|
|
693
|
-
it(
|
|
704
|
+
describe("Entity Edge Cases", () => {
|
|
705
|
+
it("should handle entity without semicolon with valid prefix", () => {
|
|
694
706
|
// &nbsp followed by other text (no semicolon) should decode &nbsp
|
|
695
|
-
const tokens = tokenize(
|
|
696
|
-
|
|
697
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
707
|
+
const tokens = tokenize("<div>&nbsptext</div>");
|
|
708
|
+
|
|
709
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
698
710
|
expect(textToken).toBeDefined();
|
|
699
711
|
// Should decode &nbsp (non-breaking space) and keep "text"
|
|
700
|
-
expect(textToken!.value).toContain(
|
|
712
|
+
expect(textToken!.value).toContain("text");
|
|
701
713
|
});
|
|
702
714
|
|
|
703
|
-
it(
|
|
704
|
-
const tokens = tokenize(
|
|
705
|
-
|
|
706
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
715
|
+
it("should handle entity without semicolon - lt prefix", () => {
|
|
716
|
+
const tokens = tokenize("<div>&ltvalue</div>");
|
|
717
|
+
|
|
718
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
707
719
|
expect(textToken).toBeDefined();
|
|
708
|
-
expect(textToken!.value).toBe(
|
|
720
|
+
expect(textToken!.value).toBe("<value");
|
|
709
721
|
});
|
|
710
722
|
|
|
711
|
-
it(
|
|
712
|
-
const tokens = tokenize(
|
|
713
|
-
|
|
714
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
723
|
+
it("should handle entity without semicolon - gt prefix", () => {
|
|
724
|
+
const tokens = tokenize("<div>&gtvalue</div>");
|
|
725
|
+
|
|
726
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
715
727
|
expect(textToken).toBeDefined();
|
|
716
|
-
expect(textToken!.value).toBe(
|
|
728
|
+
expect(textToken!.value).toBe(">value");
|
|
717
729
|
});
|
|
718
730
|
|
|
719
|
-
it(
|
|
720
|
-
const tokens = tokenize(
|
|
721
|
-
|
|
722
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
731
|
+
it("should handle entity without semicolon - amp prefix", () => {
|
|
732
|
+
const tokens = tokenize("<div>&ampvalue</div>");
|
|
733
|
+
|
|
734
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
723
735
|
expect(textToken).toBeDefined();
|
|
724
|
-
expect(textToken!.value).toBe(
|
|
736
|
+
expect(textToken!.value).toBe("&value");
|
|
725
737
|
});
|
|
726
738
|
|
|
727
|
-
it(
|
|
728
|
-
const tokens = tokenize(
|
|
729
|
-
|
|
730
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
739
|
+
it("should handle unknown entity gracefully", () => {
|
|
740
|
+
const tokens = tokenize("<div>&unknownentity;</div>");
|
|
741
|
+
|
|
742
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
731
743
|
expect(textToken).toBeDefined();
|
|
732
744
|
// Unknown entity should be kept as-is
|
|
733
|
-
expect(textToken!.value).toBe(
|
|
745
|
+
expect(textToken!.value).toBe("&unknownentity;");
|
|
734
746
|
});
|
|
735
747
|
|
|
736
|
-
it(
|
|
737
|
-
const tokens = tokenize(
|
|
738
|
-
|
|
739
|
-
const textToken = tokens.find(t => t.type === TokenType.TEXT);
|
|
748
|
+
it("should handle partial entity name with no matching prefix", () => {
|
|
749
|
+
const tokens = tokenize("<div>&xyz</div>");
|
|
750
|
+
|
|
751
|
+
const textToken = tokens.find((t) => t.type === TokenType.TEXT);
|
|
740
752
|
expect(textToken).toBeDefined();
|
|
741
753
|
// No valid entity prefix, keep as-is
|
|
742
|
-
expect(textToken!.value).toBe(
|
|
754
|
+
expect(textToken!.value).toBe("&xyz");
|
|
743
755
|
});
|
|
744
|
-
})
|
|
756
|
+
});
|
|
745
757
|
});
|