@tkeron/html-parser 0.1.7 → 1.1.0
- package/README.md +1 -7
- package/bun.lock +5 -0
- package/index.ts +4 -0
- package/package.json +7 -1
- package/src/css-selector.ts +1 -1
- package/src/dom-simulator.ts +41 -17
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +509 -143
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +190 -118
- package/tests/advanced.test.ts +121 -108
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +9 -10
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +60 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +173 -193
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +25 -32
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +32 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/custom-elements.test.ts +0 -745
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/tokenizer.ts
CHANGED
@@ -24,69 +24,132 @@ export interface Token {
   isClosing?: boolean;
 }
 
-const HTML_ENTITIES: Record<string, string> = {
-  [two entries illegible in the source diff view]
-  '&gt;': '>',
-  '&quot;': '"',
-  '&#39;': "'",
-  '&nbsp;': '\u00A0',
-  '&copy;': '©',
-  '&reg;': '®',
-  '&trade;': '™',
-  '&hellip;': '…',
-  '&mdash;': '—',
-  '&ndash;': '–',
-  '&lsquo;': '\u2018',
-  '&rsquo;': '\u2019',
-  '&ldquo;': '\u201C',
-  '&rdquo;': '\u201D',
-  '&not;': '¬'
-};
+import { allNamedEntities } from 'all-named-html-entities';
+
+const HTML_ENTITIES: Record<string, string> = allNamedEntities;
 
 function decodeEntities(text: string): string {
-  let result = … [line truncated in the source diff view]
-  if (… [line truncated in the source diff view]
-  [remaining lines of the previous implementation are illegible in the source diff view]
+  let result = '';
+  let i = 0;
+  while (i < text.length) {
+    if (text[i] === '&') {
+      let match = '';
+      let j = i + 1;
+      if (text[j] === '#') {
+        j++;
+        if (text[j] === 'x' || text[j] === 'X') {
+          j++;
+          while (j < text.length && /[0-9a-fA-F]/.test(text[j])) {
+            j++;
+          }
+        } else {
+          while (j < text.length && /[0-9]/.test(text[j])) {
+            j++;
+          }
+        }
+        if (text[j] === ';') {
+          j++;
+        }
+        match = text.substring(i, j);
+        const entity = match;
+        if (entity.startsWith('&#x') && entity.endsWith(';')) {
+          const hex = entity.slice(3, -1);
+          result += String.fromCharCode(parseInt(hex, 16));
+          i = j;
+          continue;
+        } else if (entity.startsWith('&#') && entity.endsWith(';')) {
+          const decimal = entity.slice(2, -1);
+          result += String.fromCharCode(parseInt(decimal, 10));
+          i = j;
+          continue;
+        }
+      } else {
+        while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) {
+          j++;
+        }
+        const hasSemi = text[j] === ';';
+        if (hasSemi) {
+          j++;
+        }
+        match = text.substring(i, j);
+        const named = match.slice(1, hasSemi ? -1 : undefined);
+        if (HTML_ENTITIES[named]) {
+          if (hasSemi || (j < text.length && !/[a-zA-Z0-9]/.test(text[j]))) {
+            result += HTML_ENTITIES[named];
+            i = j;
+            continue;
           }
         }
       }
-
-
+      result += text[i];
+      i++;
+    } else {
+      result += text[i];
+      i++;
     }
-
-
+  }
+  return result.replace(/\u0000/g, '\uFFFD');
 }
 
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
+  let i = 0;
 
-  [previous implementation illegible in the source diff view]
+  while (i < attributeString.length) {
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+    if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
+      break;
+    }
+
+    let name = '';
+    while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
+      name += attributeString[i];
+      i++;
+    }
+
+    if (!name) {
+      i++;
+      continue;
+    }
+
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+
+    let value = '';
+    if (i < attributeString.length && attributeString[i] === '=') {
+      i++;
+      while (i < attributeString.length && /\s/.test(attributeString[i])) {
+        i++;
+      }
+
+      if (i < attributeString.length) {
+        if (attributeString[i] === '"') {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== '"') {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else if (attributeString[i] === "'") {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== "'") {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else {
+          while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
+            value += attributeString[i];
+            i++;
+          }
+        }
+      }
     }
+
+    attributes[name.toLowerCase()] = decodeEntities(value);
   }
 
   return attributes;
@@ -101,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
   };
 }
 
+const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
+const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
+
 export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
-  let position = 0;
-
-  const specialCases = [
-    {
-      pattern: /<!DOCTYPE\s+[^>]*>/gi,
-      type: TokenType.DOCTYPE,
-      getValue: (match: string) => {
-        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
-        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
-      }
-    },
-    {
-      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
-      type: TokenType.COMMENT,
-      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
-    },
-    {
-      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
-      type: TokenType.CDATA,
-      getValue: (match: string) => match.slice(9, -3)
-    },
-    {
-      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
-      type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
-    }
-  ];
-
-  const processedRanges: Array<[number, number]> = [];
-
-  for (const { pattern, type, getValue } of specialCases) {
-    const regex = new RegExp(pattern);
-    let match;
-
-    while ((match = regex.exec(html)) !== null) {
-      const start = match.index;
-      const end = start + match[0].length;
-
-      tokens.push({
-        type,
-        value: getValue(match[0]),
-        position: calculatePosition(html, start)
-      });
-
-      processedRanges.push([start, end]);
-    }
-  }
-
-  processedRanges.sort((a, b) => a[0] - b[0]);
-
   let currentPos = 0;
 
   while (currentPos < html.length) {
-    const inProcessedRange = processedRanges.some(([start, end]) =>
-      currentPos >= start && currentPos < end
-    );
-
-    if (inProcessedRange) {
-      const range = processedRanges.find(([start, end]) =>
-        currentPos >= start && currentPos < end
-      );
-      if (range) {
-        currentPos = range[1];
-      }
-      continue;
-    }
-
     const char = html[currentPos];
 
     if (char === '<') {
-      const … [line truncated in the source diff view]
+      const remaining = html.slice(currentPos);
+
+      const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
+      if (doctypeMatch) {
+        const match = doctypeMatch[0];
+        const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+        tokens.push({
+          type: TokenType.DOCTYPE,
+          value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
+      if (commentMatch) {
+        const match = commentMatch[0];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
+      if (cdataMatch) {
+        const content = cdataMatch[1];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '[CDATA[' + content + ']]',
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += cdataMatch[0].length;
+        continue;
+      }
+
+      const piMatch = remaining.match(/^<\?([^>]*)/);
+      if (piMatch) {
+        let consumed = piMatch[0].length;
+        if (remaining[consumed] === '>') {
+          consumed++;
+        }
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '?' + piMatch[1],
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += consumed;
+        continue;
+      }
+
+      const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
 
       if (tagMatch) {
         const fullTag = tagMatch[0];
@@ -206,6 +262,24 @@ export function tokenize(html: string): Token[] {
         });
 
         currentPos += fullTag.length;
+
+        if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
+          const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
+          const restOfHtml = html.slice(currentPos);
+          const closeMatch = restOfHtml.match(closeTagPattern);
+
+          if (closeMatch && closeMatch.index !== undefined) {
+            const rawContent = restOfHtml.slice(0, closeMatch.index);
+            if (rawContent) {
+              tokens.push({
+                type: TokenType.TEXT,
+                value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
+                position: calculatePosition(html, currentPos)
+              });
+            }
+            currentPos += rawContent.length;
+          }
+        }
       } else {
         const textStart = currentPos;
         currentPos++;
@@ -241,8 +315,6 @@ export function tokenize(html: string): Token[] {
     }
   }
 
-  tokens.sort((a, b) => a.position.offset - b.position.offset);
-
   tokens.push({
     type: TokenType.EOF,
     value: '',