@tkeron/html-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts ADDED
@@ -0,0 +1,355 @@
1
+ import type { Token } from './tokenizer.js';
2
+ import { TokenType } from './tokenizer.js';
3
+
4
/**
 * A single node of the tree built by the parser.
 *
 * Which optional fields are populated depends on how the node was created:
 * element nodes are built with `tagName`, `attributes`, `children` and
 * `isSelfClosing`; text, comment, CDATA, doctype and processing-instruction
 * nodes are built with `content` instead.
 */
export interface ASTNode {
  /** Kind of node. */
  type: ASTNodeType;

  /** Lower-cased tag name for elements; `'#document'` for the root node. */
  tagName?: string;

  /** Attribute map copied from the opening-tag token (elements only). */
  attributes?: Record<string, string>;

  /** Child nodes in document order (document and element nodes). */
  children?: ASTNode[];

  /** Raw token value for non-element nodes. */
  content?: string;

  /** Node this one was appended to; creates a cycle with `children`. */
  parent?: ASTNode;

  /** True when the tag was self-closing or is a void element. */
  isSelfClosing?: boolean;

  /** Source location, taken over from the originating token. */
  position?: {
    start: number;
    end: number;
    line: number;
    column: number;
  };
}
19
+
20
/**
 * Discriminator for {@link ASTNode}. Each member's runtime value is its own
 * name as a string.
 */
export enum ASTNodeType {
  /** Root of the tree. */
  DOCUMENT = 'DOCUMENT',
  /** A tag with optional attributes and children. */
  ELEMENT = 'ELEMENT',
  /** Character data between tags. */
  TEXT = 'TEXT',
  /** A comment token. */
  COMMENT = 'COMMENT',
  /** A CDATA token. */
  CDATA = 'CDATA',
  /** A doctype token. */
  DOCTYPE = 'DOCTYPE',
  /** A processing-instruction token. */
  PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION',
}
29
+
30
/** Mutable bookkeeping threaded through every parsing helper in this file. */
export interface ParserState {
  /** Token stream being consumed. */
  tokens: Token[];

  /** Index into `tokens` of the token currently being processed. */
  position: number;

  /** Number of tokens, captured once when the state is created. */
  length: number;

  /** Currently open nodes; the bottom entry is the document root. */
  stack: ASTNode[];

  /** Document node that is ultimately returned to the caller. */
  root: ASTNode;

  /** Problems recorded while parsing. */
  errors: ParseError[];
}
38
+
39
/** A problem recorded while building the tree. */
export interface ParseError {
  /** Human-readable description of the problem. */
  message: string;

  /** Offset associated with the problem (callers here pass a token's `position.start`, or 0). */
  position: number;

  /** Line of the problem. */
  line: number;

  /** Column of the problem. */
  column: number;

  /** How serious the problem is. */
  severity: 'error' | 'warning';
}
46
+
47
/**
 * Tag names that are always treated as self-closing when opened, even when
 * the token itself is not flagged as self-closing. Such elements are never
 * pushed onto the open-element stack.
 */
const VOID_ELEMENTS = new Set([
  'area',
  'base',
  'br',
  'col',
  'embed',
  'hr',
  'img',
  'input',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr',
]);
51
+
52
/**
 * Tag names grouped as raw-text elements.
 *
 * NOTE(review): nothing in the visible part of this file reads this set;
 * presumably it is reserved for raw-text handling — confirm before removing.
 */
const RAW_TEXT_ELEMENTS = new Set([
  'script',
  'style',
  'textarea',
  'title',
]);
55
+
56
/**
 * Implicit-close table, keyed by the tag that is being opened. The value
 * lists the tag names of the innermost open element that the new tag closes
 * implicitly (for example, opening an `li` closes a still-open `li`).
 *
 * The table is built on a null prototype so that looking up an arbitrary tag
 * name taken from the input (such as `constructor` or `__proto__`) yields
 * `undefined` instead of a member inherited from `Object.prototype`.
 */
const AUTO_CLOSE_RULES: Record<string, string[]> = Object.assign<
  Record<string, string[]>,
  Record<string, string[]>
>(Object.create(null), {
  li: ['li'],
  dt: ['dt', 'dd'],
  dd: ['dt', 'dd'],
  p: ['p'],
  rt: ['rt', 'rp'],
  rp: ['rt', 'rp'],
  optgroup: ['optgroup'],
  option: ['option'],
  thead: ['tbody', 'tfoot'],
  tbody: ['thead', 'tbody', 'tfoot'],
  tfoot: ['thead', 'tbody'],
  tr: ['tr'],
  td: ['td', 'th'],
  th: ['td', 'th'],
});
72
+
73
/**
 * Builds a document tree from a token stream.
 *
 * Tokens are consumed in order until the stream is exhausted or an EOF token
 * is reached. Elements that are still open at that point are popped and an
 * "Unclosed tag" error is recorded for each of them on the internal parser
 * state; only the root node is returned from this function.
 *
 * @param tokens Tokens to parse.
 * @returns The `#document` root node.
 */
export function parse(tokens: Token[]): ASTNode {
  const state = createParserState(tokens);

  for (; state.position < state.length; advance(state)) {
    const current = getCurrentToken(state);
    if (!current || current.type === TokenType.EOF) {
      // Leaving here skips the advance, so the position stays on this token.
      break;
    }
    parseToken(state, current);
  }

  // Everything above the root that is still on the stack was never closed.
  while (state.stack.length > 1) {
    const dangling = state.stack.pop()!;
    const lastToken = getCurrentToken(state);
    const where = lastToken?.position?.start || 0;
    addError(state, `Unclosed tag: ${dangling.tagName}`, where);
  }

  return state.root;
}
95
+
96
/**
 * Creates the initial parser state for a token stream: an empty `#document`
 * node that is both the root and the only entry on the open-node stack, the
 * read position at 0, and no recorded errors.
 *
 * @param tokens Tokens that will be consumed.
 * @returns A fresh state object.
 */
function createParserState(tokens: Token[]): ParserState {
  const documentNode: ASTNode = {
    type: ASTNodeType.DOCUMENT,
    children: [],
    tagName: '#document',
  };

  const initial: ParserState = {
    tokens,
    position: 0,
    length: tokens.length,
    stack: [documentNode],
    root: documentNode,
    errors: [],
  };
  return initial;
}
112
+
113
/**
 * Routes one token to the handler for its type. Token types without a case
 * below are ignored.
 *
 * @param state Parser state to update.
 * @param token Token to process.
 */
function parseToken(state: ParserState, token: Token): void {
  switch (token.type) {
    case TokenType.TAG_OPEN:
      return parseOpenTag(state, token);
    case TokenType.TAG_CLOSE:
      return parseCloseTag(state, token);
    case TokenType.TEXT:
      return parseText(state, token);
    case TokenType.COMMENT:
      return parseComment(state, token);
    case TokenType.CDATA:
      return parseCDATA(state, token);
    case TokenType.DOCTYPE:
      return parseDoctype(state, token);
    case TokenType.PROCESSING_INSTRUCTION:
      return parseProcessingInstruction(state, token);
    default:
      return;
  }
}
138
+
139
/**
 * Handles an opening-tag token: applies the implicit-close rules, appends a
 * new element to the current parent and, unless the element is self-closing
 * or void, makes it the new current parent.
 *
 * @param state Parser state to update.
 * @param token Opening-tag token; its value is the tag name.
 */
function parseOpenTag(state: ParserState, token: Token): void {
  const name = token.value.toLowerCase();

  // Must run before the parent lookup: it may pop the innermost open element.
  handleAutoClosing(state, name);

  const parent = getCurrentParent(state);
  const selfClosing = token.isSelfClosing || VOID_ELEMENTS.has(name);

  const node: ASTNode = {
    type: ASTNodeType.ELEMENT,
    tagName: name,
    attributes: token.attributes || {},
    children: [],
    parent,
    isSelfClosing: selfClosing,
    position: token.position,
  };

  parent.children?.push(node);

  if (node.isSelfClosing) {
    return;
  }
  state.stack.push(node);
}
164
+
165
/**
 * Handles a closing-tag token.
 *
 * The open-element stack is searched from the innermost entry outwards for an
 * element with the same (lower-cased) tag name. When one is found, every
 * element opened after it is popped with an "Unclosed tag" error and the
 * match itself is popped. When none is found, an "Unexpected closing tag"
 * error is recorded and the stack is left untouched.
 *
 * The bottom stack entry is the `#document` root and is never considered a
 * match, so a closing tag can never remove the root from the stack.
 *
 * @param state Parser state to update.
 * @param token Closing-tag token; its value is the tag name.
 */
function parseCloseTag(state: ParserState, token: Token): void {
  const tagName = token.value.toLowerCase();
  const errorPosition = token.position?.start || 0;

  // Stop above index 0: that slot holds the document root, not an element.
  let matchIndex = -1;
  for (let i = state.stack.length - 1; i > 0; i--) {
    if (state.stack[i]!.tagName === tagName) {
      matchIndex = i;
      break;
    }
  }

  if (matchIndex === -1) {
    addError(state, `Unexpected closing tag: ${tagName}`, errorPosition);
    return;
  }

  // Anything opened after the match was left unclosed.
  while (state.stack.length > matchIndex + 1) {
    const unclosedElement = state.stack.pop()!;
    addError(state, `Unclosed tag: ${unclosedElement.tagName}`, errorPosition);
  }

  // Finally close the matched element itself.
  state.stack.pop();
}
186
+
187
/**
 * Appends a text node for the token to the current parent. Whitespace-only
 * text is dropped when the parent is one of the elements for which
 * `shouldSkipWhitespace` returns true.
 *
 * @param state Parser state whose innermost open node receives the text.
 * @param token Text token; its value is the text content.
 */
function parseText(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);
  const text = token.value;

  const isBlank = text.trim() === '';
  if (isBlank && shouldSkipWhitespace(parent)) {
    return;
  }

  parent.children?.push({
    type: ASTNodeType.TEXT,
    content: text,
    parent,
    position: token.position,
  });
}
206
+
207
/**
 * Appends a comment node carrying the token's value to the current parent.
 *
 * @param state Parser state whose innermost open node receives the comment.
 * @param token Comment token.
 */
function parseComment(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);

  parent.children?.push({
    type: ASTNodeType.COMMENT,
    content: token.value,
    parent,
    position: token.position,
  });
}
221
+
222
/**
 * Appends a CDATA node carrying the token's value to the current parent.
 *
 * @param state Parser state whose innermost open node receives the node.
 * @param token CDATA token.
 */
function parseCDATA(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);
  const siblings = parent.children;
  if (!siblings) {
    return;
  }

  siblings.push({
    type: ASTNodeType.CDATA,
    content: token.value,
    parent,
    position: token.position,
  });
}
236
+
237
/**
 * Appends a doctype node carrying the token's value to the current parent.
 *
 * @param state Parser state whose innermost open node receives the node.
 * @param token Doctype token.
 */
function parseDoctype(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);
  const { value, position } = token;

  const node: ASTNode = {
    type: ASTNodeType.DOCTYPE,
    content: value,
    parent,
    position,
  };

  parent.children?.push(node);
}
251
+
252
/**
 * Appends a processing-instruction node carrying the token's value to the
 * current parent.
 *
 * @param state Parser state whose innermost open node receives the node.
 * @param token Processing-instruction token.
 */
function parseProcessingInstruction(state: ParserState, token: Token): void {
  const parent = getCurrentParent(state);

  const instruction: ASTNode = {
    type: ASTNodeType.PROCESSING_INSTRUCTION,
    content: token.value,
    parent,
    position: token.position,
  };

  const siblings = parent.children;
  if (siblings) {
    siblings.push(instruction);
  }
}
266
+
267
/**
 * Applies the implicit-close rules before a new element is opened: when the
 * innermost open element's tag name is listed in the rule for `tagName`, that
 * element is popped from the stack.
 *
 * @param state Parser state whose stack may be popped once.
 * @param tagName Lower-cased name of the tag about to be opened.
 */
function handleAutoClosing(state: ParserState, tagName: string): void {
  // Tag names come from the input, so only own entries of the rule table may
  // count. A plain property read would otherwise return inherited members for
  // names such as `constructor` or `__proto__`, and calling `.includes` on
  // them throws.
  const autoCloseList = Object.prototype.hasOwnProperty.call(AUTO_CLOSE_RULES, tagName)
    ? AUTO_CLOSE_RULES[tagName]
    : undefined;
  if (!autoCloseList) return;

  const currentElement = getCurrentElement(state);
  if (currentElement?.tagName && autoCloseList.includes(currentElement.tagName)) {
    state.stack.pop();
  }
}
276
+
277
/**
 * Returns the innermost open node, i.e. the top of the stack. New nodes are
 * appended to this node's children.
 *
 * @param state Parser state to inspect.
 */
function getCurrentParent(state: ParserState): ASTNode {
  const top = state.stack.length - 1;
  return state.stack[top]!;
}
280
+
281
/**
 * Finds the innermost open node whose type is ELEMENT, searching the stack
 * from the top down.
 *
 * @param state Parser state to inspect.
 * @returns That element, or `null` when no element is open.
 */
function getCurrentElement(state: ParserState): ASTNode | null {
  let index = state.stack.length;
  while (index-- > 0) {
    const candidate = state.stack[index]!;
    if (candidate.type === ASTNodeType.ELEMENT) {
      return candidate;
    }
  }
  return null;
}
290
+
291
/**
 * Returns the token at the current read position.
 *
 * @param state Parser state to inspect.
 * @returns The token, or `null` when the position holds no token.
 */
function getCurrentToken(state: ParserState): Token | null {
  const token = state.tokens[state.position];
  return token ? token : null;
}
294
+
295
/**
 * Moves the read position forward by one token.
 *
 * @param state Parser state to update.
 */
function advance(state: ParserState): void {
  state.position += 1;
}
298
+
299
/**
 * Records a problem on the parser state. Every entry is stored with severity
 * `'error'`, and with `line` and `column` set to 0 because only an offset is
 * passed in.
 *
 * @param state Parser state whose error list is appended to.
 * @param message Description of the problem.
 * @param position Offset to associate with the problem.
 */
function addError(state: ParserState, message: string, position: number): void {
  const entry: ParseError = {
    message,
    position,
    line: 0,
    column: 0,
    severity: 'error',
  };
  state.errors.push(entry);
}
308
+
309
/**
 * Tag names of parents inside which whitespace-only text is dropped.
 * Built once at module load rather than on every text token.
 */
const WHITESPACE_SKIPPING_PARENTS: ReadonlySet<string> = new Set([
  'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
  'ul', 'ol', 'dl', 'select', 'optgroup',
]);

/**
 * Tells whether whitespace-only text directly inside `parent` should be
 * discarded.
 *
 * @param parent Node that would receive the text.
 * @returns True when the parent's tag name is in the skip list; false for
 *   any other tag name and for nodes without a tag name.
 */
function shouldSkipWhitespace(parent: ASTNode): boolean {
  return parent.tagName !== undefined && WHITESPACE_SKIPPING_PARENTS.has(parent.tagName);
}
317
+
318
/**
 * Walks a tree depth-first in pre-order: the callback runs on a node before
 * it runs on any of that node's children, and children are visited in array
 * order.
 *
 * @param node Root of the subtree to walk.
 * @param callback Invoked once for every node in the subtree.
 */
export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
  callback(node);

  const descendants = node.children;
  if (!descendants) {
    return;
  }

  for (const next of descendants) {
    traverseAST(next, callback);
  }
}
327
+
328
/**
 * Collects every element in a subtree whose tag name equals `tagName`.
 *
 * The requested name is lower-cased once before the walk and compared
 * against each element's stored `tagName` as-is. Only nodes of type ELEMENT
 * can match, so the `#document` root is never returned.
 *
 * @param root Root of the subtree to search (included in the search).
 * @param tagName Tag name to look for; letter case is ignored.
 * @returns Matching elements in pre-order document order.
 */
export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
  // Hoisted: the search key is the same for every visited node.
  const wanted = tagName.toLowerCase();
  const results: ASTNode[] = [];

  traverseAST(root, (node) => {
    if (node.type === ASTNodeType.ELEMENT && node.tagName === wanted) {
      results.push(node);
    }
  });

  return results;
}
339
+
340
/**
 * Collects every element in a subtree that carries the given attribute.
 *
 * Only attributes stored directly on the element's attribute map count;
 * names that merely exist on `Object.prototype` (such as `toString` or
 * `constructor`) do not match unless the element really has that attribute.
 *
 * @param root Root of the subtree to search (included in the search).
 * @param attrName Attribute name to look for, compared exactly.
 * @param attrValue When given, the attribute's value must equal it exactly;
 *   when omitted, the attribute's presence is enough.
 * @returns Matching elements in pre-order document order.
 */
export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
  const results: ASTNode[] = [];

  traverseAST(root, (node) => {
    if (node.type !== ASTNodeType.ELEMENT || !node.attributes) {
      return;
    }

    // Own-property test: the `in` operator would also see inherited members.
    const hasAttr = Object.prototype.hasOwnProperty.call(node.attributes, attrName);
    if (!hasAttr) {
      return;
    }

    if (attrValue === undefined || node.attributes[attrName] === attrValue) {
      results.push(node);
    }
  });

  return results;
}