@tkeron/html-parser 1.0.0 → 1.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.0.0",
3
+ "version": "1.1.1",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -28,7 +28,8 @@ export const enum NodeType {
28
28
 
29
29
  export function createElement(
30
30
  tagName: string,
31
- attributes: Record<string, string> = {}
31
+ attributes: Record<string, string> = {},
32
+ namespaceURI?: string
32
33
  ): any {
33
34
  const innerHTML = "";
34
35
  const tagNameLower = tagName.toLowerCase();
@@ -46,6 +47,7 @@ export function createElement(
46
47
  nodeName: tagName.toUpperCase(),
47
48
  nodeValue: null,
48
49
  tagName: tagName.toUpperCase(),
50
+ namespaceURI: namespaceURI || null,
49
51
  attributes: { ...attributes },
50
52
  childNodes: [],
51
53
  children: [],
@@ -941,11 +943,12 @@ export function setInnerHTML(element: any, html: string): void {
941
943
  element.lastElementChild = null;
942
944
 
943
945
  if (html.trim()) {
944
- const tokens = tokenize(html);
946
+ const wrappedHtml = '<div>' + html + '</div>';
947
+ const tokens = tokenize(wrappedHtml);
945
948
  const doc = parse(tokens);
946
- const body = doc.body;
947
- if (body && body.childNodes) {
948
- const nodesToMove = [...body.childNodes];
949
+ const div = doc.querySelector('div');
950
+ if (div && div.childNodes) {
951
+ const nodesToMove = [...div.childNodes];
949
952
  for (const child of nodesToMove) {
950
953
  child.parentNode = null;
951
954
  appendChild(element, child);
package/src/parser.ts CHANGED
@@ -138,20 +138,35 @@ export function parse(tokens: Token[]): any {
138
138
  appendChild(html, body);
139
139
 
140
140
  const doctypes: any[] = [];
141
+ const commentsBeforeHtml: any[] = [];
142
+ const bodyContent: any[] = [];
141
143
  const children = [...state.root.childNodes];
144
+
145
+ let foundElement = false;
142
146
  for (const child of children) {
143
147
  if (child.nodeType === 10) {
144
148
  doctypes.push(child);
149
+ } else if (child.nodeType === 8 && !foundElement) {
150
+ commentsBeforeHtml.push(child);
145
151
  } else {
146
- appendChild(body, child);
152
+ if (child.nodeType === 1) foundElement = true;
153
+ bodyContent.push(child);
147
154
  }
148
155
  }
149
156
 
157
+ for (const content of bodyContent) {
158
+ appendChild(body, content);
159
+ }
160
+
150
161
  state.root.childNodes = [];
151
162
  for (const doctype of doctypes) {
152
163
  doctype.parentNode = null;
153
164
  appendChild(state.root, doctype);
154
165
  }
166
+ for (const comment of commentsBeforeHtml) {
167
+ comment.parentNode = null;
168
+ appendChild(state.root, comment);
169
+ }
155
170
  appendChild(state.root, html);
156
171
  state.root.documentElement = html;
157
172
  state.root.head = head;
@@ -364,6 +379,10 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
364
379
  } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
365
380
  parseOpenTag(state, token);
366
381
  } else if (tagName === 'head') {
382
+ // Ignore duplicate <head> tags
383
+ } else if (tagName.includes('-')) {
384
+ // Custom elements (tags with hyphens) are valid in <head>
385
+ parseOpenTag(state, token);
367
386
  } else {
368
387
  state.stack.pop();
369
388
  state.insertionMode = InsertionMode.AfterHead;
@@ -378,6 +397,9 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
378
397
  if (currentTagName === tagName) {
379
398
  state.stack.pop();
380
399
  }
400
+ } else if (tagName.includes('-') && currentTagName === tagName) {
401
+ // Handle closing tags for custom elements in <head>
402
+ state.stack.pop();
381
403
  }
382
404
  } else if (token.type === TokenType.COMMENT) {
383
405
  parseComment(state, token);
@@ -410,6 +432,9 @@ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
410
432
  }
411
433
  }
412
434
 
435
+ const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
436
+ const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
437
+
413
438
  function parseTokenInInBodyMode(state: ParserState, token: Token): void {
414
439
  if (token.type === TokenType.TAG_OPEN) {
415
440
  const tagName = token.value.toLowerCase();
@@ -418,7 +443,14 @@ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
418
443
 
419
444
  const currentParent = getCurrentParent(state);
420
445
 
421
- const element = createElement(tagName, token.attributes || {});
446
+ let namespaceURI: string | undefined;
447
+ if (tagName === 'svg') {
448
+ namespaceURI = SVG_NAMESPACE;
449
+ } else if (tagName === 'math') {
450
+ namespaceURI = MATHML_NAMESPACE;
451
+ }
452
+
453
+ const element = createElement(tagName, token.attributes || {}, namespaceURI);
422
454
 
423
455
  appendChild(currentParent, element);
424
456
 
package/src/tokenizer.ts CHANGED
@@ -93,16 +93,63 @@ function decodeEntities(text: string): string {
93
93
 
94
94
  function parseAttributes(attributeString: string): Record<string, string> {
95
95
  const attributes: Record<string, string> = {};
96
+ let i = 0;
96
97
 
97
- const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
98
- let match;
99
-
100
- while ((match = attrRegex.exec(attributeString)) !== null) {
101
- const [, name, doubleQuoted, singleQuoted, unquoted] = match;
102
- if (name) {
103
- const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
104
- attributes[name.toLowerCase()] = decodeEntities(value);
98
+ while (i < attributeString.length) {
99
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
100
+ i++;
105
101
  }
102
+ if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
103
+ break;
104
+ }
105
+
106
+ let name = '';
107
+ while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
108
+ name += attributeString[i];
109
+ i++;
110
+ }
111
+
112
+ if (!name) {
113
+ i++;
114
+ continue;
115
+ }
116
+
117
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
118
+ i++;
119
+ }
120
+
121
+ let value = '';
122
+ if (i < attributeString.length && attributeString[i] === '=') {
123
+ i++;
124
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
125
+ i++;
126
+ }
127
+
128
+ if (i < attributeString.length) {
129
+ if (attributeString[i] === '"') {
130
+ i++;
131
+ while (i < attributeString.length && attributeString[i] !== '"') {
132
+ value += attributeString[i];
133
+ i++;
134
+ }
135
+ i++;
136
+ } else if (attributeString[i] === "'") {
137
+ i++;
138
+ while (i < attributeString.length && attributeString[i] !== "'") {
139
+ value += attributeString[i];
140
+ i++;
141
+ }
142
+ i++;
143
+ } else {
144
+ while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
145
+ value += attributeString[i];
146
+ i++;
147
+ }
148
+ }
149
+ }
150
+ }
151
+
152
+ attributes[name.toLowerCase()] = decodeEntities(value);
106
153
  }
107
154
 
108
155
  return attributes;
@@ -117,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
117
164
  };
118
165
  }
119
166
 
167
+ const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
168
+ const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
169
+
120
170
  export function tokenize(html: string): Token[] {
121
171
  const tokens: Token[] = [];
122
- let position = 0;
123
-
124
- const specialCases = [
125
- {
126
- pattern: /<!DOCTYPE\s+[^>]*>/gi,
127
- type: TokenType.DOCTYPE,
128
- getValue: (match: string) => {
129
- const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
130
- return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
131
- }
132
- },
133
- {
134
- pattern: /<!--([\s\S]*?)(?:-->|$)/g,
135
- type: TokenType.COMMENT,
136
- getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
137
- },
138
- {
139
- pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
140
- type: TokenType.CDATA,
141
- getValue: (match: string) => match.slice(9, -3)
142
- },
143
- {
144
- pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
145
- type: TokenType.PROCESSING_INSTRUCTION,
146
- getValue: (match: string) => match.slice(0, -2)
147
- }
148
- ];
149
-
150
- const processedRanges: Array<[number, number]> = [];
151
-
152
- for (const { pattern, type, getValue } of specialCases) {
153
- const regex = new RegExp(pattern);
154
- let match;
155
-
156
- while ((match = regex.exec(html)) !== null) {
157
- const start = match.index;
158
- const end = start + match[0].length;
159
-
160
- tokens.push({
161
- type,
162
- value: getValue(match[0]),
163
- position: calculatePosition(html, start)
164
- });
165
-
166
- processedRanges.push([start, end]);
167
- }
168
- }
169
-
170
- processedRanges.sort((a, b) => a[0] - b[0]);
171
-
172
172
  let currentPos = 0;
173
173
 
174
174
  while (currentPos < html.length) {
175
- const inProcessedRange = processedRanges.some(([start, end]) =>
176
- currentPos >= start && currentPos < end
177
- );
178
-
179
- if (inProcessedRange) {
180
- const range = processedRanges.find(([start, end]) =>
181
- currentPos >= start && currentPos < end
182
- );
183
- if (range) {
184
- currentPos = range[1];
185
- }
186
- continue;
187
- }
188
-
189
175
  const char = html[currentPos];
190
176
 
191
177
  if (char === '<') {
192
- const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
178
+ const remaining = html.slice(currentPos);
179
+
180
+ const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
181
+ if (doctypeMatch) {
182
+ const match = doctypeMatch[0];
183
+ const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
184
+ tokens.push({
185
+ type: TokenType.DOCTYPE,
186
+ value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
187
+ position: calculatePosition(html, currentPos)
188
+ });
189
+ currentPos += match.length;
190
+ continue;
191
+ }
192
+
193
+ const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
194
+ if (commentMatch) {
195
+ const match = commentMatch[0];
196
+ tokens.push({
197
+ type: TokenType.COMMENT,
198
+ value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
199
+ position: calculatePosition(html, currentPos)
200
+ });
201
+ currentPos += match.length;
202
+ continue;
203
+ }
204
+
205
+ const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
206
+ if (cdataMatch) {
207
+ const content = cdataMatch[1];
208
+ tokens.push({
209
+ type: TokenType.COMMENT,
210
+ value: '[CDATA[' + content + ']]',
211
+ position: calculatePosition(html, currentPos)
212
+ });
213
+ currentPos += cdataMatch[0].length;
214
+ continue;
215
+ }
216
+
217
+ const piMatch = remaining.match(/^<\?([^>]*)/);
218
+ if (piMatch) {
219
+ let consumed = piMatch[0].length;
220
+ if (remaining[consumed] === '>') {
221
+ consumed++;
222
+ }
223
+ tokens.push({
224
+ type: TokenType.COMMENT,
225
+ value: '?' + piMatch[1],
226
+ position: calculatePosition(html, currentPos)
227
+ });
228
+ currentPos += consumed;
229
+ continue;
230
+ }
231
+
232
+ const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
193
233
 
194
234
  if (tagMatch) {
195
235
  const fullTag = tagMatch[0];
@@ -222,6 +262,24 @@ export function tokenize(html: string): Token[] {
222
262
  });
223
263
 
224
264
  currentPos += fullTag.length;
265
+
266
+ if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
267
+ const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
268
+ const restOfHtml = html.slice(currentPos);
269
+ const closeMatch = restOfHtml.match(closeTagPattern);
270
+
271
+ if (closeMatch && closeMatch.index !== undefined) {
272
+ const rawContent = restOfHtml.slice(0, closeMatch.index);
273
+ if (rawContent) {
274
+ tokens.push({
275
+ type: TokenType.TEXT,
276
+ value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
277
+ position: calculatePosition(html, currentPos)
278
+ });
279
+ }
280
+ currentPos += rawContent.length;
281
+ }
282
+ }
225
283
  } else {
226
284
  const textStart = currentPos;
227
285
  currentPos++;
@@ -257,8 +315,6 @@ export function tokenize(html: string): Token[] {
257
315
  }
258
316
  }
259
317
 
260
- tokens.sort((a, b) => a.position.offset - b.position.offset);
261
-
262
318
  tokens.push({
263
319
  type: TokenType.EOF,
264
320
  value: '',
@@ -60,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
60
60
  });
61
61
  });
62
62
 
63
- it('should handle complex CDATA content', () => {
63
+ it('should handle complex CDATA content as bogus comment', () => {
64
64
  const complexContent = `
65
65
  function test() {
66
66
  return "<div>HTML inside JS</div>";
@@ -71,8 +71,8 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
71
71
  expect(tokens.length).toBeGreaterThan(0);
72
72
  const cdataToken = tokens[0]!;
73
73
 
74
- expect(cdataToken.type).toBe(TokenType.CDATA);
75
- expect(cdataToken.value).toBe(complexContent);
74
+ expect(cdataToken.type).toBe(TokenType.COMMENT);
75
+ expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
76
76
  });
77
77
 
78
78
  it('should handle performance with large documents', () => {
@@ -0,0 +1,105 @@
1
+ import { describe, it, expect } from 'bun:test';
2
+ import { parseHTML } from '../index';
3
+
4
+ describe('Custom Elements in <head>', () => {
5
+
6
+ it('should keep <meta-tags> custom element in head', () => {
7
+ const doc = parseHTML(
8
+ '<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
9
+ );
10
+
11
+ const metaTags = doc.head?.querySelector('meta-tags');
12
+ expect(metaTags).toBeTruthy();
13
+ expect(metaTags?.parentElement?.tagName).toBe('HEAD');
14
+ });
15
+
16
+ it('should keep <social-meta> custom element in head', () => {
17
+ const doc = parseHTML(
18
+ '<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
19
+ );
20
+
21
+ const socialMeta = doc.head?.querySelector('social-meta');
22
+ expect(socialMeta).toBeTruthy();
23
+ expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
24
+ });
25
+
26
+ it('should keep any <custom-element> with hyphen in head', () => {
27
+ const doc = parseHTML(
28
+ '<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
29
+ );
30
+
31
+ const myComponent = doc.head?.querySelector('my-component');
32
+ expect(myComponent).toBeTruthy();
33
+ expect(myComponent?.parentElement?.tagName).toBe('HEAD');
34
+ });
35
+
36
+ it('should still eject non-custom elements like <div> to body', () => {
37
+ const doc = parseHTML(
38
+ '<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
39
+ );
40
+
41
+ const divInHead = doc.head?.querySelector('div');
42
+ const divInBody = doc.body?.querySelector('div');
43
+ expect(divInHead).toBeFalsy();
44
+ expect(divInBody).toBeTruthy();
45
+ });
46
+
47
+ it('should handle nested custom elements in head', () => {
48
+ const doc = parseHTML(
49
+ '<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
50
+ );
51
+
52
+ const myWrapper = doc.head?.querySelector('my-wrapper');
53
+ expect(myWrapper).toBeTruthy();
54
+ expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
55
+
56
+ const innerComp = myWrapper?.querySelector('inner-comp');
57
+ expect(innerComp).toBeTruthy();
58
+ });
59
+
60
+ it('should keep custom elements with attributes in head', () => {
61
+ const doc = parseHTML(
62
+ '<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
63
+ );
64
+
65
+ const seoMeta = doc.head?.querySelector('seo-meta');
66
+ expect(seoMeta).toBeTruthy();
67
+ expect(seoMeta?.getAttribute('property')).toBe('og:title');
68
+ expect(seoMeta?.getAttribute('content')).toBe('Test');
69
+ expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
70
+ });
71
+
72
+ it('should keep self-closing custom elements in head', () => {
73
+ const doc = parseHTML(
74
+ '<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
75
+ );
76
+
77
+ const customVoid = doc.head?.querySelector('custom-void');
78
+ expect(customVoid).toBeTruthy();
79
+ expect(customVoid?.parentElement?.tagName).toBe('HEAD');
80
+ });
81
+
82
+ it('should handle custom elements mixed with standard head elements', () => {
83
+ const doc = parseHTML(
84
+ '<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
85
+ );
86
+
87
+ const title = doc.head?.querySelector('title');
88
+ const metaTags = doc.head?.querySelector('meta-tags');
89
+ const link = doc.head?.querySelector('link');
90
+
91
+ expect(title).toBeTruthy();
92
+ expect(metaTags).toBeTruthy();
93
+ expect(link).toBeTruthy();
94
+ });
95
+
96
+ it('should handle custom element containing text in head', () => {
97
+ const doc = parseHTML(
98
+ '<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
99
+ );
100
+
101
+ const inlineScript = doc.head?.querySelector('inline-script');
102
+ expect(inlineScript).toBeTruthy();
103
+ expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
104
+ });
105
+ });