@dooboostore/dom-parser 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/cjs/DomParser.js +33 -12
  2. package/dist/cjs/DomParser.js.map +2 -2
  3. package/dist/cjs/node/DocumentBase.js +4 -0
  4. package/dist/cjs/node/DocumentBase.js.map +2 -2
  5. package/dist/cjs/node/elements/Element.js.map +1 -1
  6. package/dist/cjs/node/elements/ElementBase.js +12 -2
  7. package/dist/cjs/node/elements/ElementBase.js.map +2 -2
  8. package/dist/cjs/node/elements/HTMLElement.js.map +1 -1
  9. package/dist/cjs/node/elements/HTMLElementBase.js +154 -2
  10. package/dist/cjs/node/elements/HTMLElementBase.js.map +2 -2
  11. package/dist/cjs/window/WindowBase.js +128 -7
  12. package/dist/cjs/window/WindowBase.js.map +2 -2
  13. package/dist/esm/DomParser.js +33 -12
  14. package/dist/esm/DomParser.js.map +2 -2
  15. package/dist/esm/node/DocumentBase.js +4 -0
  16. package/dist/esm/node/DocumentBase.js.map +2 -2
  17. package/dist/esm/node/elements/ElementBase.js +12 -2
  18. package/dist/esm/node/elements/ElementBase.js.map +2 -2
  19. package/dist/esm/node/elements/HTMLElementBase.js +154 -2
  20. package/dist/esm/node/elements/HTMLElementBase.js.map +2 -2
  21. package/dist/esm/window/WindowBase.js +128 -7
  22. package/dist/esm/window/WindowBase.js.map +2 -2
  23. package/dist/esm-bundle/dooboostore-dom-parser.esm.js +504 -195
  24. package/dist/esm-bundle/dooboostore-dom-parser.esm.js.map +3 -3
  25. package/dist/types/DomParser.d.ts +4 -0
  26. package/dist/types/DomParser.d.ts.map +1 -1
  27. package/dist/types/node/DocumentBase.d.ts +2 -0
  28. package/dist/types/node/DocumentBase.d.ts.map +1 -1
  29. package/dist/types/node/elements/Element.d.ts +1 -0
  30. package/dist/types/node/elements/Element.d.ts.map +1 -1
  31. package/dist/types/node/elements/ElementBase.d.ts +2 -2
  32. package/dist/types/node/elements/ElementBase.d.ts.map +1 -1
  33. package/dist/types/node/elements/HTMLElement.d.ts +32 -1
  34. package/dist/types/node/elements/HTMLElement.d.ts.map +1 -1
  35. package/dist/types/node/elements/HTMLElementBase.d.ts +11 -2
  36. package/dist/types/node/elements/HTMLElementBase.d.ts.map +1 -1
  37. package/dist/types/window/WindowBase.d.ts +12 -2
  38. package/dist/types/window/WindowBase.d.ts.map +1 -1
  39. package/dist/umd-bundle/dooboostore-dom-parser.umd.js +504 -195
  40. package/dist/umd-bundle/dooboostore-dom-parser.umd.js.map +3 -3
  41. package/package.json +1 -1
  42. package/src/DomParser.ts +457 -436
  43. package/src/node/DocumentBase.ts +7 -2
  44. package/src/node/elements/Element.ts +24 -23
  45. package/src/node/elements/ElementBase.ts +50 -41
  46. package/src/node/elements/HTMLElement.ts +36 -1
  47. package/src/node/elements/HTMLElementBase.ts +191 -5
  48. package/src/window/WindowBase.ts +1128 -919
package/src/DomParser.ts CHANGED
@@ -1,491 +1,512 @@
1
- import { DocumentBase } from './node/DocumentBase';
2
- import { TextBase } from './node/TextBase';
3
- import { Comment } from './node/Comment';
4
- import { WindowBase } from './window/WindowBase';
1
+ import {DocumentBase} from './node/DocumentBase';
2
+ import {TextBase} from './node/TextBase';
3
+ import {Comment} from './node/Comment';
4
+ import {WindowBase} from './window/WindowBase';
5
5
 
6
6
  export interface DomParserOptions {
7
- href?: string;
7
+ href?: string;
8
8
  }
9
9
 
10
10
  export class DomParser {
11
- private _window: Window;
12
- private _document: Document;
13
-
14
- constructor(html: string, option?: DomParserOptions) {
15
- // Create a new document instance
16
- const document = new DocumentBase();
17
-
18
- // Create WindowBase instance with the document
19
- const windowBase = new WindowBase(document, option?.href);
20
-
21
- this._window = windowBase as unknown as Window;
22
- this._document = document as any;
23
-
24
- // Parse the provided HTML string
25
- this.parseHTML(html);
26
-
27
- // Set up document references after parsing
28
- this.setupDocumentReferences();
29
-
30
- // Simulate document loading process
31
- if (document && (document as any).simulateLoading) {
32
- (document as any).simulateLoading();
33
- }
11
+ private _window: Window | null;
12
+ private _document: Document | null;
13
+
14
+ constructor(html: string, option?: DomParserOptions) {
15
+ // Create WindowBase instance with the document
16
+ const windowBase = new WindowBase({initialUrl: option?.href});
17
+ this._window = windowBase as unknown as Window;
18
+ this._document = windowBase.document as any;
19
+
20
+ // Parse the provided HTML string
21
+ this.parseHTML(html);
22
+
23
+ // Set up document references after parsing
24
+ this.setupDocumentReferences();
25
+
26
+ // Simulate document loading process
27
+ if (this._document && (this._document as any).simulateLoading) {
28
+ (this._document as any).simulateLoading();
34
29
  }
30
+ }
35
31
 
36
- get window(): Window {
37
- return this._window;
32
+ get window(): Window {
33
+ if (!this._window) {
34
+ throw new Error('DomParser has been destroyed');
38
35
  }
36
+ return this._window;
37
+ }
39
38
 
40
- get document(): Document {
41
- return this._document;
39
+ get document(): Document {
40
+ if (!this._document) {
41
+ throw new Error('DomParser has been destroyed');
42
42
  }
43
- /**
44
- * Load new HTML content and replace the current document
45
- */
46
- loadHTML(html: string): void {
47
- // Clear current document content
48
- this.clearDocument();
49
-
50
- // Parse new HTML
51
- this.parseHTML(html);
52
-
53
- // Set up document references after parsing
54
- this.setupDocumentReferences();
43
+ return this._document;
44
+ }
45
+
46
+ /**
47
+ * Destroy the DomParser instance and free memory
48
+ */
49
+ destroy(): void {
50
+ if (this._window) {
51
+ this._window.close();
52
+ this._window = null;
55
53
  }
56
-
57
- private clearDocument(): void {
58
- // Clear document body and head content while preserving structure
59
- if (this.document.head) {
60
- while (this.document.head.firstChild) {
61
- this.document.head.removeChild(this.document.head.firstChild);
62
- }
63
- }
64
-
65
- if (this.document.body) {
66
- while (this.document.body.firstChild) {
67
- this.document.body.removeChild(this.document.body.firstChild);
68
- }
69
- }
54
+
55
+ this._document = null;
56
+ }
57
+
58
+ /**
59
+ * Load new HTML content and replace the current document
60
+ */
61
+ loadHTML(html: string): void {
62
+ if (!this._document) {
63
+ throw new Error('DomParser has been destroyed');
70
64
  }
71
-
72
- parseHTML(html: string): void {
73
- // Simple HTML parsing implementation
74
- if (!html.trim()) {
75
- return;
76
- }
77
-
78
- // Basic HTML parsing - this is a simplified version
79
- // In a real implementation, you'd use a proper HTML parser
80
- this.parseHTMLString(html, this.document);
65
+
66
+ // Clear current document content
67
+ this.clearDocument();
68
+
69
+ // Parse new HTML
70
+ this.parseHTML(html);
71
+
72
+ // Set up document references after parsing
73
+ this.setupDocumentReferences();
74
+ }
75
+
76
+ private clearDocument(): void {
77
+ if (!this._document) return;
78
+
79
+ // Clear document body and head content while preserving structure
80
+ if (this._document.head) {
81
+ while (this._document.head.firstChild) {
82
+ this._document.head.removeChild(this._document.head.firstChild);
83
+ }
81
84
  }
82
85
 
83
- private parseHTMLString(html: string, parent: any): void {
84
- // Remove DOCTYPE if present
85
- html = html.replace(/<!DOCTYPE[^>]*>/i, '').trim();
86
-
87
- if (!html) return;
86
+ if (this._document.body) {
87
+ while (this._document.body.firstChild) {
88
+ this._document.body.removeChild(this._document.body.firstChild);
89
+ }
90
+ }
91
+ }
88
92
 
89
- // Handle template tags specially
90
- const templateRegex = /<template([^>]*)>(.*?)<\/template>/gs;
91
- html = html.replace(templateRegex, (match, attributes, content) => {
92
- const element = this.document.createElement('template');
93
+ parseHTML(html: string): void {
94
+ if (!this._document) {
95
+ throw new Error('DomParser has been destroyed');
96
+ }
97
+ // Simple HTML parsing implementation
98
+ if (!html.trim()) {
99
+ return;
100
+ }
93
101
 
94
- // Parse attributes
95
- if (attributes.trim()) {
96
- this.parseAttributes(attributes, element);
97
- }
102
+ // Basic HTML parsing - this is a simplified version
103
+ // In a real implementation, you'd use a proper HTML parser
104
+ this.parseHTMLString(html, this.document);
105
+ }
106
+
107
+ private parseHTMLString(html: string, parent: any): void {
108
+ // Remove DOCTYPE if present
109
+ html = html.replace(/<!DOCTYPE[^>]*>/i, '').trim();
110
+
111
+ if (!html) return;
112
+
113
+ // Handle template tags specially
114
+ const templateRegex = /<template([^>]*)>(.*?)<\/template>/gs;
115
+ html = html.replace(templateRegex, (match, attributes, content) => {
116
+ const element = this.document.createElement('template');
117
+
118
+ // Parse attributes
119
+ if (attributes.trim()) {
120
+ this.parseAttributes(attributes, element);
121
+ }
122
+
123
+ // Parse content directly into the template's content fragment
124
+ if (content.trim()) {
125
+ // Use internal method to avoid appendChild side effects
126
+ this.parseHTMLString(content.trim(), element.content);
127
+ }
128
+
129
+ parent.appendChild(element);
130
+ return ''; // Remove from HTML string
131
+ });
132
+
133
+ // Improved HTML parsing with proper nesting support
134
+ this.parseHTMLRecursive(html, parent);
135
+ }
136
+
137
+ private parseHTMLRecursive(html: string, parent: any): void {
138
+ let position = 0;
139
+
140
+ while (position < html.length) {
141
+ // Find next tag or comment
142
+ const tagStart = html.indexOf('<', position);
143
+
144
+ if (tagStart === -1) {
145
+ // No more tags, add remaining text if any
146
+ const remainingText = html.substring(position).trim();
147
+ if (remainingText) {
148
+ const textNode = new TextBase(remainingText, this.document);
149
+ parent.appendChild(textNode);
150
+ }
151
+ break;
152
+ }
153
+
154
+ // Add text before tag if any
155
+ if (tagStart > position) {
156
+ const textContent = html.substring(position, tagStart).trim();
157
+ if (textContent) {
158
+ const textNode = new TextBase(textContent, this.document);
159
+ parent.appendChild(textNode);
160
+ }
161
+ }
162
+
163
+ // Check if this is a comment
164
+ if (html.substring(tagStart, tagStart + 4) === '<!--') {
165
+ const commentEnd = html.indexOf('-->', tagStart + 4);
166
+ if (commentEnd !== -1) {
167
+ const commentContent = html.substring(tagStart + 4, commentEnd);
168
+ const commentNode = new Comment(commentContent, this.document);
169
+ parent.appendChild(commentNode);
170
+ position = commentEnd + 3;
171
+ continue;
172
+ } else {
173
+ // Malformed comment, treat as text
174
+ const textNode = new TextBase(html.substring(tagStart, tagStart + 4), this.document);
175
+ parent.appendChild(textNode);
176
+ position = tagStart + 4;
177
+ continue;
178
+ }
179
+ }
180
+
181
+ // Find tag end
182
+ const tagEnd = html.indexOf('>', tagStart);
183
+ if (tagEnd === -1) break;
184
+
185
+ const tagContent = html.substring(tagStart + 1, tagEnd);
186
+
187
+ // Check if it's a closing tag
188
+ if (tagContent.startsWith('/')) {
189
+ // This is a closing tag, we should return to parent
190
+ position = tagEnd + 1;
191
+ break;
192
+ }
193
+
194
+ // Check if it's a self-closing tag
195
+ const isSelfClosing = tagContent.endsWith('/') || this.isSelfClosingTag(tagContent.split(/\s+/)[0]);
196
+
197
+ // Parse tag name and attributes
198
+ const spaceIndex = tagContent.indexOf(' ');
199
+ const tagName = spaceIndex === -1 ? tagContent.replace('/', '') : tagContent.substring(0, spaceIndex);
200
+ let attributes = spaceIndex === -1 ? '' : tagContent.substring(spaceIndex + 1);
201
+
202
+ // Only remove trailing slash for self-closing tags (not from attribute values)
203
+ if (attributes.endsWith('/')) {
204
+ attributes = attributes.slice(0, -1).trim();
205
+ }
206
+
207
+
208
+ // Create element
209
+ const element = this.document.createElement(tagName.toLowerCase());
210
+
211
+
212
+ // Parse attributes
213
+ if (attributes.trim()) {
214
+ this.parseAttributes(attributes, element);
215
+ }
216
+
217
+ parent.appendChild(element);
218
+
219
+ if (isSelfClosing) {
220
+ position = tagEnd + 1;
221
+ } else {
222
+ // Find matching closing tag and parse content
223
+ const closingTag = `</${tagName}>`;
224
+ const closingTagIndex = this.findMatchingClosingTag(html, tagEnd + 1, tagName);
225
+
226
+ if (closingTagIndex !== -1) {
227
+ const innerContent = html.substring(tagEnd + 1, closingTagIndex);
228
+ if (innerContent.trim()) {
229
+ this.parseHTMLRecursive(innerContent, element);
230
+ }
231
+ position = closingTagIndex + closingTag.length;
232
+ } else {
233
+ // No matching closing tag found, treat as self-closing
234
+ position = tagEnd + 1;
235
+ }
236
+ }
237
+ }
238
+ }
98
239
 
99
- // Parse content directly into the template's content fragment
100
- if (content.trim()) {
101
- // Use internal method to avoid appendChild side effects
102
- this.parseHTMLString(content.trim(), element.content);
103
- }
240
+ private findMatchingClosingTag(html: string, startPos: number, tagName: string): number {
241
+ const openTag = `<${tagName}`;
242
+ const closeTag = `</${tagName}>`;
243
+ let depth = 1;
244
+ let pos = startPos;
104
245
 
105
- parent.appendChild(element);
106
- return ''; // Remove from HTML string
107
- });
246
+ while (pos < html.length && depth > 0) {
247
+ const nextOpen = html.indexOf(openTag, pos);
248
+ const nextClose = html.indexOf(closeTag, pos);
108
249
 
109
- // Improved HTML parsing with proper nesting support
110
- this.parseHTMLRecursive(html, parent);
250
+ if (nextClose === -1) {
251
+ // No more closing tags
252
+ return -1;
253
+ }
254
+
255
+ if (nextOpen !== -1 && nextOpen < nextClose) {
256
+ // Found another opening tag before the closing tag
257
+ depth++;
258
+ pos = nextOpen + openTag.length;
259
+ } else {
260
+ // Found a closing tag
261
+ depth--;
262
+ if (depth === 0) {
263
+ return nextClose;
264
+ }
265
+ pos = nextClose + closeTag.length;
266
+ }
111
267
  }
112
268
 
113
- private parseHTMLRecursive(html: string, parent: any): void {
114
- let position = 0;
115
-
116
- while (position < html.length) {
117
- // Find next tag or comment
118
- const tagStart = html.indexOf('<', position);
119
-
120
- if (tagStart === -1) {
121
- // No more tags, add remaining text if any
122
- const remainingText = html.substring(position).trim();
123
- if (remainingText) {
124
- const textNode = new TextBase(remainingText, this.document);
125
- parent.appendChild(textNode);
126
- }
127
- break;
128
- }
269
+ return -1;
270
+ }
129
271
 
130
- // Add text before tag if any
131
- if (tagStart > position) {
132
- const textContent = html.substring(position, tagStart).trim();
133
- if (textContent) {
134
- const textNode = new TextBase(textContent, this.document);
135
- parent.appendChild(textNode);
136
- }
137
- }
272
+ private parseAttributes(attributeString: string, element: any): void {
273
+ // Improved attribute parsing that handles complex JavaScript expressions
274
+ let position = 0;
275
+ const length = attributeString.length;
138
276
 
139
- // Check if this is a comment
140
- if (html.substring(tagStart, tagStart + 4) === '<!--') {
141
- const commentEnd = html.indexOf('-->', tagStart + 4);
142
- if (commentEnd !== -1) {
143
- const commentContent = html.substring(tagStart + 4, commentEnd);
144
- const commentNode = new Comment(commentContent, this.document);
145
- parent.appendChild(commentNode);
146
- position = commentEnd + 3;
147
- continue;
148
- } else {
149
- // Malformed comment, treat as text
150
- const textNode = new TextBase(html.substring(tagStart, tagStart + 4), this.document);
151
- parent.appendChild(textNode);
152
- position = tagStart + 4;
153
- continue;
154
- }
155
- }
277
+ while (position < length) {
278
+ // Skip whitespace
279
+ while (position < length && /\s/.test(attributeString[position])) {
280
+ position++;
281
+ }
156
282
 
157
- // Find tag end
158
- const tagEnd = html.indexOf('>', tagStart);
159
- if (tagEnd === -1) break;
283
+ if (position >= length) break;
160
284
 
161
- const tagContent = html.substring(tagStart + 1, tagEnd);
285
+ // Find attribute name
286
+ const nameStart = position;
287
+ while (position < length && /[\w:-]/.test(attributeString[position])) {
288
+ position++;
289
+ }
162
290
 
163
- // Check if it's a closing tag
164
- if (tagContent.startsWith('/')) {
165
- // This is a closing tag, we should return to parent
166
- position = tagEnd + 1;
167
- break;
168
- }
291
+ if (position === nameStart) {
292
+ // Invalid character, skip it
293
+ position++;
294
+ continue;
295
+ }
169
296
 
170
- // Check if it's a self-closing tag
171
- const isSelfClosing = tagContent.endsWith('/') || this.isSelfClosingTag(tagContent.split(/\s+/)[0]);
172
-
173
- // Parse tag name and attributes
174
- const spaceIndex = tagContent.indexOf(' ');
175
- const tagName = spaceIndex === -1 ? tagContent.replace('/', '') : tagContent.substring(0, spaceIndex);
176
- let attributes = spaceIndex === -1 ? '' : tagContent.substring(spaceIndex + 1);
177
-
178
- // Only remove trailing slash for self-closing tags (not from attribute values)
179
- if (attributes.endsWith('/')) {
180
- attributes = attributes.slice(0, -1).trim();
181
- }
297
+ const name = attributeString.substring(nameStart, position);
182
298
 
299
+ // Skip whitespace
300
+ while (position < length && /\s/.test(attributeString[position])) {
301
+ position++;
302
+ }
183
303
 
304
+ let value = '';
184
305
 
185
- // Create element
186
- const element = this.document.createElement(tagName.toLowerCase());
306
+ // Check if there's an equals sign
307
+ if (position < length && attributeString[position] === '=') {
308
+ position++; // Skip '='
187
309
 
310
+ // Skip whitespace
311
+ while (position < length && /\s/.test(attributeString[position])) {
312
+ position++;
313
+ }
188
314
 
315
+ if (position < length) {
316
+ const quote = attributeString[position];
189
317
 
190
- // Parse attributes
191
- if (attributes.trim()) {
192
- this.parseAttributes(attributes, element);
193
- }
318
+ if (quote === '"' || quote === "'") {
319
+ // Quoted value - find matching closing quote
320
+ position++; // Skip opening quote
321
+ const valueStart = position;
194
322
 
195
- parent.appendChild(element);
196
-
197
- if (isSelfClosing) {
198
- position = tagEnd + 1;
199
- } else {
200
- // Find matching closing tag and parse content
201
- const closingTag = `</${tagName}>`;
202
- const closingTagIndex = this.findMatchingClosingTag(html, tagEnd + 1, tagName);
203
-
204
- if (closingTagIndex !== -1) {
205
- const innerContent = html.substring(tagEnd + 1, closingTagIndex);
206
- if (innerContent.trim()) {
207
- this.parseHTMLRecursive(innerContent, element);
208
- }
209
- position = closingTagIndex + closingTag.length;
210
- } else {
211
- // No matching closing tag found, treat as self-closing
212
- position = tagEnd + 1;
213
- }
323
+ while (position < length && attributeString[position] !== quote) {
324
+ position++;
214
325
  }
215
- }
216
- }
217
326
 
218
- private findMatchingClosingTag(html: string, startPos: number, tagName: string): number {
219
- const openTag = `<${tagName}`;
220
- const closeTag = `</${tagName}>`;
221
- let depth = 1;
222
- let pos = startPos;
327
+ value = attributeString.substring(valueStart, position);
223
328
 
224
- while (pos < html.length && depth > 0) {
225
- const nextOpen = html.indexOf(openTag, pos);
226
- const nextClose = html.indexOf(closeTag, pos);
227
-
228
- if (nextClose === -1) {
229
- // No more closing tags
230
- return -1;
329
+ if (position < length && attributeString[position] === quote) {
330
+ position++; // Skip closing quote
231
331
  }
232
-
233
- if (nextOpen !== -1 && nextOpen < nextClose) {
234
- // Found another opening tag before the closing tag
235
- depth++;
236
- pos = nextOpen + openTag.length;
237
- } else {
238
- // Found a closing tag
239
- depth--;
240
- if (depth === 0) {
241
- return nextClose;
242
- }
243
- pos = nextClose + closeTag.length;
332
+ } else {
333
+ // Unquoted value - read until whitespace or end
334
+ const valueStart = position;
335
+ while (position < length && !/\s/.test(attributeString[position])) {
336
+ position++;
244
337
  }
338
+ value = attributeString.substring(valueStart, position);
339
+ }
245
340
  }
341
+ }
246
342
 
247
- return -1;
248
- }
249
-
250
- private parseAttributes(attributeString: string, element: any): void {
251
- // Improved attribute parsing that handles complex JavaScript expressions
252
- let position = 0;
253
- const length = attributeString.length;
254
-
255
- while (position < length) {
256
- // Skip whitespace
257
- while (position < length && /\s/.test(attributeString[position])) {
258
- position++;
259
- }
260
-
261
- if (position >= length) break;
343
+ // Decode HTML entities in attribute values
344
+ value = this.decodeHTMLEntities(value);
262
345
 
263
- // Find attribute name
264
- const nameStart = position;
265
- while (position < length && /[\w:-]/.test(attributeString[position])) {
266
- position++;
267
- }
346
+ element.setAttribute(name, value);
347
+ }
348
+ }
349
+
350
+ /**
351
+ * Decode HTML entities in a string
352
+ */
353
+ private decodeHTMLEntities(str: string): string {
354
+ const entityMap: { [key: string]: string } = {
355
+ '&amp;': '&',
356
+ '&lt;': '<',
357
+ '&gt;': '>',
358
+ '&quot;': '"',
359
+ '&#39;': "'",
360
+ '&#34;': '"',
361
+ '&apos;': "'",
362
+ '&copy;': '©',
363
+ '&reg;': '®',
364
+ '&trade;': '™',
365
+ '&nbsp;': ' ',
366
+ '&hellip;': '…',
367
+ '&mdash;': '—',
368
+ '&ndash;': '–',
369
+ '&lsquo;': '\u2018',
370
+ '&rsquo;': '\u2019',
371
+ '&ldquo;': '"',
372
+ '&rdquo;': '"'
373
+ };
374
+
375
+ return str.replace(/&[a-zA-Z0-9#]+;/g, (entity) => {
376
+ // Handle named entities
377
+ if (entityMap[entity]) {
378
+ return entityMap[entity];
379
+ }
380
+
381
+ // Handle numeric entities like &#39; &#34;
382
+ if (entity.startsWith('&#') && entity.endsWith(';')) {
383
+ const numStr = entity.slice(2, -1);
384
+ const num = parseInt(numStr, 10);
385
+ if (!isNaN(num)) {
386
+ return String.fromCharCode(num);
387
+ }
388
+ }
389
+
390
+ // Handle hex entities like &#x27;
391
+ if (entity.startsWith('&#x') && entity.endsWith(';')) {
392
+ const hexStr = entity.slice(3, -1);
393
+ const num = parseInt(hexStr, 16);
394
+ if (!isNaN(num)) {
395
+ return String.fromCharCode(num);
396
+ }
397
+ }
398
+
399
+ // Return original if not recognized
400
+ return entity;
401
+ });
402
+ }
403
+
404
+ private isSelfClosingTag(tagName: string): boolean {
405
+ const selfClosingTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
406
+ return selfClosingTags.includes(tagName.toLowerCase());
407
+ }
408
+
409
+ /**
410
+ * Set up document references after HTML parsing
411
+ */
412
+ private setupDocumentReferences(): void {
413
+ // Find HTML, HEAD, and BODY elements
414
+ const allHtmlElements = this.document.querySelectorAll('html');
415
+ const allHeadElements = this.document.querySelectorAll('head');
416
+ const allBodyElements = this.document.querySelectorAll('body');
417
+
418
+ // Choose the HTML element with content, then attributes, then first one
419
+ let htmlElement = null;
420
+ // First priority: HTML with attributes (lang, data-theme 등)
421
+ for (let i = 0; i < allHtmlElements.length; i++) {
422
+ const html = allHtmlElements[i];
423
+ if (html.attributes.length > 0) {
424
+ htmlElement = html;
425
+ break;
426
+ }
427
+ }
428
+ // Second priority: HTML with child nodes (content)
429
+ if (!htmlElement) {
430
+ for (let i = 0; i < allHtmlElements.length; i++) {
431
+ const html = allHtmlElements[i];
432
+ if (html.childNodes.length > 0) {
433
+ htmlElement = html;
434
+ break;
435
+ }
436
+ }
437
+ }
438
+ // Last resort: first HTML
439
+ if (!htmlElement && allHtmlElements.length > 0) {
440
+ htmlElement = allHtmlElements[0];
441
+ }
268
442
 
269
- if (position === nameStart) {
270
- // Invalid character, skip it
271
- position++;
272
- continue;
273
- }
443
+ // Choose the HEAD element with content, then attributes, then first one
444
+ let headElement = null;
445
+ // First priority: HEAD with child nodes (content)
446
+ for (let i = 0; i < allHeadElements.length; i++) {
447
+ const head = allHeadElements[i];
448
+ if (head.childNodes.length > 0) {
449
+ headElement = head;
450
+ break;
451
+ }
452
+ }
453
+ // Second priority: HEAD with attributes
454
+ if (!headElement) {
455
+ for (let i = 0; i < allHeadElements.length; i++) {
456
+ const head = allHeadElements[i];
457
+ if (head.attributes.length > 0) {
458
+ headElement = head;
459
+ break;
460
+ }
461
+ }
462
+ }
463
+ // Last resort: first HEAD
464
+ if (!headElement && allHeadElements.length > 0) {
465
+ headElement = allHeadElements[0];
466
+ }
274
467
 
275
- const name = attributeString.substring(nameStart, position);
468
+ // Choose the BODY element with content, then attributes, then first one
469
+ let bodyElement = null;
276
470
 
277
- // Skip whitespace
278
- while (position < length && /\s/.test(attributeString[position])) {
279
- position++;
280
- }
281
471
 
282
- let value = '';
283
-
284
- // Check if there's an equals sign
285
- if (position < length && attributeString[position] === '=') {
286
- position++; // Skip '='
287
-
288
- // Skip whitespace
289
- while (position < length && /\s/.test(attributeString[position])) {
290
- position++;
291
- }
292
-
293
- if (position < length) {
294
- const quote = attributeString[position];
295
-
296
- if (quote === '"' || quote === "'") {
297
- // Quoted value - find matching closing quote
298
- position++; // Skip opening quote
299
- const valueStart = position;
300
-
301
- while (position < length && attributeString[position] !== quote) {
302
- position++;
303
- }
304
-
305
- value = attributeString.substring(valueStart, position);
306
-
307
- if (position < length && attributeString[position] === quote) {
308
- position++; // Skip closing quote
309
- }
310
- } else {
311
- // Unquoted value - read until whitespace or end
312
- const valueStart = position;
313
- while (position < length && !/\s/.test(attributeString[position])) {
314
- position++;
315
- }
316
- value = attributeString.substring(valueStart, position);
317
- }
318
- }
319
- }
472
+ // First priority: BODY with child nodes (content)
473
+ for (let i = 0; i < allBodyElements.length; i++) {
474
+ const body = allBodyElements[i];
475
+ if (body.childNodes.length > 0) {
476
+ bodyElement = body;
477
+ break;
478
+ }
479
+ }
320
480
 
321
- // Decode HTML entities in attribute values
322
- value = this.decodeHTMLEntities(value);
323
-
324
- element.setAttribute(name, value);
481
+ // Second priority: BODY with attributes
482
+ if (!bodyElement) {
483
+ for (let i = 0; i < allBodyElements.length; i++) {
484
+ const body = allBodyElements[i];
485
+ if (body.attributes.length > 0) {
486
+ bodyElement = body;
487
+ break;
325
488
  }
489
+ }
326
490
  }
327
491
 
328
- /**
329
- * Decode HTML entities in a string
330
- */
331
- private decodeHTMLEntities(str: string): string {
332
- const entityMap: { [key: string]: string } = {
333
- '&amp;': '&',
334
- '&lt;': '<',
335
- '&gt;': '>',
336
- '&quot;': '"',
337
- '&#39;': "'",
338
- '&#34;': '"',
339
- '&apos;': "'",
340
- '&copy;': '©',
341
- '&reg;': '®',
342
- '&trade;': '™',
343
- '&nbsp;': ' ',
344
- '&hellip;': '…',
345
- '&mdash;': '—',
346
- '&ndash;': '–',
347
- '&lsquo;': '\u2018',
348
- '&rsquo;': '\u2019',
349
- '&ldquo;': '"',
350
- '&rdquo;': '"'
351
- };
352
-
353
- return str.replace(/&[a-zA-Z0-9#]+;/g, (entity) => {
354
- // Handle named entities
355
- if (entityMap[entity]) {
356
- return entityMap[entity];
357
- }
358
-
359
- // Handle numeric entities like &#39; &#34;
360
- if (entity.startsWith('&#') && entity.endsWith(';')) {
361
- const numStr = entity.slice(2, -1);
362
- const num = parseInt(numStr, 10);
363
- if (!isNaN(num)) {
364
- return String.fromCharCode(num);
365
- }
366
- }
367
-
368
- // Handle hex entities like &#x27;
369
- if (entity.startsWith('&#x') && entity.endsWith(';')) {
370
- const hexStr = entity.slice(3, -1);
371
- const num = parseInt(hexStr, 16);
372
- if (!isNaN(num)) {
373
- return String.fromCharCode(num);
374
- }
375
- }
376
-
377
- // Return original if not recognized
378
- return entity;
379
- });
492
+ // Last resort: first BODY
493
+ if (!bodyElement && allBodyElements.length > 0) {
494
+ bodyElement = allBodyElements[0];
380
495
  }
381
496
 
382
- private isSelfClosingTag(tagName: string): boolean {
383
- const selfClosingTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
384
- return selfClosingTags.includes(tagName.toLowerCase());
385
- }
386
497
 
387
- /**
388
- * Set up document references after HTML parsing
389
- */
390
- private setupDocumentReferences(): void {
391
- // Find HTML, HEAD, and BODY elements
392
- const allHtmlElements = this.document.querySelectorAll('html');
393
- const allHeadElements = this.document.querySelectorAll('head');
394
- const allBodyElements = this.document.querySelectorAll('body');
395
-
396
- // Choose the HTML element with content, then attributes, then first one
397
- let htmlElement = null;
398
- // First priority: HTML with attributes (lang, data-theme 등)
399
- for (let i = 0; i < allHtmlElements.length; i++) {
400
- const html = allHtmlElements[i];
401
- if (html.attributes.length > 0) {
402
- htmlElement = html;
403
- break;
404
- }
405
- }
406
- // Second priority: HTML with child nodes (content)
407
- if (!htmlElement) {
408
- for (let i = 0; i < allHtmlElements.length; i++) {
409
- const html = allHtmlElements[i];
410
- if (html.childNodes.length > 0) {
411
- htmlElement = html;
412
- break;
413
- }
414
- }
415
- }
416
- // Last resort: first HTML
417
- if (!htmlElement && allHtmlElements.length > 0) {
418
- htmlElement = allHtmlElements[0];
419
- }
420
-
421
- // Choose the HEAD element with content, then attributes, then first one
422
- let headElement = null;
423
- // First priority: HEAD with child nodes (content)
424
- for (let i = 0; i < allHeadElements.length; i++) {
425
- const head = allHeadElements[i];
426
- if (head.childNodes.length > 0) {
427
- headElement = head;
428
- break;
429
- }
430
- }
431
- // Second priority: HEAD with attributes
432
- if (!headElement) {
433
- for (let i = 0; i < allHeadElements.length; i++) {
434
- const head = allHeadElements[i];
435
- if (head.attributes.length > 0) {
436
- headElement = head;
437
- break;
438
- }
439
- }
440
- }
441
- // Last resort: first HEAD
442
- if (!headElement && allHeadElements.length > 0) {
443
- headElement = allHeadElements[0];
444
- }
445
-
446
- // Choose the BODY element with content, then attributes, then first one
447
- let bodyElement = null;
448
-
449
-
450
- // First priority: BODY with child nodes (content)
451
- for (let i = 0; i < allBodyElements.length; i++) {
452
- const body = allBodyElements[i];
453
- if (body.childNodes.length > 0) {
454
- bodyElement = body;
455
- break;
456
- }
457
- }
458
-
459
- // Second priority: BODY with attributes
460
- if (!bodyElement) {
461
- for (let i = 0; i < allBodyElements.length; i++) {
462
- const body = allBodyElements[i];
463
- if (body.attributes.length > 0) {
464
- bodyElement = body;
465
- break;
466
- }
467
- }
468
- }
469
-
470
- // Last resort: first BODY
471
- if (!bodyElement && allBodyElements.length > 0) {
472
- bodyElement = allBodyElements[0];
473
- }
474
-
475
-
476
-
477
- // For now, just use the elements as they are parsed
478
- // TODO: Implement proper DOM structure reorganization later
479
-
480
- // Set document references
481
- if (htmlElement) {
482
- (this.document as any).documentElement = htmlElement;
483
- }
484
- if (headElement) {
485
- (this.document as any).head = headElement;
486
- }
487
- if (bodyElement) {
488
- (this.document as any).body = bodyElement;
489
- }
498
+ // For now, just use the elements as they are parsed
499
+ // TODO: Implement proper DOM structure reorganization later
500
+
501
+ // Set document references
502
+ if (htmlElement) {
503
+ (this.document as any).documentElement = htmlElement;
504
+ }
505
+ if (headElement) {
506
+ (this.document as any).head = headElement;
507
+ }
508
+ if (bodyElement) {
509
+ (this.document as any).body = bodyElement;
490
510
  }
511
+ }
491
512
  }