@dooboostore/dom-parser 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/DomParser.js +33 -12
- package/dist/cjs/DomParser.js.map +2 -2
- package/dist/cjs/node/DocumentBase.js +4 -0
- package/dist/cjs/node/DocumentBase.js.map +2 -2
- package/dist/cjs/node/elements/Element.js.map +1 -1
- package/dist/cjs/node/elements/ElementBase.js +12 -2
- package/dist/cjs/node/elements/ElementBase.js.map +2 -2
- package/dist/cjs/node/elements/HTMLElement.js.map +1 -1
- package/dist/cjs/node/elements/HTMLElementBase.js +154 -2
- package/dist/cjs/node/elements/HTMLElementBase.js.map +2 -2
- package/dist/cjs/window/WindowBase.js +128 -7
- package/dist/cjs/window/WindowBase.js.map +2 -2
- package/dist/esm/DomParser.js +33 -12
- package/dist/esm/DomParser.js.map +2 -2
- package/dist/esm/node/DocumentBase.js +4 -0
- package/dist/esm/node/DocumentBase.js.map +2 -2
- package/dist/esm/node/elements/ElementBase.js +12 -2
- package/dist/esm/node/elements/ElementBase.js.map +2 -2
- package/dist/esm/node/elements/HTMLElementBase.js +154 -2
- package/dist/esm/node/elements/HTMLElementBase.js.map +2 -2
- package/dist/esm/window/WindowBase.js +128 -7
- package/dist/esm/window/WindowBase.js.map +2 -2
- package/dist/esm-bundle/dooboostore-dom-parser.esm.js +504 -195
- package/dist/esm-bundle/dooboostore-dom-parser.esm.js.map +3 -3
- package/dist/types/DomParser.d.ts +4 -0
- package/dist/types/DomParser.d.ts.map +1 -1
- package/dist/types/node/DocumentBase.d.ts +2 -0
- package/dist/types/node/DocumentBase.d.ts.map +1 -1
- package/dist/types/node/elements/Element.d.ts +1 -0
- package/dist/types/node/elements/Element.d.ts.map +1 -1
- package/dist/types/node/elements/ElementBase.d.ts +2 -2
- package/dist/types/node/elements/ElementBase.d.ts.map +1 -1
- package/dist/types/node/elements/HTMLElement.d.ts +32 -1
- package/dist/types/node/elements/HTMLElement.d.ts.map +1 -1
- package/dist/types/node/elements/HTMLElementBase.d.ts +11 -2
- package/dist/types/node/elements/HTMLElementBase.d.ts.map +1 -1
- package/dist/types/window/WindowBase.d.ts +12 -2
- package/dist/types/window/WindowBase.d.ts.map +1 -1
- package/dist/umd-bundle/dooboostore-dom-parser.umd.js +504 -195
- package/dist/umd-bundle/dooboostore-dom-parser.umd.js.map +3 -3
- package/package.json +1 -1
- package/src/DomParser.ts +457 -436
- package/src/node/DocumentBase.ts +7 -2
- package/src/node/elements/Element.ts +24 -23
- package/src/node/elements/ElementBase.ts +50 -41
- package/src/node/elements/HTMLElement.ts +36 -1
- package/src/node/elements/HTMLElementBase.ts +191 -5
- package/src/window/WindowBase.ts +1128 -919
package/src/DomParser.ts
CHANGED
|
@@ -1,491 +1,512 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
1
|
+
import {DocumentBase} from './node/DocumentBase';
|
|
2
|
+
import {TextBase} from './node/TextBase';
|
|
3
|
+
import {Comment} from './node/Comment';
|
|
4
|
+
import {WindowBase} from './window/WindowBase';
|
|
5
5
|
|
|
6
6
|
export interface DomParserOptions {
|
|
7
|
-
|
|
7
|
+
href?: string;
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
export class DomParser {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
// Simulate document loading process
|
|
31
|
-
if (document && (document as any).simulateLoading) {
|
|
32
|
-
(document as any).simulateLoading();
|
|
33
|
-
}
|
|
11
|
+
private _window: Window | null;
|
|
12
|
+
private _document: Document | null;
|
|
13
|
+
|
|
14
|
+
constructor(html: string, option?: DomParserOptions) {
|
|
15
|
+
// Create WindowBase instance with the document
|
|
16
|
+
const windowBase = new WindowBase({initialUrl: option?.href});
|
|
17
|
+
this._window = windowBase as unknown as Window;
|
|
18
|
+
this._document = windowBase.document as any;
|
|
19
|
+
|
|
20
|
+
// Parse the provided HTML string
|
|
21
|
+
this.parseHTML(html);
|
|
22
|
+
|
|
23
|
+
// Set up document references after parsing
|
|
24
|
+
this.setupDocumentReferences();
|
|
25
|
+
|
|
26
|
+
// Simulate document loading process
|
|
27
|
+
if (this._document && (this._document as any).simulateLoading) {
|
|
28
|
+
(this._document as any).simulateLoading();
|
|
34
29
|
}
|
|
30
|
+
}
|
|
35
31
|
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
get window(): Window {
|
|
33
|
+
if (!this._window) {
|
|
34
|
+
throw new Error('DomParser has been destroyed');
|
|
38
35
|
}
|
|
36
|
+
return this._window;
|
|
37
|
+
}
|
|
39
38
|
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
get document(): Document {
|
|
40
|
+
if (!this._document) {
|
|
41
|
+
throw new Error('DomParser has been destroyed');
|
|
42
42
|
}
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
// Set up document references after parsing
|
|
54
|
-
this.setupDocumentReferences();
|
|
43
|
+
return this._document;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Destroy the DomParser instance and free memory
|
|
48
|
+
*/
|
|
49
|
+
destroy(): void {
|
|
50
|
+
if (this._window) {
|
|
51
|
+
this._window.close();
|
|
52
|
+
this._window = null;
|
|
55
53
|
}
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
while (this.document.body.firstChild) {
|
|
67
|
-
this.document.body.removeChild(this.document.body.firstChild);
|
|
68
|
-
}
|
|
69
|
-
}
|
|
54
|
+
|
|
55
|
+
this._document = null;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Load new HTML content and replace the current document
|
|
60
|
+
*/
|
|
61
|
+
loadHTML(html: string): void {
|
|
62
|
+
if (!this._document) {
|
|
63
|
+
throw new Error('DomParser has been destroyed');
|
|
70
64
|
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
65
|
+
|
|
66
|
+
// Clear current document content
|
|
67
|
+
this.clearDocument();
|
|
68
|
+
|
|
69
|
+
// Parse new HTML
|
|
70
|
+
this.parseHTML(html);
|
|
71
|
+
|
|
72
|
+
// Set up document references after parsing
|
|
73
|
+
this.setupDocumentReferences();
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
private clearDocument(): void {
|
|
77
|
+
if (!this._document) return;
|
|
78
|
+
|
|
79
|
+
// Clear document body and head content while preserving structure
|
|
80
|
+
if (this._document.head) {
|
|
81
|
+
while (this._document.head.firstChild) {
|
|
82
|
+
this._document.head.removeChild(this._document.head.firstChild);
|
|
83
|
+
}
|
|
81
84
|
}
|
|
82
85
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
86
|
+
if (this._document.body) {
|
|
87
|
+
while (this._document.body.firstChild) {
|
|
88
|
+
this._document.body.removeChild(this._document.body.firstChild);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
88
92
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
+
parseHTML(html: string): void {
|
|
94
|
+
if (!this._document) {
|
|
95
|
+
throw new Error('DomParser has been destroyed');
|
|
96
|
+
}
|
|
97
|
+
// Simple HTML parsing implementation
|
|
98
|
+
if (!html.trim()) {
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
93
101
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
102
|
+
// Basic HTML parsing - this is a simplified version
|
|
103
|
+
// In a real implementation, you'd use a proper HTML parser
|
|
104
|
+
this.parseHTMLString(html, this.document);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
private parseHTMLString(html: string, parent: any): void {
|
|
108
|
+
// Remove DOCTYPE if present
|
|
109
|
+
html = html.replace(/<!DOCTYPE[^>]*>/i, '').trim();
|
|
110
|
+
|
|
111
|
+
if (!html) return;
|
|
112
|
+
|
|
113
|
+
// Handle template tags specially
|
|
114
|
+
const templateRegex = /<template([^>]*)>(.*?)<\/template>/gs;
|
|
115
|
+
html = html.replace(templateRegex, (match, attributes, content) => {
|
|
116
|
+
const element = this.document.createElement('template');
|
|
117
|
+
|
|
118
|
+
// Parse attributes
|
|
119
|
+
if (attributes.trim()) {
|
|
120
|
+
this.parseAttributes(attributes, element);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Parse content directly into the template's content fragment
|
|
124
|
+
if (content.trim()) {
|
|
125
|
+
// Use internal method to avoid appendChild side effects
|
|
126
|
+
this.parseHTMLString(content.trim(), element.content);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
parent.appendChild(element);
|
|
130
|
+
return ''; // Remove from HTML string
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
// Improved HTML parsing with proper nesting support
|
|
134
|
+
this.parseHTMLRecursive(html, parent);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
private parseHTMLRecursive(html: string, parent: any): void {
|
|
138
|
+
let position = 0;
|
|
139
|
+
|
|
140
|
+
while (position < html.length) {
|
|
141
|
+
// Find next tag or comment
|
|
142
|
+
const tagStart = html.indexOf('<', position);
|
|
143
|
+
|
|
144
|
+
if (tagStart === -1) {
|
|
145
|
+
// No more tags, add remaining text if any
|
|
146
|
+
const remainingText = html.substring(position).trim();
|
|
147
|
+
if (remainingText) {
|
|
148
|
+
const textNode = new TextBase(remainingText, this.document);
|
|
149
|
+
parent.appendChild(textNode);
|
|
150
|
+
}
|
|
151
|
+
break;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Add text before tag if any
|
|
155
|
+
if (tagStart > position) {
|
|
156
|
+
const textContent = html.substring(position, tagStart).trim();
|
|
157
|
+
if (textContent) {
|
|
158
|
+
const textNode = new TextBase(textContent, this.document);
|
|
159
|
+
parent.appendChild(textNode);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Check if this is a comment
|
|
164
|
+
if (html.substring(tagStart, tagStart + 4) === '<!--') {
|
|
165
|
+
const commentEnd = html.indexOf('-->', tagStart + 4);
|
|
166
|
+
if (commentEnd !== -1) {
|
|
167
|
+
const commentContent = html.substring(tagStart + 4, commentEnd);
|
|
168
|
+
const commentNode = new Comment(commentContent, this.document);
|
|
169
|
+
parent.appendChild(commentNode);
|
|
170
|
+
position = commentEnd + 3;
|
|
171
|
+
continue;
|
|
172
|
+
} else {
|
|
173
|
+
// Malformed comment, treat as text
|
|
174
|
+
const textNode = new TextBase(html.substring(tagStart, tagStart + 4), this.document);
|
|
175
|
+
parent.appendChild(textNode);
|
|
176
|
+
position = tagStart + 4;
|
|
177
|
+
continue;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// Find tag end
|
|
182
|
+
const tagEnd = html.indexOf('>', tagStart);
|
|
183
|
+
if (tagEnd === -1) break;
|
|
184
|
+
|
|
185
|
+
const tagContent = html.substring(tagStart + 1, tagEnd);
|
|
186
|
+
|
|
187
|
+
// Check if it's a closing tag
|
|
188
|
+
if (tagContent.startsWith('/')) {
|
|
189
|
+
// This is a closing tag, we should return to parent
|
|
190
|
+
position = tagEnd + 1;
|
|
191
|
+
break;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Check if it's a self-closing tag
|
|
195
|
+
const isSelfClosing = tagContent.endsWith('/') || this.isSelfClosingTag(tagContent.split(/\s+/)[0]);
|
|
196
|
+
|
|
197
|
+
// Parse tag name and attributes
|
|
198
|
+
const spaceIndex = tagContent.indexOf(' ');
|
|
199
|
+
const tagName = spaceIndex === -1 ? tagContent.replace('/', '') : tagContent.substring(0, spaceIndex);
|
|
200
|
+
let attributes = spaceIndex === -1 ? '' : tagContent.substring(spaceIndex + 1);
|
|
201
|
+
|
|
202
|
+
// Only remove trailing slash for self-closing tags (not from attribute values)
|
|
203
|
+
if (attributes.endsWith('/')) {
|
|
204
|
+
attributes = attributes.slice(0, -1).trim();
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
// Create element
|
|
209
|
+
const element = this.document.createElement(tagName.toLowerCase());
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
// Parse attributes
|
|
213
|
+
if (attributes.trim()) {
|
|
214
|
+
this.parseAttributes(attributes, element);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
parent.appendChild(element);
|
|
218
|
+
|
|
219
|
+
if (isSelfClosing) {
|
|
220
|
+
position = tagEnd + 1;
|
|
221
|
+
} else {
|
|
222
|
+
// Find matching closing tag and parse content
|
|
223
|
+
const closingTag = `</${tagName}>`;
|
|
224
|
+
const closingTagIndex = this.findMatchingClosingTag(html, tagEnd + 1, tagName);
|
|
225
|
+
|
|
226
|
+
if (closingTagIndex !== -1) {
|
|
227
|
+
const innerContent = html.substring(tagEnd + 1, closingTagIndex);
|
|
228
|
+
if (innerContent.trim()) {
|
|
229
|
+
this.parseHTMLRecursive(innerContent, element);
|
|
230
|
+
}
|
|
231
|
+
position = closingTagIndex + closingTag.length;
|
|
232
|
+
} else {
|
|
233
|
+
// No matching closing tag found, treat as self-closing
|
|
234
|
+
position = tagEnd + 1;
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
98
239
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
240
|
+
private findMatchingClosingTag(html: string, startPos: number, tagName: string): number {
|
|
241
|
+
const openTag = `<${tagName}`;
|
|
242
|
+
const closeTag = `</${tagName}>`;
|
|
243
|
+
let depth = 1;
|
|
244
|
+
let pos = startPos;
|
|
104
245
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
246
|
+
while (pos < html.length && depth > 0) {
|
|
247
|
+
const nextOpen = html.indexOf(openTag, pos);
|
|
248
|
+
const nextClose = html.indexOf(closeTag, pos);
|
|
108
249
|
|
|
109
|
-
|
|
110
|
-
|
|
250
|
+
if (nextClose === -1) {
|
|
251
|
+
// No more closing tags
|
|
252
|
+
return -1;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
if (nextOpen !== -1 && nextOpen < nextClose) {
|
|
256
|
+
// Found another opening tag before the closing tag
|
|
257
|
+
depth++;
|
|
258
|
+
pos = nextOpen + openTag.length;
|
|
259
|
+
} else {
|
|
260
|
+
// Found a closing tag
|
|
261
|
+
depth--;
|
|
262
|
+
if (depth === 0) {
|
|
263
|
+
return nextClose;
|
|
264
|
+
}
|
|
265
|
+
pos = nextClose + closeTag.length;
|
|
266
|
+
}
|
|
111
267
|
}
|
|
112
268
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
while (position < html.length) {
|
|
117
|
-
// Find next tag or comment
|
|
118
|
-
const tagStart = html.indexOf('<', position);
|
|
119
|
-
|
|
120
|
-
if (tagStart === -1) {
|
|
121
|
-
// No more tags, add remaining text if any
|
|
122
|
-
const remainingText = html.substring(position).trim();
|
|
123
|
-
if (remainingText) {
|
|
124
|
-
const textNode = new TextBase(remainingText, this.document);
|
|
125
|
-
parent.appendChild(textNode);
|
|
126
|
-
}
|
|
127
|
-
break;
|
|
128
|
-
}
|
|
269
|
+
return -1;
|
|
270
|
+
}
|
|
129
271
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
const textNode = new TextBase(textContent, this.document);
|
|
135
|
-
parent.appendChild(textNode);
|
|
136
|
-
}
|
|
137
|
-
}
|
|
272
|
+
private parseAttributes(attributeString: string, element: any): void {
|
|
273
|
+
// Improved attribute parsing that handles complex JavaScript expressions
|
|
274
|
+
let position = 0;
|
|
275
|
+
const length = attributeString.length;
|
|
138
276
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
const commentNode = new Comment(commentContent, this.document);
|
|
145
|
-
parent.appendChild(commentNode);
|
|
146
|
-
position = commentEnd + 3;
|
|
147
|
-
continue;
|
|
148
|
-
} else {
|
|
149
|
-
// Malformed comment, treat as text
|
|
150
|
-
const textNode = new TextBase(html.substring(tagStart, tagStart + 4), this.document);
|
|
151
|
-
parent.appendChild(textNode);
|
|
152
|
-
position = tagStart + 4;
|
|
153
|
-
continue;
|
|
154
|
-
}
|
|
155
|
-
}
|
|
277
|
+
while (position < length) {
|
|
278
|
+
// Skip whitespace
|
|
279
|
+
while (position < length && /\s/.test(attributeString[position])) {
|
|
280
|
+
position++;
|
|
281
|
+
}
|
|
156
282
|
|
|
157
|
-
|
|
158
|
-
const tagEnd = html.indexOf('>', tagStart);
|
|
159
|
-
if (tagEnd === -1) break;
|
|
283
|
+
if (position >= length) break;
|
|
160
284
|
|
|
161
|
-
|
|
285
|
+
// Find attribute name
|
|
286
|
+
const nameStart = position;
|
|
287
|
+
while (position < length && /[\w:-]/.test(attributeString[position])) {
|
|
288
|
+
position++;
|
|
289
|
+
}
|
|
162
290
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
}
|
|
291
|
+
if (position === nameStart) {
|
|
292
|
+
// Invalid character, skip it
|
|
293
|
+
position++;
|
|
294
|
+
continue;
|
|
295
|
+
}
|
|
169
296
|
|
|
170
|
-
|
|
171
|
-
const isSelfClosing = tagContent.endsWith('/') || this.isSelfClosingTag(tagContent.split(/\s+/)[0]);
|
|
172
|
-
|
|
173
|
-
// Parse tag name and attributes
|
|
174
|
-
const spaceIndex = tagContent.indexOf(' ');
|
|
175
|
-
const tagName = spaceIndex === -1 ? tagContent.replace('/', '') : tagContent.substring(0, spaceIndex);
|
|
176
|
-
let attributes = spaceIndex === -1 ? '' : tagContent.substring(spaceIndex + 1);
|
|
177
|
-
|
|
178
|
-
// Only remove trailing slash for self-closing tags (not from attribute values)
|
|
179
|
-
if (attributes.endsWith('/')) {
|
|
180
|
-
attributes = attributes.slice(0, -1).trim();
|
|
181
|
-
}
|
|
297
|
+
const name = attributeString.substring(nameStart, position);
|
|
182
298
|
|
|
299
|
+
// Skip whitespace
|
|
300
|
+
while (position < length && /\s/.test(attributeString[position])) {
|
|
301
|
+
position++;
|
|
302
|
+
}
|
|
183
303
|
|
|
304
|
+
let value = '';
|
|
184
305
|
|
|
185
|
-
|
|
186
|
-
|
|
306
|
+
// Check if there's an equals sign
|
|
307
|
+
if (position < length && attributeString[position] === '=') {
|
|
308
|
+
position++; // Skip '='
|
|
187
309
|
|
|
310
|
+
// Skip whitespace
|
|
311
|
+
while (position < length && /\s/.test(attributeString[position])) {
|
|
312
|
+
position++;
|
|
313
|
+
}
|
|
188
314
|
|
|
315
|
+
if (position < length) {
|
|
316
|
+
const quote = attributeString[position];
|
|
189
317
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
318
|
+
if (quote === '"' || quote === "'") {
|
|
319
|
+
// Quoted value - find matching closing quote
|
|
320
|
+
position++; // Skip opening quote
|
|
321
|
+
const valueStart = position;
|
|
194
322
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
if (isSelfClosing) {
|
|
198
|
-
position = tagEnd + 1;
|
|
199
|
-
} else {
|
|
200
|
-
// Find matching closing tag and parse content
|
|
201
|
-
const closingTag = `</${tagName}>`;
|
|
202
|
-
const closingTagIndex = this.findMatchingClosingTag(html, tagEnd + 1, tagName);
|
|
203
|
-
|
|
204
|
-
if (closingTagIndex !== -1) {
|
|
205
|
-
const innerContent = html.substring(tagEnd + 1, closingTagIndex);
|
|
206
|
-
if (innerContent.trim()) {
|
|
207
|
-
this.parseHTMLRecursive(innerContent, element);
|
|
208
|
-
}
|
|
209
|
-
position = closingTagIndex + closingTag.length;
|
|
210
|
-
} else {
|
|
211
|
-
// No matching closing tag found, treat as self-closing
|
|
212
|
-
position = tagEnd + 1;
|
|
213
|
-
}
|
|
323
|
+
while (position < length && attributeString[position] !== quote) {
|
|
324
|
+
position++;
|
|
214
325
|
}
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
326
|
|
|
218
|
-
|
|
219
|
-
const openTag = `<${tagName}`;
|
|
220
|
-
const closeTag = `</${tagName}>`;
|
|
221
|
-
let depth = 1;
|
|
222
|
-
let pos = startPos;
|
|
327
|
+
value = attributeString.substring(valueStart, position);
|
|
223
328
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
const nextClose = html.indexOf(closeTag, pos);
|
|
227
|
-
|
|
228
|
-
if (nextClose === -1) {
|
|
229
|
-
// No more closing tags
|
|
230
|
-
return -1;
|
|
329
|
+
if (position < length && attributeString[position] === quote) {
|
|
330
|
+
position++; // Skip closing quote
|
|
231
331
|
}
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
} else {
|
|
238
|
-
// Found a closing tag
|
|
239
|
-
depth--;
|
|
240
|
-
if (depth === 0) {
|
|
241
|
-
return nextClose;
|
|
242
|
-
}
|
|
243
|
-
pos = nextClose + closeTag.length;
|
|
332
|
+
} else {
|
|
333
|
+
// Unquoted value - read until whitespace or end
|
|
334
|
+
const valueStart = position;
|
|
335
|
+
while (position < length && !/\s/.test(attributeString[position])) {
|
|
336
|
+
position++;
|
|
244
337
|
}
|
|
338
|
+
value = attributeString.substring(valueStart, position);
|
|
339
|
+
}
|
|
245
340
|
}
|
|
341
|
+
}
|
|
246
342
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
private parseAttributes(attributeString: string, element: any): void {
|
|
251
|
-
// Improved attribute parsing that handles complex JavaScript expressions
|
|
252
|
-
let position = 0;
|
|
253
|
-
const length = attributeString.length;
|
|
254
|
-
|
|
255
|
-
while (position < length) {
|
|
256
|
-
// Skip whitespace
|
|
257
|
-
while (position < length && /\s/.test(attributeString[position])) {
|
|
258
|
-
position++;
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
if (position >= length) break;
|
|
343
|
+
// Decode HTML entities in attribute values
|
|
344
|
+
value = this.decodeHTMLEntities(value);
|
|
262
345
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
346
|
+
element.setAttribute(name, value);
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
/**
|
|
351
|
+
* Decode HTML entities in a string
|
|
352
|
+
*/
|
|
353
|
+
private decodeHTMLEntities(str: string): string {
|
|
354
|
+
const entityMap: { [key: string]: string } = {
|
|
355
|
+
'&amp;': '&',
|
|
356
|
+
'&lt;': '<',
|
|
357
|
+
'&gt;': '>',
|
|
358
|
+
'&quot;': '"',
|
|
359
|
+
'&apos;': "'",
|
|
360
|
+
'&#34;': '"',
|
|
361
|
+
'&#39;': "'",
|
|
362
|
+
'&copy;': '©',
|
|
363
|
+
'&reg;': '®',
|
|
364
|
+
'&trade;': '™',
|
|
365
|
+
'&nbsp;': ' ',
|
|
366
|
+
'&hellip;': '…',
|
|
367
|
+
'&mdash;': '—',
|
|
368
|
+
'&ndash;': '–',
|
|
369
|
+
'&lsquo;': '\u2018',
|
|
370
|
+
'&rsquo;': '\u2019',
|
|
371
|
+
'&ldquo;': '"',
|
|
372
|
+
'&rdquo;': '"'
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
return str.replace(/&[a-zA-Z0-9#]+;/g, (entity) => {
|
|
376
|
+
// Handle named entities
|
|
377
|
+
if (entityMap[entity]) {
|
|
378
|
+
return entityMap[entity];
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// Handle numeric entities like &#39; &#34;
|
|
382
|
+
if (entity.startsWith('&#') && entity.endsWith(';')) {
|
|
383
|
+
const numStr = entity.slice(2, -1);
|
|
384
|
+
const num = parseInt(numStr, 10);
|
|
385
|
+
if (!isNaN(num)) {
|
|
386
|
+
return String.fromCharCode(num);
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
// Handle hex entities like &#x27;
|
|
391
|
+
if (entity.startsWith('&#x') && entity.endsWith(';')) {
|
|
392
|
+
const hexStr = entity.slice(3, -1);
|
|
393
|
+
const num = parseInt(hexStr, 16);
|
|
394
|
+
if (!isNaN(num)) {
|
|
395
|
+
return String.fromCharCode(num);
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
// Return original if not recognized
|
|
400
|
+
return entity;
|
|
401
|
+
});
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
private isSelfClosingTag(tagName: string): boolean {
|
|
405
|
+
const selfClosingTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
|
|
406
|
+
return selfClosingTags.includes(tagName.toLowerCase());
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
/**
|
|
410
|
+
* Set up document references after HTML parsing
|
|
411
|
+
*/
|
|
412
|
+
private setupDocumentReferences(): void {
|
|
413
|
+
// Find HTML, HEAD, and BODY elements
|
|
414
|
+
const allHtmlElements = this.document.querySelectorAll('html');
|
|
415
|
+
const allHeadElements = this.document.querySelectorAll('head');
|
|
416
|
+
const allBodyElements = this.document.querySelectorAll('body');
|
|
417
|
+
|
|
418
|
+
// Choose the HTML element with content, then attributes, then first one
|
|
419
|
+
let htmlElement = null;
|
|
420
|
+
// First priority: HTML with attributes (lang, data-theme 등)
|
|
421
|
+
for (let i = 0; i < allHtmlElements.length; i++) {
|
|
422
|
+
const html = allHtmlElements[i];
|
|
423
|
+
if (html.attributes.length > 0) {
|
|
424
|
+
htmlElement = html;
|
|
425
|
+
break;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
// Second priority: HTML with child nodes (content)
|
|
429
|
+
if (!htmlElement) {
|
|
430
|
+
for (let i = 0; i < allHtmlElements.length; i++) {
|
|
431
|
+
const html = allHtmlElements[i];
|
|
432
|
+
if (html.childNodes.length > 0) {
|
|
433
|
+
htmlElement = html;
|
|
434
|
+
break;
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
// Last resort: first HTML
|
|
439
|
+
if (!htmlElement && allHtmlElements.length > 0) {
|
|
440
|
+
htmlElement = allHtmlElements[0];
|
|
441
|
+
}
|
|
268
442
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
443
|
+
// Choose the HEAD element with content, then attributes, then first one
|
|
444
|
+
let headElement = null;
|
|
445
|
+
// First priority: HEAD with child nodes (content)
|
|
446
|
+
for (let i = 0; i < allHeadElements.length; i++) {
|
|
447
|
+
const head = allHeadElements[i];
|
|
448
|
+
if (head.childNodes.length > 0) {
|
|
449
|
+
headElement = head;
|
|
450
|
+
break;
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
// Second priority: HEAD with attributes
|
|
454
|
+
if (!headElement) {
|
|
455
|
+
for (let i = 0; i < allHeadElements.length; i++) {
|
|
456
|
+
const head = allHeadElements[i];
|
|
457
|
+
if (head.attributes.length > 0) {
|
|
458
|
+
headElement = head;
|
|
459
|
+
break;
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
// Last resort: first HEAD
|
|
464
|
+
if (!headElement && allHeadElements.length > 0) {
|
|
465
|
+
headElement = allHeadElements[0];
|
|
466
|
+
}
|
|
274
467
|
|
|
275
|
-
|
|
468
|
+
// Choose the BODY element with content, then attributes, then first one
|
|
469
|
+
let bodyElement = null;
|
|
276
470
|
|
|
277
|
-
// Skip whitespace
|
|
278
|
-
while (position < length && /\s/.test(attributeString[position])) {
|
|
279
|
-
position++;
|
|
280
|
-
}
|
|
281
471
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
position++;
|
|
291
|
-
}
|
|
292
|
-
|
|
293
|
-
if (position < length) {
|
|
294
|
-
const quote = attributeString[position];
|
|
295
|
-
|
|
296
|
-
if (quote === '"' || quote === "'") {
|
|
297
|
-
// Quoted value - find matching closing quote
|
|
298
|
-
position++; // Skip opening quote
|
|
299
|
-
const valueStart = position;
|
|
300
|
-
|
|
301
|
-
while (position < length && attributeString[position] !== quote) {
|
|
302
|
-
position++;
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
value = attributeString.substring(valueStart, position);
|
|
306
|
-
|
|
307
|
-
if (position < length && attributeString[position] === quote) {
|
|
308
|
-
position++; // Skip closing quote
|
|
309
|
-
}
|
|
310
|
-
} else {
|
|
311
|
-
// Unquoted value - read until whitespace or end
|
|
312
|
-
const valueStart = position;
|
|
313
|
-
while (position < length && !/\s/.test(attributeString[position])) {
|
|
314
|
-
position++;
|
|
315
|
-
}
|
|
316
|
-
value = attributeString.substring(valueStart, position);
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
}
|
|
472
|
+
// First priority: BODY with child nodes (content)
|
|
473
|
+
for (let i = 0; i < allBodyElements.length; i++) {
|
|
474
|
+
const body = allBodyElements[i];
|
|
475
|
+
if (body.childNodes.length > 0) {
|
|
476
|
+
bodyElement = body;
|
|
477
|
+
break;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
320
480
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
481
|
+
// Second priority: BODY with attributes
|
|
482
|
+
if (!bodyElement) {
|
|
483
|
+
for (let i = 0; i < allBodyElements.length; i++) {
|
|
484
|
+
const body = allBodyElements[i];
|
|
485
|
+
if (body.attributes.length > 0) {
|
|
486
|
+
bodyElement = body;
|
|
487
|
+
break;
|
|
325
488
|
}
|
|
489
|
+
}
|
|
326
490
|
}
|
|
327
491
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
private decodeHTMLEntities(str: string): string {
|
|
332
|
-
const entityMap: { [key: string]: string } = {
|
|
333
|
-
'&amp;': '&',
|
|
334
|
-
'&lt;': '<',
|
|
335
|
-
'&gt;': '>',
|
|
336
|
-
'&quot;': '"',
|
|
337
|
-
'&apos;': "'",
|
|
338
|
-
'&#34;': '"',
|
|
339
|
-
'&#39;': "'",
|
|
340
|
-
'&copy;': '©',
|
|
341
|
-
'&reg;': '®',
|
|
342
|
-
'&trade;': '™',
|
|
343
|
-
'&nbsp;': ' ',
|
|
344
|
-
'&hellip;': '…',
|
|
345
|
-
'&mdash;': '—',
|
|
346
|
-
'&ndash;': '–',
|
|
347
|
-
'&lsquo;': '\u2018',
|
|
348
|
-
'&rsquo;': '\u2019',
|
|
349
|
-
'&ldquo;': '"',
|
|
350
|
-
'&rdquo;': '"'
|
|
351
|
-
};
|
|
352
|
-
|
|
353
|
-
return str.replace(/&[a-zA-Z0-9#]+;/g, (entity) => {
|
|
354
|
-
// Handle named entities
|
|
355
|
-
if (entityMap[entity]) {
|
|
356
|
-
return entityMap[entity];
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
// Handle numeric entities like &#39; &#34;
|
|
360
|
-
if (entity.startsWith('&#') && entity.endsWith(';')) {
|
|
361
|
-
const numStr = entity.slice(2, -1);
|
|
362
|
-
const num = parseInt(numStr, 10);
|
|
363
|
-
if (!isNaN(num)) {
|
|
364
|
-
return String.fromCharCode(num);
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
// Handle hex entities like &#x27;
|
|
369
|
-
if (entity.startsWith('&#x') && entity.endsWith(';')) {
|
|
370
|
-
const hexStr = entity.slice(3, -1);
|
|
371
|
-
const num = parseInt(hexStr, 16);
|
|
372
|
-
if (!isNaN(num)) {
|
|
373
|
-
return String.fromCharCode(num);
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
// Return original if not recognized
|
|
378
|
-
return entity;
|
|
379
|
-
});
|
|
492
|
+
// Last resort: first BODY
|
|
493
|
+
if (!bodyElement && allBodyElements.length > 0) {
|
|
494
|
+
bodyElement = allBodyElements[0];
|
|
380
495
|
}
|
|
381
496
|
|
|
382
|
-
private isSelfClosingTag(tagName: string): boolean {
|
|
383
|
-
const selfClosingTags = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
|
|
384
|
-
return selfClosingTags.includes(tagName.toLowerCase());
|
|
385
|
-
}
|
|
386
497
|
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
for (let i = 0; i < allHtmlElements.length; i++) {
|
|
400
|
-
const html = allHtmlElements[i];
|
|
401
|
-
if (html.attributes.length > 0) {
|
|
402
|
-
htmlElement = html;
|
|
403
|
-
break;
|
|
404
|
-
}
|
|
405
|
-
}
|
|
406
|
-
// Second priority: HTML with child nodes (content)
|
|
407
|
-
if (!htmlElement) {
|
|
408
|
-
for (let i = 0; i < allHtmlElements.length; i++) {
|
|
409
|
-
const html = allHtmlElements[i];
|
|
410
|
-
if (html.childNodes.length > 0) {
|
|
411
|
-
htmlElement = html;
|
|
412
|
-
break;
|
|
413
|
-
}
|
|
414
|
-
}
|
|
415
|
-
}
|
|
416
|
-
// Last resort: first HTML
|
|
417
|
-
if (!htmlElement && allHtmlElements.length > 0) {
|
|
418
|
-
htmlElement = allHtmlElements[0];
|
|
419
|
-
}
|
|
420
|
-
|
|
421
|
-
// Choose the HEAD element with content, then attributes, then first one
|
|
422
|
-
let headElement = null;
|
|
423
|
-
// First priority: HEAD with child nodes (content)
|
|
424
|
-
for (let i = 0; i < allHeadElements.length; i++) {
|
|
425
|
-
const head = allHeadElements[i];
|
|
426
|
-
if (head.childNodes.length > 0) {
|
|
427
|
-
headElement = head;
|
|
428
|
-
break;
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
// Second priority: HEAD with attributes
|
|
432
|
-
if (!headElement) {
|
|
433
|
-
for (let i = 0; i < allHeadElements.length; i++) {
|
|
434
|
-
const head = allHeadElements[i];
|
|
435
|
-
if (head.attributes.length > 0) {
|
|
436
|
-
headElement = head;
|
|
437
|
-
break;
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
}
|
|
441
|
-
// Last resort: first HEAD
|
|
442
|
-
if (!headElement && allHeadElements.length > 0) {
|
|
443
|
-
headElement = allHeadElements[0];
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
// Choose the BODY element with content, then attributes, then first one
|
|
447
|
-
let bodyElement = null;
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
// First priority: BODY with child nodes (content)
|
|
451
|
-
for (let i = 0; i < allBodyElements.length; i++) {
|
|
452
|
-
const body = allBodyElements[i];
|
|
453
|
-
if (body.childNodes.length > 0) {
|
|
454
|
-
bodyElement = body;
|
|
455
|
-
break;
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
// Second priority: BODY with attributes
|
|
460
|
-
if (!bodyElement) {
|
|
461
|
-
for (let i = 0; i < allBodyElements.length; i++) {
|
|
462
|
-
const body = allBodyElements[i];
|
|
463
|
-
if (body.attributes.length > 0) {
|
|
464
|
-
bodyElement = body;
|
|
465
|
-
break;
|
|
466
|
-
}
|
|
467
|
-
}
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
// Last resort: first BODY
|
|
471
|
-
if (!bodyElement && allBodyElements.length > 0) {
|
|
472
|
-
bodyElement = allBodyElements[0];
|
|
473
|
-
}
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
// For now, just use the elements as they are parsed
|
|
478
|
-
// TODO: Implement proper DOM structure reorganization later
|
|
479
|
-
|
|
480
|
-
// Set document references
|
|
481
|
-
if (htmlElement) {
|
|
482
|
-
(this.document as any).documentElement = htmlElement;
|
|
483
|
-
}
|
|
484
|
-
if (headElement) {
|
|
485
|
-
(this.document as any).head = headElement;
|
|
486
|
-
}
|
|
487
|
-
if (bodyElement) {
|
|
488
|
-
(this.document as any).body = bodyElement;
|
|
489
|
-
}
|
|
498
|
+
// For now, just use the elements as they are parsed
|
|
499
|
+
// TODO: Implement proper DOM structure reorganization later
|
|
500
|
+
|
|
501
|
+
// Set document references
|
|
502
|
+
if (htmlElement) {
|
|
503
|
+
(this.document as any).documentElement = htmlElement;
|
|
504
|
+
}
|
|
505
|
+
if (headElement) {
|
|
506
|
+
(this.document as any).head = headElement;
|
|
507
|
+
}
|
|
508
|
+
if (bodyElement) {
|
|
509
|
+
(this.document as any).body = bodyElement;
|
|
490
510
|
}
|
|
511
|
+
}
|
|
491
512
|
}
|