als-document 1.0.5-alpha → 1.0.6-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/document.js +1 -1
- package/index.js +1 -1
- package/index.mjs +1 -1
- package/package.json +1 -1
- package/src/parse/parser.js +3 -1
- package/tests/index.html +2 -2
- package/tests/parse-real.js +3 -2
package/document.js
CHANGED
|
@@ -29,7 +29,7 @@ class SingleNode extends Node{
|
|
|
29
29
|
constructor(tagName,attributes={},parent=null)
|
|
30
30
|
function parseAttributes(str){
|
|
31
31
|
const attrs={};
|
|
32
32
|
let key="";
|
|
33
33
|
let value="";
|
|
34
34
|
let isKey=true;
|
|
35
35
|
let quoteChar=null;
|
|
36
36
|
for (let i=0; i< str.length; i++){
|
|
37
37
|
const char=str[i];
|
|
38
38
|
if (isKey && (char==='=' || char===' ')){
|
|
39
39
|
if (char==='=') isKey=false;
|
|
40
40
|
else if (key.trim()){
|
|
41
41
|
attrs[key.trim()]=true;
|
|
42
42
|
key="";
|
|
43
43
|
}
|
|
44
44
|
continue;
|
|
45
45
|
}
|
|
46
46
|
if (!quoteChar && (char==='"' || char==="'")){
|
|
47
47
|
quoteChar=char;
|
|
48
48
|
continue;
|
|
49
49
|
} else if (quoteChar && char===quoteChar){
|
|
50
50
|
quoteChar=null;
|
|
51
51
|
attrs[key.trim()]=value.trim();
|
|
52
52
|
key=""; value=""; isKey=true;
|
|
53
53
|
continue;
|
|
54
54
|
}
|
|
55
55
|
if (isKey) key+=char;
|
|
56
56
|
else value+=char;
|
|
57
57
|
}
|
|
58
58
|
if (key.trim() &&!value) attrs[key.trim()]=true;
|
|
59
59
|
return attrs;
|
|
60
60
|
}
|
|
61
61
|
const VOID_TAGS=new Set(["area","base","br","col","command","embed","hr","img","input","keygen","link","meta","param","source","track","wbr","!doctype",'?xml']);
|
|
62
|
-
function parseHTML(html){
|
|
63
62
|
const root=new Node("ROOT");
|
|
64
63
|
const stack=[root];
|
|
65
64
|
let currentText="",i=0;
|
|
66
65
|
let max=0
|
|
67
66
|
function parseScript(){
|
|
68
67
|
if (!html.startsWith("<script",i)) return false;
|
|
69
68
|
const openTagEnd=html.indexOf(">",i);
|
|
70
69
|
if (openTagEnd===-1) return false;
|
|
71
70
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
72
71
|
const attributes=parseAttributes(attributesString);
|
|
73
72
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
74
73
|
if (closeTagStart===-1) return false;
|
|
75
74
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
76
75
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
77
76
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
78
77
|
i=closeTagStart+9;
|
|
79
78
|
return true;
|
|
80
79
|
}
|
|
81
80
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
82
81
|
if (!html.startsWith(startStr,i)) return false
|
|
83
82
|
const end=html.indexOf(endStr,i+n1);
|
|
84
83
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
85
84
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
86
85
|
i=end+n2;
|
|
87
86
|
return true
|
|
88
87
|
}
|
|
89
88
|
while (i< html.length){
|
|
90
89
|
if(i>=max) max=i;
|
|
91
90
|
else break;
|
|
92
91
|
if (parseScript()) continue
|
|
93
92
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
94
93
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
95
94
|
if (html.startsWith("<![CDATA[",i)){
|
|
96
95
|
const end=html.indexOf("]]>",i+9);
|
|
97
96
|
if (end===-1) break;
|
|
98
97
|
const content=html.substring(i+9,end);
|
|
99
98
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
100
99
|
cdataNode.nodeValue=content;
|
|
101
100
|
i=end+3;
|
|
102
101
|
continue;
|
|
103
102
|
}
|
|
104
103
|
if (html.startsWith("<",i)){
|
|
105
104
|
if (currentText && stack[stack.length-1]){
|
|
106
105
|
stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
107
106
|
currentText="";
|
|
108
107
|
}
|
|
109
108
|
let tagEnd=i+1;
|
|
110
109
|
let insideQuotes=false;
|
|
111
110
|
let quoteChar=null;
|
|
112
111
|
while (tagEnd< html.length){
|
|
113
112
|
const char=html[tagEnd];
|
|
114
113
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
115
114
|
insideQuotes=true;
|
|
116
115
|
quoteChar=char;
|
|
117
116
|
} else if (insideQuotes && char===quoteChar){
|
|
118
117
|
insideQuotes=false;
|
|
119
118
|
quoteChar=null;
|
|
120
119
|
}
|
|
121
120
|
if (!insideQuotes && char==='>') break;
|
|
122
121
|
tagEnd++;
|
|
123
122
|
}
|
|
124
123
|
const tagContent=html.substring(i+1,tagEnd);
|
|
125
124
|
if (tagContent.startsWith("/")) stack.pop();
|
|
126
125
|
else{
|
|
127
126
|
let isSelfClosing=tagContent.endsWith('/');
|
|
128
127
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
129
128
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
130
129
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
131
130
|
const attributes=parseAttributes(attributesString);
|
|
132
131
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
133
132
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
134
133
|
}
|
|
135
134
|
i=tagEnd+1;
|
|
136
135
|
} else{
|
|
137
136
|
currentText+=html[i];
|
|
138
137
|
i++;
|
|
139
138
|
}
|
|
140
139
|
}
|
|
141
140
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
142
141
|
return root;
|
|
142
|
+
function parseHTML(html){
|
|
143
143
|
const root=new Node("ROOT");
|
|
144
144
|
const stack=[root];
|
|
145
145
|
let currentText="",i=0;
|
|
146
146
|
let max=0
|
|
147
147
|
function parseScript(){
|
|
148
148
|
if (!html.startsWith("<script",i)) return false;
|
|
149
149
|
const openTagEnd=html.indexOf(">",i);
|
|
150
150
|
if (openTagEnd===-1) return false;
|
|
151
151
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
152
152
|
const attributes=parseAttributes(attributesString);
|
|
153
153
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
154
154
|
if (closeTagStart===-1) return false;
|
|
155
155
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
156
156
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
157
157
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
158
158
|
i=closeTagStart+9;
|
|
159
159
|
return true;
|
|
160
160
|
}
|
|
161
161
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
162
162
|
if (!html.startsWith(startStr,i)) return false
|
|
163
163
|
const end=html.indexOf(endStr,i+n1);
|
|
164
164
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
165
165
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
166
166
|
i=end+n2;
|
|
167
167
|
return true
|
|
168
168
|
}
|
|
169
169
|
while (i< html.length){
|
|
170
170
|
if(i>=max) max=i;
|
|
171
171
|
else break;
|
|
172
172
|
if (parseScript()) continue
|
|
173
173
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
174
174
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
175
175
|
if (html.startsWith("<![CDATA[",i)){
|
|
176
176
|
const end=html.indexOf("]]>",i+9);
|
|
177
177
|
if (end===-1) break;
|
|
178
178
|
const content=html.substring(i+9,end);
|
|
179
179
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
180
180
|
cdataNode.nodeValue=content;
|
|
181
181
|
i=end+3;
|
|
182
182
|
continue;
|
|
183
183
|
}
|
|
184
184
|
if (html.startsWith("<",i)){
|
|
185
185
|
if (currentText && stack[stack.length-1]){
|
|
186
186
|
const textNode=new TextNode(currentText)
|
|
187
187
|
stack[stack.length-1].childNodes.push(textNode);
|
|
188
188
|
textNode.parent=stack[stack.length-1]
|
|
189
189
|
currentText="";
|
|
190
190
|
}
|
|
191
191
|
let tagEnd=i+1;
|
|
192
192
|
let insideQuotes=false;
|
|
193
193
|
let quoteChar=null;
|
|
194
194
|
while (tagEnd< html.length){
|
|
195
195
|
const char=html[tagEnd];
|
|
196
196
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
197
197
|
insideQuotes=true;
|
|
198
198
|
quoteChar=char;
|
|
199
199
|
} else if (insideQuotes && char===quoteChar){
|
|
200
200
|
insideQuotes=false;
|
|
201
201
|
quoteChar=null;
|
|
202
202
|
}
|
|
203
203
|
if (!insideQuotes && char==='>') break;
|
|
204
204
|
tagEnd++;
|
|
205
205
|
}
|
|
206
206
|
const tagContent=html.substring(i+1,tagEnd);
|
|
207
207
|
if (tagContent.startsWith("/")) stack.pop();
|
|
208
208
|
else{
|
|
209
209
|
let isSelfClosing=tagContent.endsWith('/');
|
|
210
210
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
211
211
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
212
212
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
213
213
|
const attributes=parseAttributes(attributesString);
|
|
214
214
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
215
215
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
216
216
|
}
|
|
217
217
|
i=tagEnd+1;
|
|
218
218
|
} else{
|
|
219
219
|
currentText+=html[i];
|
|
220
220
|
i++;
|
|
221
221
|
}
|
|
222
222
|
}
|
|
223
223
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
224
224
|
return root;
|
|
225
225
|
}
|
|
226
226
|
return { parseHTML, Node, Query, TextNode, SingleNode }
|
|
227
227
|
})()
|
package/index.js
CHANGED
|
@@ -28,6 +28,6 @@ class SingleNode extends Node{
|
|
|
28
28
|
constructor(tagName,attributes={},parent=null)
|
|
29
29
|
function parseAttributes(str){
|
|
30
30
|
const attrs={};
|
|
31
31
|
let key="";
|
|
32
32
|
let value="";
|
|
33
33
|
let isKey=true;
|
|
34
34
|
let quoteChar=null;
|
|
35
35
|
for (let i=0; i< str.length; i++){
|
|
36
36
|
const char=str[i];
|
|
37
37
|
if (isKey && (char==='=' || char===' ')){
|
|
38
38
|
if (char==='=') isKey=false;
|
|
39
39
|
else if (key.trim()){
|
|
40
40
|
attrs[key.trim()]=true;
|
|
41
41
|
key="";
|
|
42
42
|
}
|
|
43
43
|
continue;
|
|
44
44
|
}
|
|
45
45
|
if (!quoteChar && (char==='"' || char==="'")){
|
|
46
46
|
quoteChar=char;
|
|
47
47
|
continue;
|
|
48
48
|
} else if (quoteChar && char===quoteChar){
|
|
49
49
|
quoteChar=null;
|
|
50
50
|
attrs[key.trim()]=value.trim();
|
|
51
51
|
key=""; value=""; isKey=true;
|
|
52
52
|
continue;
|
|
53
53
|
}
|
|
54
54
|
if (isKey) key+=char;
|
|
55
55
|
else value+=char;
|
|
56
56
|
}
|
|
57
57
|
if (key.trim() &&!value) attrs[key.trim()]=true;
|
|
58
58
|
return attrs;
|
|
59
59
|
}
|
|
60
60
|
const VOID_TAGS=new Set(["area","base","br","col","command","embed","hr","img","input","keygen","link","meta","param","source","track","wbr","!doctype",'?xml']);
|
|
61
|
-
function parseHTML(html){
|
|
62
61
|
const root=new Node("ROOT");
|
|
63
62
|
const stack=[root];
|
|
64
63
|
let currentText="",i=0;
|
|
65
64
|
let max=0
|
|
66
65
|
function parseScript(){
|
|
67
66
|
if (!html.startsWith("<script",i)) return false;
|
|
68
67
|
const openTagEnd=html.indexOf(">",i);
|
|
69
68
|
if (openTagEnd===-1) return false;
|
|
70
69
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
71
70
|
const attributes=parseAttributes(attributesString);
|
|
72
71
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
73
72
|
if (closeTagStart===-1) return false;
|
|
74
73
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
75
74
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
76
75
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
77
76
|
i=closeTagStart+9;
|
|
78
77
|
return true;
|
|
79
78
|
}
|
|
80
79
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
81
80
|
if (!html.startsWith(startStr,i)) return false
|
|
82
81
|
const end=html.indexOf(endStr,i+n1);
|
|
83
82
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
84
83
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
85
84
|
i=end+n2;
|
|
86
85
|
return true
|
|
87
86
|
}
|
|
88
87
|
while (i< html.length){
|
|
89
88
|
if(i>=max) max=i;
|
|
90
89
|
else break;
|
|
91
90
|
if (parseScript()) continue
|
|
92
91
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
93
92
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
94
93
|
if (html.startsWith("<![CDATA[",i)){
|
|
95
94
|
const end=html.indexOf("]]>",i+9);
|
|
96
95
|
if (end===-1) break;
|
|
97
96
|
const content=html.substring(i+9,end);
|
|
98
97
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
99
98
|
cdataNode.nodeValue=content;
|
|
100
99
|
i=end+3;
|
|
101
100
|
continue;
|
|
102
101
|
}
|
|
103
102
|
if (html.startsWith("<",i)){
|
|
104
103
|
if (currentText && stack[stack.length-1]){
|
|
105
104
|
stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
106
105
|
currentText="";
|
|
107
106
|
}
|
|
108
107
|
let tagEnd=i+1;
|
|
109
108
|
let insideQuotes=false;
|
|
110
109
|
let quoteChar=null;
|
|
111
110
|
while (tagEnd< html.length){
|
|
112
111
|
const char=html[tagEnd];
|
|
113
112
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
114
113
|
insideQuotes=true;
|
|
115
114
|
quoteChar=char;
|
|
116
115
|
} else if (insideQuotes && char===quoteChar){
|
|
117
116
|
insideQuotes=false;
|
|
118
117
|
quoteChar=null;
|
|
119
118
|
}
|
|
120
119
|
if (!insideQuotes && char==='>') break;
|
|
121
120
|
tagEnd++;
|
|
122
121
|
}
|
|
123
122
|
const tagContent=html.substring(i+1,tagEnd);
|
|
124
123
|
if (tagContent.startsWith("/")) stack.pop();
|
|
125
124
|
else{
|
|
126
125
|
let isSelfClosing=tagContent.endsWith('/');
|
|
127
126
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
128
127
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
129
128
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
130
129
|
const attributes=parseAttributes(attributesString);
|
|
131
130
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
132
131
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
133
132
|
}
|
|
134
133
|
i=tagEnd+1;
|
|
135
134
|
} else{
|
|
136
135
|
currentText+=html[i];
|
|
137
136
|
i++;
|
|
138
137
|
}
|
|
139
138
|
}
|
|
140
139
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
141
140
|
return root;
|
|
141
|
+
function parseHTML(html){
|
|
142
142
|
const root=new Node("ROOT");
|
|
143
143
|
const stack=[root];
|
|
144
144
|
let currentText="",i=0;
|
|
145
145
|
let max=0
|
|
146
146
|
function parseScript(){
|
|
147
147
|
if (!html.startsWith("<script",i)) return false;
|
|
148
148
|
const openTagEnd=html.indexOf(">",i);
|
|
149
149
|
if (openTagEnd===-1) return false;
|
|
150
150
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
151
151
|
const attributes=parseAttributes(attributesString);
|
|
152
152
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
153
153
|
if (closeTagStart===-1) return false;
|
|
154
154
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
155
155
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
156
156
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
157
157
|
i=closeTagStart+9;
|
|
158
158
|
return true;
|
|
159
159
|
}
|
|
160
160
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
161
161
|
if (!html.startsWith(startStr,i)) return false
|
|
162
162
|
const end=html.indexOf(endStr,i+n1);
|
|
163
163
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
164
164
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
165
165
|
i=end+n2;
|
|
166
166
|
return true
|
|
167
167
|
}
|
|
168
168
|
while (i< html.length){
|
|
169
169
|
if(i>=max) max=i;
|
|
170
170
|
else break;
|
|
171
171
|
if (parseScript()) continue
|
|
172
172
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
173
173
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
174
174
|
if (html.startsWith("<![CDATA[",i)){
|
|
175
175
|
const end=html.indexOf("]]>",i+9);
|
|
176
176
|
if (end===-1) break;
|
|
177
177
|
const content=html.substring(i+9,end);
|
|
178
178
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
179
179
|
cdataNode.nodeValue=content;
|
|
180
180
|
i=end+3;
|
|
181
181
|
continue;
|
|
182
182
|
}
|
|
183
183
|
if (html.startsWith("<",i)){
|
|
184
184
|
if (currentText && stack[stack.length-1]){
|
|
185
185
|
const textNode=new TextNode(currentText)
|
|
186
186
|
stack[stack.length-1].childNodes.push(textNode);
|
|
187
187
|
textNode.parent=stack[stack.length-1]
|
|
188
188
|
currentText="";
|
|
189
189
|
}
|
|
190
190
|
let tagEnd=i+1;
|
|
191
191
|
let insideQuotes=false;
|
|
192
192
|
let quoteChar=null;
|
|
193
193
|
while (tagEnd< html.length){
|
|
194
194
|
const char=html[tagEnd];
|
|
195
195
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
196
196
|
insideQuotes=true;
|
|
197
197
|
quoteChar=char;
|
|
198
198
|
} else if (insideQuotes && char===quoteChar){
|
|
199
199
|
insideQuotes=false;
|
|
200
200
|
quoteChar=null;
|
|
201
201
|
}
|
|
202
202
|
if (!insideQuotes && char==='>') break;
|
|
203
203
|
tagEnd++;
|
|
204
204
|
}
|
|
205
205
|
const tagContent=html.substring(i+1,tagEnd);
|
|
206
206
|
if (tagContent.startsWith("/")) stack.pop();
|
|
207
207
|
else{
|
|
208
208
|
let isSelfClosing=tagContent.endsWith('/');
|
|
209
209
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
210
210
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
211
211
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
212
212
|
const attributes=parseAttributes(attributesString);
|
|
213
213
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
214
214
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
215
215
|
}
|
|
216
216
|
i=tagEnd+1;
|
|
217
217
|
} else{
|
|
218
218
|
currentText+=html[i];
|
|
219
219
|
i++;
|
|
220
220
|
}
|
|
221
221
|
}
|
|
222
222
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
223
223
|
return root;
|
|
224
224
|
}
|
|
225
225
|
module.exports = { parseHTML, Node, Query, TextNode, SingleNode }
|
package/index.mjs
CHANGED
|
@@ -28,6 +28,6 @@ class SingleNode extends Node{
|
|
|
28
28
|
constructor(tagName,attributes={},parent=null)
|
|
29
29
|
function parseAttributes(str){
|
|
30
30
|
const attrs={};
|
|
31
31
|
let key="";
|
|
32
32
|
let value="";
|
|
33
33
|
let isKey=true;
|
|
34
34
|
let quoteChar=null;
|
|
35
35
|
for (let i=0; i< str.length; i++){
|
|
36
36
|
const char=str[i];
|
|
37
37
|
if (isKey && (char==='=' || char===' ')){
|
|
38
38
|
if (char==='=') isKey=false;
|
|
39
39
|
else if (key.trim()){
|
|
40
40
|
attrs[key.trim()]=true;
|
|
41
41
|
key="";
|
|
42
42
|
}
|
|
43
43
|
continue;
|
|
44
44
|
}
|
|
45
45
|
if (!quoteChar && (char==='"' || char==="'")){
|
|
46
46
|
quoteChar=char;
|
|
47
47
|
continue;
|
|
48
48
|
} else if (quoteChar && char===quoteChar){
|
|
49
49
|
quoteChar=null;
|
|
50
50
|
attrs[key.trim()]=value.trim();
|
|
51
51
|
key=""; value=""; isKey=true;
|
|
52
52
|
continue;
|
|
53
53
|
}
|
|
54
54
|
if (isKey) key+=char;
|
|
55
55
|
else value+=char;
|
|
56
56
|
}
|
|
57
57
|
if (key.trim() &&!value) attrs[key.trim()]=true;
|
|
58
58
|
return attrs;
|
|
59
59
|
}
|
|
60
60
|
const VOID_TAGS=new Set(["area","base","br","col","command","embed","hr","img","input","keygen","link","meta","param","source","track","wbr","!doctype",'?xml']);
|
|
61
|
-
function parseHTML(html){
|
|
62
61
|
const root=new Node("ROOT");
|
|
63
62
|
const stack=[root];
|
|
64
63
|
let currentText="",i=0;
|
|
65
64
|
let max=0
|
|
66
65
|
function parseScript(){
|
|
67
66
|
if (!html.startsWith("<script",i)) return false;
|
|
68
67
|
const openTagEnd=html.indexOf(">",i);
|
|
69
68
|
if (openTagEnd===-1) return false;
|
|
70
69
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
71
70
|
const attributes=parseAttributes(attributesString);
|
|
72
71
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
73
72
|
if (closeTagStart===-1) return false;
|
|
74
73
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
75
74
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
76
75
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
77
76
|
i=closeTagStart+9;
|
|
78
77
|
return true;
|
|
79
78
|
}
|
|
80
79
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
81
80
|
if (!html.startsWith(startStr,i)) return false
|
|
82
81
|
const end=html.indexOf(endStr,i+n1);
|
|
83
82
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
84
83
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
85
84
|
i=end+n2;
|
|
86
85
|
return true
|
|
87
86
|
}
|
|
88
87
|
while (i< html.length){
|
|
89
88
|
if(i>=max) max=i;
|
|
90
89
|
else break;
|
|
91
90
|
if (parseScript()) continue
|
|
92
91
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
93
92
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
94
93
|
if (html.startsWith("<![CDATA[",i)){
|
|
95
94
|
const end=html.indexOf("]]>",i+9);
|
|
96
95
|
if (end===-1) break;
|
|
97
96
|
const content=html.substring(i+9,end);
|
|
98
97
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
99
98
|
cdataNode.nodeValue=content;
|
|
100
99
|
i=end+3;
|
|
101
100
|
continue;
|
|
102
101
|
}
|
|
103
102
|
if (html.startsWith("<",i)){
|
|
104
103
|
if (currentText && stack[stack.length-1]){
|
|
105
104
|
stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
106
105
|
currentText="";
|
|
107
106
|
}
|
|
108
107
|
let tagEnd=i+1;
|
|
109
108
|
let insideQuotes=false;
|
|
110
109
|
let quoteChar=null;
|
|
111
110
|
while (tagEnd< html.length){
|
|
112
111
|
const char=html[tagEnd];
|
|
113
112
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
114
113
|
insideQuotes=true;
|
|
115
114
|
quoteChar=char;
|
|
116
115
|
} else if (insideQuotes && char===quoteChar){
|
|
117
116
|
insideQuotes=false;
|
|
118
117
|
quoteChar=null;
|
|
119
118
|
}
|
|
120
119
|
if (!insideQuotes && char==='>') break;
|
|
121
120
|
tagEnd++;
|
|
122
121
|
}
|
|
123
122
|
const tagContent=html.substring(i+1,tagEnd);
|
|
124
123
|
if (tagContent.startsWith("/")) stack.pop();
|
|
125
124
|
else{
|
|
126
125
|
let isSelfClosing=tagContent.endsWith('/');
|
|
127
126
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
128
127
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
129
128
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
130
129
|
const attributes=parseAttributes(attributesString);
|
|
131
130
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
132
131
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
133
132
|
}
|
|
134
133
|
i=tagEnd+1;
|
|
135
134
|
} else{
|
|
136
135
|
currentText+=html[i];
|
|
137
136
|
i++;
|
|
138
137
|
}
|
|
139
138
|
}
|
|
140
139
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
141
140
|
return root;
|
|
141
|
+
function parseHTML(html){
|
|
142
142
|
const root=new Node("ROOT");
|
|
143
143
|
const stack=[root];
|
|
144
144
|
let currentText="",i=0;
|
|
145
145
|
let max=0
|
|
146
146
|
function parseScript(){
|
|
147
147
|
if (!html.startsWith("<script",i)) return false;
|
|
148
148
|
const openTagEnd=html.indexOf(">",i);
|
|
149
149
|
if (openTagEnd===-1) return false;
|
|
150
150
|
const attributesString=html.substring(i+7,openTagEnd).trim();
|
|
151
151
|
const attributes=parseAttributes(attributesString);
|
|
152
152
|
let closeTagStart=html.indexOf("</script>",openTagEnd);
|
|
153
153
|
if (closeTagStart===-1) return false;
|
|
154
154
|
const content=html.substring(openTagEnd+1,closeTagStart);
|
|
155
155
|
const scriptNode=new Node('script',attributes,stack[stack.length-1]);
|
|
156
156
|
if(content.length>0) scriptNode.childNodes.push(content);
|
|
157
157
|
i=closeTagStart+9;
|
|
158
158
|
return true;
|
|
159
159
|
}
|
|
160
160
|
function parseSpecial(startStr,endStr,n1,n2,tag){
|
|
161
161
|
if (!html.startsWith(startStr,i)) return false
|
|
162
162
|
const end=html.indexOf(endStr,i+n1);
|
|
163
163
|
const strNode=new Node(tag,{},stack[stack.length-1]);
|
|
164
164
|
strNode.childNodes.push(html.substring(i+n1,end));
|
|
165
165
|
i=end+n2;
|
|
166
166
|
return true
|
|
167
167
|
}
|
|
168
168
|
while (i< html.length){
|
|
169
169
|
if(i>=max) max=i;
|
|
170
170
|
else break;
|
|
171
171
|
if (parseScript()) continue
|
|
172
172
|
if (parseSpecial("<!--","-->",4,3,'#comment')) continue
|
|
173
173
|
if (parseSpecial("<style","</style>",7,8,'style')) continue
|
|
174
174
|
if (html.startsWith("<![CDATA[",i)){
|
|
175
175
|
const end=html.indexOf("]]>",i+9);
|
|
176
176
|
if (end===-1) break;
|
|
177
177
|
const content=html.substring(i+9,end);
|
|
178
178
|
const cdataNode=new SingleNode("#cdata-section",{},stack[stack.length-1]);
|
|
179
179
|
cdataNode.nodeValue=content;
|
|
180
180
|
i=end+3;
|
|
181
181
|
continue;
|
|
182
182
|
}
|
|
183
183
|
if (html.startsWith("<",i)){
|
|
184
184
|
if (currentText && stack[stack.length-1]){
|
|
185
185
|
const textNode=new TextNode(currentText)
|
|
186
186
|
stack[stack.length-1].childNodes.push(textNode);
|
|
187
187
|
textNode.parent=stack[stack.length-1]
|
|
188
188
|
currentText="";
|
|
189
189
|
}
|
|
190
190
|
let tagEnd=i+1;
|
|
191
191
|
let insideQuotes=false;
|
|
192
192
|
let quoteChar=null;
|
|
193
193
|
while (tagEnd< html.length){
|
|
194
194
|
const char=html[tagEnd];
|
|
195
195
|
if (!insideQuotes && (char==='"' || char==="'")){
|
|
196
196
|
insideQuotes=true;
|
|
197
197
|
quoteChar=char;
|
|
198
198
|
} else if (insideQuotes && char===quoteChar){
|
|
199
199
|
insideQuotes=false;
|
|
200
200
|
quoteChar=null;
|
|
201
201
|
}
|
|
202
202
|
if (!insideQuotes && char==='>') break;
|
|
203
203
|
tagEnd++;
|
|
204
204
|
}
|
|
205
205
|
const tagContent=html.substring(i+1,tagEnd);
|
|
206
206
|
if (tagContent.startsWith("/")) stack.pop();
|
|
207
207
|
else{
|
|
208
208
|
let isSelfClosing=tagContent.endsWith('/');
|
|
209
209
|
const tagNameEnd=tagContent.search(/\s|>|\//);
|
|
210
210
|
const tagName=tagContent.substring(0,tagNameEnd>0 ? tagNameEnd : tagEnd-i-1);
|
|
211
211
|
const attributesString=tagContent.substring(tagName.length,isSelfClosing ? tagContent.length-1 : tagContent.length).trim();
|
|
212
212
|
const attributes=parseAttributes(attributesString);
|
|
213
213
|
if (VOID_TAGS.has(tagName.toLowerCase()) || isSelfClosing) new SingleNode(tagName,attributes,stack[stack.length-1])
|
|
214
214
|
else stack.push(new Node(tagName,attributes,stack[stack.length-1]));
|
|
215
215
|
}
|
|
216
216
|
i=tagEnd+1;
|
|
217
217
|
} else{
|
|
218
218
|
currentText+=html[i];
|
|
219
219
|
i++;
|
|
220
220
|
}
|
|
221
221
|
}
|
|
222
222
|
if (currentText.trim() && stack[stack.length-1]) stack[stack.length-1].childNodes.push(new TextNode(currentText));
|
|
223
223
|
return root;
|
|
224
224
|
}
|
|
225
225
|
export default { parseHTML, Node, Query, TextNode, SingleNode }
|
package/package.json
CHANGED
package/src/parse/parser.js
CHANGED
|
@@ -52,7 +52,9 @@ function parseHTML(html) {
|
|
|
52
52
|
|
|
53
53
|
if (html.startsWith("<", i)) {
|
|
54
54
|
if (currentText && stack[stack.length - 1]) {
|
|
55
|
-
|
|
55
|
+
const textNode = new TextNode(currentText)
|
|
56
|
+
stack[stack.length - 1].childNodes.push(textNode);
|
|
57
|
+
textNode.parent = stack[stack.length - 1]
|
|
56
58
|
currentText = "";
|
|
57
59
|
}
|
|
58
60
|
|
package/tests/index.html
CHANGED
|
@@ -6,8 +6,8 @@
|
|
|
6
6
|
<title>Document</title>
|
|
7
7
|
<script src="/node_modules/als-simple-test/test.js"></script>
|
|
8
8
|
<script src="../document.js"></script>
|
|
9
|
-
<script src="./data/html1.js"></script>
|
|
10
|
-
|
|
9
|
+
<!-- <script src="./data/html1.js"></script> -->
|
|
10
|
+
<script src="./data/html2.js"></script>
|
|
11
11
|
<script src="./data/svg.js"></script>
|
|
12
12
|
<script>
|
|
13
13
|
const { parseHTML, Node, Query, TextNode, SingleNode } = alsDocument
|
package/tests/parse-real.js
CHANGED
|
@@ -32,8 +32,9 @@ describe('Real data html1', async () => {
|
|
|
32
32
|
it('Text nodes check', () => {
|
|
33
33
|
const realParagraph = iframe.querySelector('p');
|
|
34
34
|
const parsedParagraph = parsedHTML.querySelector('p');
|
|
35
|
-
const real = realParagraph.textContent.trim().replace(/\n
|
|
36
|
-
const parsed = parsedParagraph.textContent.trim()
|
|
35
|
+
const real = realParagraph.textContent.trim().replace(/\n|\s/gm,'')
|
|
36
|
+
const parsed = parsedParagraph.textContent.trim().replace(/\n|\s/gm,'')
|
|
37
|
+
console.log({parsed,real})
|
|
37
38
|
assert(real === parsed, 'Text contents are the same');
|
|
38
39
|
});
|
|
39
40
|
|