@rgrove/parse-xml 3.0.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +72 -97
- package/dist/browser.js +774 -0
- package/dist/browser.js.map +7 -0
- package/dist/global.min.js +10 -0
- package/dist/global.min.js.map +7 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +50 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/Parser.d.ts +218 -0
- package/dist/lib/Parser.d.ts.map +1 -0
- package/dist/lib/Parser.js +638 -0
- package/dist/lib/Parser.js.map +1 -0
- package/dist/lib/StringScanner.d.ts +97 -0
- package/dist/lib/StringScanner.d.ts.map +1 -0
- package/dist/lib/StringScanner.js +210 -0
- package/dist/lib/StringScanner.js.map +1 -0
- package/dist/lib/XmlCdata.d.ts +8 -0
- package/dist/lib/XmlCdata.d.ts.map +1 -0
- package/dist/lib/XmlCdata.js +15 -0
- package/dist/lib/XmlCdata.js.map +1 -0
- package/dist/lib/XmlComment.d.ts +16 -0
- package/dist/lib/XmlComment.d.ts.map +1 -0
- package/dist/lib/XmlComment.js +23 -0
- package/dist/lib/XmlComment.js.map +1 -0
- package/dist/lib/XmlDocument.d.ts +29 -0
- package/dist/lib/XmlDocument.d.ts.map +1 -0
- package/dist/lib/XmlDocument.js +47 -0
- package/dist/lib/XmlDocument.js.map +1 -0
- package/dist/lib/XmlElement.d.ts +40 -0
- package/dist/lib/XmlElement.d.ts.map +1 -0
- package/dist/lib/XmlElement.js +51 -0
- package/dist/lib/XmlElement.js.map +1 -0
- package/dist/lib/XmlNode.d.ts +74 -0
- package/dist/lib/XmlNode.d.ts.map +1 -0
- package/dist/lib/XmlNode.js +96 -0
- package/dist/lib/XmlNode.js.map +1 -0
- package/dist/lib/XmlProcessingInstruction.d.ts +22 -0
- package/dist/lib/XmlProcessingInstruction.d.ts.map +1 -0
- package/dist/lib/XmlProcessingInstruction.js +25 -0
- package/dist/lib/XmlProcessingInstruction.js.map +1 -0
- package/dist/lib/XmlText.d.ts +16 -0
- package/dist/lib/XmlText.d.ts.map +1 -0
- package/dist/lib/XmlText.js +23 -0
- package/dist/lib/XmlText.js.map +1 -0
- package/dist/lib/syntax.d.ts +69 -0
- package/dist/lib/syntax.d.ts.map +1 -0
- package/dist/lib/syntax.js +133 -0
- package/dist/lib/syntax.js.map +1 -0
- package/dist/lib/types.d.ts +5 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +3 -0
- package/dist/lib/types.js.map +1 -0
- package/package.json +30 -22
- package/src/index.ts +30 -0
- package/src/lib/Parser.ts +819 -0
- package/src/lib/StringScanner.ts +254 -0
- package/src/lib/XmlCdata.ts +11 -0
- package/src/lib/XmlComment.ts +26 -0
- package/src/lib/XmlDocument.ts +57 -0
- package/src/lib/XmlElement.ts +81 -0
- package/src/lib/XmlNode.ts +107 -0
- package/src/lib/XmlProcessingInstruction.ts +35 -0
- package/src/lib/XmlText.ts +26 -0
- package/src/lib/syntax.ts +136 -0
- package/src/lib/types.ts +2 -0
- package/CHANGELOG.md +0 -162
- package/dist/types/index.d.ts +0 -68
- package/dist/types/index.d.ts.map +0 -1
- package/dist/types/lib/Parser.d.ts +0 -234
- package/dist/types/lib/Parser.d.ts.map +0 -1
- package/dist/types/lib/StringScanner.d.ts +0 -139
- package/dist/types/lib/StringScanner.d.ts.map +0 -1
- package/dist/types/lib/XmlCdata.d.ts +0 -11
- package/dist/types/lib/XmlCdata.d.ts.map +0 -1
- package/dist/types/lib/XmlComment.d.ts +0 -21
- package/dist/types/lib/XmlComment.d.ts.map +0 -1
- package/dist/types/lib/XmlDocument.d.ts +0 -42
- package/dist/types/lib/XmlDocument.d.ts.map +0 -1
- package/dist/types/lib/XmlElement.d.ts +0 -62
- package/dist/types/lib/XmlElement.d.ts.map +0 -1
- package/dist/types/lib/XmlNode.d.ts +0 -78
- package/dist/types/lib/XmlNode.d.ts.map +0 -1
- package/dist/types/lib/XmlProcessingInstruction.d.ts +0 -30
- package/dist/types/lib/XmlProcessingInstruction.d.ts.map +0 -1
- package/dist/types/lib/XmlText.d.ts +0 -21
- package/dist/types/lib/XmlText.d.ts.map +0 -1
- package/dist/types/lib/syntax.d.ts +0 -59
- package/dist/types/lib/syntax.d.ts.map +0 -1
- package/dist/umd/parse-xml.min.js +0 -2
- package/dist/umd/parse-xml.min.js.map +0 -1
- package/src/index.js +0 -67
- package/src/lib/Parser.js +0 -812
- package/src/lib/StringScanner.js +0 -312
- package/src/lib/XmlCdata.js +0 -17
- package/src/lib/XmlComment.js +0 -37
- package/src/lib/XmlDocument.js +0 -69
- package/src/lib/XmlElement.js +0 -101
- package/src/lib/XmlNode.js +0 -152
- package/src/lib/XmlProcessingInstruction.js +0 -48
- package/src/lib/XmlText.js +0 -37
- package/src/lib/syntax.js +0 -153
|
@@ -0,0 +1,819 @@
|
|
|
1
|
+
import { StringScanner } from './StringScanner.js';
|
|
2
|
+
import * as syntax from './syntax.js';
|
|
3
|
+
import { XmlCdata } from './XmlCdata.js';
|
|
4
|
+
import { XmlComment } from './XmlComment.js';
|
|
5
|
+
import { XmlDocument } from './XmlDocument.js';
|
|
6
|
+
import { XmlElement } from './XmlElement.js';
|
|
7
|
+
import { XmlProcessingInstruction } from './XmlProcessingInstruction.js';
|
|
8
|
+
import { XmlText } from './XmlText.js';
|
|
9
|
+
|
|
10
|
+
import type { XmlNode } from './XmlNode.js';
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
// Shared empty-string constant. Used below as the "nothing consumed" result of
// `consumeName()`, as the initial attribute value, and for comparison against
// the result of `scanner.peek()`.
const emptyString = '';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Parses an XML string into an `XmlDocument`.
|
|
17
|
+
*
|
|
18
|
+
* @private
|
|
19
|
+
*/
|
|
20
|
+
export class Parser {
|
|
21
|
+
readonly document: XmlDocument;
|
|
22
|
+
|
|
23
|
+
private currentNode: XmlDocument | XmlElement;
|
|
24
|
+
private readonly options: ParserOptions;
|
|
25
|
+
private readonly scanner: StringScanner;
|
|
26
|
+
|
|
27
|
+
/**
 * Creates a parser and immediately parses `xml` into `this.document`.
 *
 * The input is normalized first, then consumed in order: prolog, root
 * element, trailing `Misc` content. An error is thrown when no root element
 * can be consumed or when unparsed input remains at the end.
 *
 * @param xml XML string to parse.
 * @param options Parser options.
 */
constructor(xml: string, options: ParserOptions = {}) {
  const root = new XmlDocument();

  this.document = root;
  this.currentNode = root;
  this.options = options;
  this.scanner = new StringScanner(normalizeXmlString(xml));

  this.consumeProlog();

  const hasRootElement = this.consumeElement();

  if (!hasRootElement) {
    throw this.error('Root element is missing or invalid');
  }

  // Drain whatever `Misc` content follows the root element.
  let consumedMisc: boolean;

  do {
    consumedMisc = this.consumeMisc();
  } while (consumedMisc);

  if (this.scanner.isEnd) {
    return;
  }

  throw this.error('Extra content at the end of the document');
}
|
|
49
|
+
|
|
50
|
+
/**
 * Appends `node` to the children of `this.currentNode` and records
 * `this.currentNode` as the node's parent.
 */
addNode(node: XmlNode) {
  const parent = this.currentNode;

  node.parent = parent;

  // @ts-expect-error: XmlDocument allows a narrower set of child node types
  // than XmlElement, so TypeScript rejects a push through the union even
  // though only valid children are ever added.
  parent.children.push(node);
}
|
|
61
|
+
|
|
62
|
+
/**
 * Adds `text` to the current node. When the current node's last child is
 * already an `XmlText`, the text is appended to it; otherwise a new `XmlText`
 * node is created and added.
 */
addText(text: string) {
  const siblings = this.currentNode.children;
  const lastChild = siblings.length > 0
    ? siblings[siblings.length - 1]
    : undefined;

  if (lastChild instanceof XmlText) {
    // Merge into the existing text node rather than creating a second one.
    lastChild.text += text;
  } else {
    this.addNode(new XmlText(text));
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Consumes the attributes of a start tag.
 *
 * Each attribute must be preceded by whitespace. Parsing stops at the first
 * position where no whitespace or no attribute name can be consumed.
 *
 * @returns
 *   Prototype-less object mapping attribute names to values, sorted by name
 *   when the `sortAttributes` option is set.
 *
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-starttags
 */
consumeAttributes(): Record<string, string> {
  const parsed = Object.create(null);

  for (;;) {
    if (!this.consumeWhitespace()) {
      break;
    }

    const key = this.consumeName();

    if (!key) {
      break;
    }

    const value = this.consumeEqual() && this.consumeAttributeValue();

    if (value === false) {
      throw this.error('Attribute value expected');
    }

    if (key in parsed) {
      throw this.error(`Duplicate attribute: ${key}`);
    }

    const isXmlSpace = key === 'xml:space';

    if (isXmlSpace && !(value === 'default' || value === 'preserve')) {
      throw this.error('Value of the `xml:space` attribute must be "default" or "preserve"');
    }

    parsed[key] = value;
  }

  if (!this.options.sortAttributes) {
    return parsed;
  }

  // Rebuild the object so its key insertion order follows the sorted names.
  const sorted = Object.create(null);

  for (const key of Object.keys(parsed).sort()) {
    sorted[key] = parsed[key];
  }

  return sorted;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Consumes a quoted `AttValue` (attribute value) if one starts at the current
 * position.
 *
 * Runs of ordinary characters are validated and have their whitespace
 * normalized; references are resolved via `consumeReference()`.
 *
 * @returns
 *   The value without its surrounding quotes (possibly an empty string), or
 *   `false` if the next character isn't a quote and nothing was consumed.
 *
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-AttValue
 */
consumeAttributeValue(): string | false {
  const { scanner } = this;
  const quote = scanner.peek();

  if (!(quote === '"' || quote === "'")) {
    return false;
  }

  scanner.advance();

  const pattern = quote === '"'
    ? syntax.attValueCharDoubleQuote
    : syntax.attValueCharSingleQuote;

  let value = emptyString;

  while (!scanner.isEnd) {
    const run = scanner.consumeMatch(pattern);

    if (run) {
      this.validateChars(run);
      value += run.replace(syntax.attValueNormalizedWhitespace, ' ');
    }

    const next = scanner.peek();

    if (next === quote) {
      // Closing quote: step past it and finish.
      scanner.advance();
      return value;
    }

    if (next === '&') {
      value += this.consumeReference();
    } else if (next === '<') {
      throw this.error('Unescaped `<` is not allowed in an attribute value');
    } else if (next === emptyString) {
      break;
    }
  }

  // Input ended before the closing quote was found.
  throw this.error('Unclosed attribute');
}
|
|
193
|
+
|
|
194
|
+
/**
 * Consumes a CDATA section if one starts at the current position.
 *
 * The content is added as an `XmlCdata` node when the `preserveCdata` option
 * is set, and as text otherwise.
 *
 * @returns Whether a CDATA section was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-cdata-sect
 */
consumeCdataSection(): boolean {
  const { scanner } = this;
  const opened = scanner.consumeStringFast('<![CDATA[');

  if (!opened) {
    return false;
  }

  const body = scanner.consumeUntilString(']]>');
  this.validateChars(body);

  const closed = scanner.consumeStringFast(']]>');

  if (!closed) {
    throw this.error('Unclosed CDATA section');
  }

  if (!this.options.preserveCdata) {
    this.addText(body);
    return true;
  }

  this.addNode(new XmlCdata(body));
  return true;
}
|
|
222
|
+
|
|
223
|
+
/**
 * Consumes character data up to the next match of `syntax.endCharData` and
 * adds it to the document as text.
 *
 * @returns Whether character data was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#dt-chardata
 */
consumeCharData(): boolean {
  const { scanner } = this;
  const text = scanner.consumeUntilMatch(syntax.endCharData);

  if (text) {
    this.validateChars(text);

    // A bare `]]>` right after the text is not allowed in element content.
    const stoppedAtCdataEnd = scanner.peek(3) === ']]>';

    if (stoppedAtCdataEnd) {
      throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
    }

    this.addText(text);
    return true;
  }

  return false;
}
|
|
246
|
+
|
|
247
|
+
/**
 * Consumes a comment if one starts at the current position.
 *
 * The comment is added to the document (trimmed) only when the
 * `preserveComments` option is set; otherwise it is consumed and dropped.
 *
 * @returns Whether a comment was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Comment
 */
consumeComment(): boolean {
  const { scanner } = this;

  if (!scanner.consumeStringFast('<!--')) {
    return false;
  }

  const body = scanner.consumeUntilString('--');
  this.validateChars(body);

  if (scanner.consumeStringFast('-->')) {
    if (this.options.preserveComments) {
      this.addNode(new XmlComment(body.trim()));
    }

    return true;
  }

  // Either a `--` appeared that isn't part of `-->`, or the input ran out.
  const message = scanner.peek(2) === '--'
    ? "The string `--` isn't allowed inside a comment"
    : 'Unclosed comment';

  throw this.error(message);
}
|
|
277
|
+
|
|
278
|
+
/**
 * Consumes a reference in a content context if possible.
 *
 * This differs from `consumeReference()` in that a consumed reference will be
 * added to the document as text instead of returned.
 *
 * A reference that resolves to an empty string (for example via the
 * `resolveUndefinedEntity` option) still counts as consumed, so the caller
 * keeps parsing the content that follows it. No text is added in that case.
 *
 * @returns Whether a reference was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
 */
consumeContentReference(): boolean {
  let ref = this.consumeReference();

  // Only `false` means "nothing consumed"; an empty string is a real result.
  if (ref === false) {
    return false;
  }

  if (ref !== emptyString) {
    this.addText(ref);
  }

  return true;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Consumes a doctype declaration if one starts at the current position.
 *
 * The implementation is deliberately loose: the declaration is skipped and
 * discarded rather than parsed.
 *
 * @returns Whether a doctype declaration was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#dtd
 */
consumeDoctypeDeclaration(): boolean {
  const { scanner } = this;
  const sawKeyword = Boolean(scanner.consumeStringFast('<!DOCTYPE'));

  if (!(sawKeyword && this.consumeWhitespace())) {
    return false;
  }

  // Skip everything up to an internal subset (`[`) or the closing `>`.
  scanner.consumeMatch(/[^[>]+/y);

  // Bracketed internal subset followed by optional whitespace and `>`.
  const consumedSubset = scanner.consumeMatch(/\[[\s\S]+?\][\x20\t\r\n]*>/y);

  if (consumedSubset) {
    return true;
  }

  if (scanner.consumeStringFast('>')) {
    return true;
  }

  throw this.error('Unclosed doctype declaration');
}
|
|
328
|
+
|
|
329
|
+
/**
 * Consumes an element, including its attributes, content, and end tag, if a
 * start tag begins at the current position.
 *
 * Child content is parsed recursively with the new element as
 * `this.currentNode`; the previous current node is restored before the
 * element is added to it.
 *
 * @returns Whether an element was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-element
 */
consumeElement(): boolean {
  const { scanner } = this;
  const startMark = scanner.charIndex;

  if (!scanner.consumeStringFast('<')) {
    return false;
  }

  const name = this.consumeName();

  if (!name) {
    // `<` wasn't followed by a name, so this isn't a start tag; rewind.
    scanner.reset(startMark);
    return false;
  }

  const attributes = this.consumeAttributes();
  const selfClosing = Boolean(scanner.consumeStringFast('/>'));
  const element = new XmlElement(name, attributes);

  element.parent = this.currentNode;

  if (selfClosing) {
    this.addNode(element);
    return true;
  }

  if (!scanner.consumeStringFast('>')) {
    throw this.error(`Unclosed start tag for element \`${name}\``);
  }

  this.currentNode = element;

  // Alternate between character data and every other kind of content until
  // nothing more can be consumed.
  let consumedContent: boolean;

  do {
    this.consumeCharData();

    consumedContent = this.consumeElement()
      || this.consumeContentReference()
      || this.consumeCdataSection()
      || this.consumeProcessingInstruction()
      || this.consumeComment();
  } while (consumedContent);

  const endTagMark = scanner.charIndex;
  const hasMatchingEndTag = Boolean(scanner.consumeStringFast('</'))
    && this.consumeName() === name;

  if (!hasMatchingEndTag) {
    // Report the error at the position where the end tag was expected.
    scanner.reset(endTagMark);
    throw this.error(`Missing end tag for element ${name}`);
  }

  this.consumeWhitespace();

  if (!scanner.consumeStringFast('>')) {
    throw this.error(`Unclosed end tag for element ${name}`);
  }

  this.currentNode = element.parent;
  this.addNode(element);
  return true;
}
|
|
396
|
+
|
|
397
|
+
/**
 * Consumes an `Eq` production (`=` with optional whitespace on either side)
 * if possible.
 *
 * Leading whitespace is consumed even when no `=` follows.
 *
 * @returns Whether an `Eq` production was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Eq
 */
consumeEqual(): boolean {
  this.consumeWhitespace();

  const sawEquals = Boolean(this.scanner.consumeStringFast('='));

  if (sawEquals) {
    this.consumeWhitespace();
  }

  return sawEquals;
}
|
|
413
|
+
|
|
414
|
+
/**
 * Consumes one piece of `Misc` content if possible: a comment, a processing
 * instruction, or a run of whitespace, tried in that order.
 *
 * @returns Whether anything was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Misc
 */
consumeMisc(): boolean {
  if (this.consumeComment()) {
    return true;
  }

  if (this.consumeProcessingInstruction()) {
    return true;
  }

  return this.consumeWhitespace();
}
|
|
425
|
+
|
|
426
|
+
/**
 * Consumes a `Name` if the next character is a valid name start character.
 *
 * @returns The consumed name, or an empty string if none was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Name
 */
consumeName(): string {
  const { scanner } = this;

  if (!syntax.isNameStartChar(scanner.peek())) {
    return emptyString;
  }

  return scanner.consumeMatchFn(syntax.isNameChar);
}
|
|
437
|
+
|
|
438
|
+
/**
 * Consumes a processing instruction if one starts at the current position and
 * adds it to the document.
 *
 * A target named `xml` (in any letter case) is rejected because an XML
 * declaration isn't allowed where processing instructions are parsed.
 *
 * @returns Whether a processing instruction was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-pi
 */
consumeProcessingInstruction(): boolean {
  const { scanner } = this;
  const startMark = scanner.charIndex;

  if (!scanner.consumeStringFast('<?')) {
    return false;
  }

  const target = this.consumeName();

  if (!target) {
    throw this.error('Invalid processing instruction');
  }

  if (target.toLowerCase() === 'xml') {
    // Point the error at the start of the offending `<?xml`.
    scanner.reset(startMark);
    throw this.error("XML declaration isn't allowed here");
  }

  if (this.consumeWhitespace()) {
    const body = scanner.consumeUntilString('?>');
    this.validateChars(body);

    if (!scanner.consumeStringFast('?>')) {
      throw this.error('Unterminated processing instruction');
    }

    this.addNode(new XmlProcessingInstruction(target, body));
    return true;
  }

  // Without whitespace, the only valid continuation is an immediate `?>`.
  if (!scanner.consumeStringFast('?>')) {
    throw this.error('Whitespace is required after a processing instruction name');
  }

  this.addNode(new XmlProcessingInstruction(target));
  return true;
}
|
|
482
|
+
|
|
483
|
+
/**
 * Consumes a prolog if possible: an optional XML declaration, any `Misc`
 * content, and an optional doctype declaration followed by more `Misc`
 * content.
 *
 * @returns Whether a prolog was consumed (i.e. the scanner moved forward).
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd
 */
consumeProlog(): boolean {
  const { scanner } = this;
  const startMark = scanner.charIndex;

  const drainMisc = () => {
    while (this.consumeMisc()) {
      // Keep consuming until no more `Misc` content is found.
    }
  };

  this.consumeXmlDeclaration();
  drainMisc();

  if (this.consumeDoctypeDeclaration()) {
    drainMisc();
  }

  return scanner.charIndex > startMark;
}
|
|
503
|
+
|
|
504
|
+
/**
 * Consumes a reference if possible.
 *
 * This differs from `consumeContentReference()` in that a consumed reference
 * will be returned rather than added to the document.
 *
 * Character references must consist solely of decimal digits (`&#65;`) or of
 * `x` followed by hexadecimal digits (`&#x41;`); anything else is rejected as
 * an invalid character reference. Named references are looked up in
 * `syntax.predefinedEntities`, then passed to the `resolveUndefinedEntity`
 * option (if set), then left as is when `ignoreUndefinedEntities` is set.
 *
 * @returns
 *   Parsed reference value, or `false` if nothing was consumed (to
 *   distinguish from a reference that resolves to an empty string).
 *
 * @throws {TypeError}
 *   If `resolveUndefinedEntity()` returns something other than a string,
 *   `null`, or `undefined`.
 *
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Reference
 */
consumeReference(): string | false {
  let { scanner } = this;

  if (!scanner.consumeStringFast('&')) {
    return false;
  }

  let ref = scanner.consumeMatchFn(syntax.isReferenceChar);

  if (scanner.consume() !== ';') {
    throw this.error('Unterminated reference (a reference must end with `;`)');
  }

  let parsedValue;

  if (ref[0] === '#') {
    // This is a character reference.
    let isHex = ref[1] === 'x';
    let digits = ref.slice(isHex ? 2 : 1);
    let digitPattern = isHex ? /^[0-9a-fA-F]+$/ : /^[0-9]+$/;

    // `parseInt()` silently ignores trailing garbage and accepts a `0x`
    // prefix in radix 16, so the digits are validated strictly up front.
    if (!digitPattern.test(digits)) {
      throw this.error('Invalid character reference');
    }

    let codePoint = parseInt(digits, isHex ? 16 : 10);

    if (isNaN(codePoint)) {
      throw this.error('Invalid character reference');
    }

    if (!syntax.isXmlCodePoint(codePoint)) {
      throw this.error('Character reference resolves to an invalid character');
    }

    parsedValue = String.fromCodePoint(codePoint);
  } else {
    // This is an entity reference.
    parsedValue = syntax.predefinedEntities[ref];

    if (parsedValue === undefined) {
      let {
        ignoreUndefinedEntities,
        resolveUndefinedEntity,
      } = this.options;

      let wrappedRef = `&${ref};`; // for backcompat with <= 2.x

      if (resolveUndefinedEntity) {
        let resolvedValue = resolveUndefinedEntity(wrappedRef);

        if (resolvedValue !== null && resolvedValue !== undefined) {
          let type = typeof resolvedValue;

          if (type !== 'string') {
            throw new TypeError(`\`resolveUndefinedEntity()\` must return a string, \`null\`, or \`undefined\`, but returned a value of type ${type}`);
          }

          return resolvedValue;
        }
      }

      if (ignoreUndefinedEntities) {
        return wrappedRef;
      }

      // Rewind so the error points at the start of the reference.
      scanner.reset(-wrappedRef.length);
      throw this.error(`Named entity isn't defined: ${wrappedRef}`);
    }
  }

  return parsedValue;
}
|
|
583
|
+
|
|
584
|
+
/**
 * Consumes a quoted `SystemLiteral` if one starts at the current position.
 *
 * Unlike an attribute value, the content is taken verbatim up to the matching
 * quote: `<` and `&` are allowed and references are not replaced.
 *
 * @returns
 *   The literal without its surrounding quotes (possibly an empty string), or
 *   `false` if nothing was consumed.
 *
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-SystemLiteral
 */
consumeSystemLiteral(): string | false {
  const { scanner } = this;
  const openingQuote = scanner.consumeStringFast('"')
    || scanner.consumeStringFast("'");

  if (!openingQuote) {
    return false;
  }

  const literal = scanner.consumeUntilString(openingQuote);
  this.validateChars(literal);

  if (scanner.consumeStringFast(openingQuote)) {
    return literal;
  }

  throw this.error('Missing end quote');
}
|
|
614
|
+
|
|
615
|
+
/**
 * Consumes a run of whitespace characters if possible.
 *
 * @returns Whether any whitespace characters were consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#white
 */
consumeWhitespace(): boolean {
  const consumed = this.scanner.consumeMatchFn(syntax.isWhitespace);
  return !!consumed;
}
|
|
624
|
+
|
|
625
|
+
/**
 * Consumes an XML declaration if possible.
 *
 * Input that starts with `<?xml` immediately followed by a name character
 * (such as `<?xml-stylesheet ...?>`) is a processing instruction whose target
 * merely begins with "xml", not an XML declaration. In that case the scanner
 * is rewound and `false` is returned so the input can be parsed as a
 * processing instruction instead.
 *
 * @returns Whether an XML declaration was consumed.
 * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-XMLDecl
 */
consumeXmlDeclaration(): boolean {
  let { scanner } = this;
  let mark = scanner.charIndex;

  if (!scanner.consumeStringFast('<?xml')) {
    return false;
  }

  if (!this.consumeWhitespace()) {
    if (syntax.isNameChar(scanner.peek())) {
      // The target name continues past "xml", so this isn't a declaration.
      scanner.reset(mark);
      return false;
    }

    throw this.error('Invalid XML declaration');
  }

  // `version` is required and must look like `1.<digits>`.
  let version = Boolean(scanner.consumeStringFast('version'))
    && this.consumeEqual()
    && this.consumeSystemLiteral();

  if (version === false) {
    throw this.error('XML version is missing or invalid');
  } else if (!/^1\.[0-9]+$/.test(version)) {
    throw this.error('Invalid character in version number');
  }

  if (this.consumeWhitespace()) {
    // `encoding` is optional; its value is consumed but not checked here.
    let encoding = Boolean(scanner.consumeStringFast('encoding'))
      && this.consumeEqual()
      && this.consumeSystemLiteral();

    if (encoding) {
      this.consumeWhitespace();
    }

    // `standalone` is optional but must be "yes" or "no" when present.
    let standalone = Boolean(scanner.consumeStringFast('standalone'))
      && this.consumeEqual()
      && this.consumeSystemLiteral();

    if (standalone) {
      if (standalone !== 'yes' && standalone !== 'no') {
        throw this.error('Only "yes" and "no" are permitted as values of `standalone`');
      }

      this.consumeWhitespace();
    }
  }

  if (!scanner.consumeStringFast('?>')) {
    throw this.error('Invalid or unclosed XML declaration');
  }

  return true;
}
|
|
680
|
+
|
|
681
|
+
/**
 * Creates (but does not throw) an `Error` describing a problem at the current
 * scanner position. Callers are expected to `throw` the returned value.
 *
 * The error message contains the given message, the 1-based line and column,
 * an excerpt of the offending line, and a caret line pointing at the error
 * position within that excerpt. The returned error also carries `column`,
 * `excerpt`, `line`, and `pos` properties.
 *
 * @param message Human-readable description of the problem.
 * @returns The constructed error.
 */
error(message: string): Error {
  // NOTE(review): `xml` is indexed directly by `charIndex` here; this assumes
  // the scanner's character index lines up with string indexes — TODO confirm
  // for input containing surrogate pairs.
  let { charIndex, string: xml } = this.scanner;
  let column = 1;
  let excerpt = '';
  let line = 1;

  // Find the line and column where the error occurred, collecting the part of
  // the current line that precedes the error position along the way.
  for (let i = 0; i < charIndex; ++i) {
    let char = xml[i];

    if (char === '\n') {
      column = 1;
      excerpt = '';
      line += 1;
    } else {
      column += 1;
      excerpt += char;
    }
  }

  // Append the rest of the current line.
  let eol = xml.indexOf('\n', charIndex);

  excerpt += eol === -1
    ? xml.slice(charIndex)
    : xml.slice(charIndex, eol);

  let excerptStart = 0;

  // Keep the excerpt below 50 chars, but always keep the error position in
  // view.
  if (excerpt.length > 50) {
    if (column < 40) {
      excerpt = excerpt.slice(0, 50);
    } else {
      excerptStart = column - 20;
      excerpt = excerpt.slice(excerptStart, column + 30);
    }
  }

  // The excerpt and the caret share one indent so the caret always sits
  // directly beneath the character at the error position.
  let indent = '  ';
  let caretOffset = indent.length + (column - 1) - excerptStart;

  let err = new Error(
    `${message} (line ${line}, column ${column})\n`
      + `${indent}${excerpt}\n`
      + ' '.repeat(caretOffset) + '^\n',
  );

  Object.assign(err, {
    column,
    excerpt,
    line,
    pos: charIndex,
  });

  return err;
}
|
|
738
|
+
|
|
739
|
+
/**
 * Throws an invalid character error if any character in the given _string_
 * isn't a valid XML character.
 *
 * This is called right after _string_ has been consumed, so before throwing
 * the scanner is moved back by the number of characters from the invalid one
 * to the end of _string_, which makes the error point at the invalid
 * character.
 *
 * @param string Text that was just consumed from the scanner.
 */
validateChars(string: string) {
  let { length } = string;

  for (let i = 0; i < length; ++i) {
    let cp = string.codePointAt(i) as number;

    if (!syntax.isXmlCodePoint(cp)) {
      // `i` is a UTF-16 code unit index, while the rewind distance is counted
      // in whole characters (code points), so count the characters in the
      // remainder of the string rather than subtracting `i` directly.
      let remaining = [ ...string.slice(i) ].length;

      this.scanner.reset(-remaining);
      throw this.error('Invalid character');
    }

    if (cp > 65535) {
      // Skip the second code unit of a surrogate pair.
      i += 1;
    }
  }
}
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
// -- Private Functions --------------------------------------------------------
|
|
762
|
+
|
|
763
|
+
/**
 * Returns a normalized copy of the given XML string: a leading byte order
 * mark is dropped, and every CRLF sequence or lone CR becomes a single LF.
 */
function normalizeXmlString(xml: string): string {
  const hasByteOrderMark = xml[0] === '\uFEFF';
  const body = hasByteOrderMark ? xml.slice(1) : xml;

  return body.replace(/\r\n?/g, '\n');
}
|
|
774
|
+
|
|
775
|
+
// -- Types --------------------------------------------------------------------
|
|
776
|
+
/**
 * Options accepted by the `Parser` constructor. Every option is optional.
 */
export type ParserOptions = {
  /**
   * Set to `true` to keep an undefined named entity (such as "&bogus;") in
   * the output verbatim rather than raising a parse error for it.
   *
   * @default false
   */
  ignoreUndefinedEntities?: boolean;

  /**
   * Set to `true` to keep CDATA sections in the document as `XmlCdata` nodes.
   * When not set, their content is added as `XmlText` instead, which keeps
   * the node tree simpler and easier to work with.
   *
   * @default false
   */
  preserveCdata?: boolean;

  /**
   * Set to `true` to keep comments in the document as `XmlComment` nodes.
   * When not set, comments are left out of the node tree.
   *
   * @default false
   */
  preserveComments?: boolean;

  /**
   * Callback invoked for each undefined named entity, with the entity
   * (including its leading `&` and trailing `;`) as the only argument.
   * Return a string to use as the entity's replacement, or `null` or
   * `undefined` to leave the entity undefined (which results in a parse
   * error unless `ignoreUndefinedEntities` is set).
   */
  resolveUndefinedEntity?: (entity: string) => string | null | undefined;

  /**
   * Set to `true` to sort the keys of each element's `attributes` object in
   * alphanumeric order by name. When not set, attributes keep the order in
   * which they appear in the XML.
   *
   * @default false
   */
  sortAttributes?: boolean;
};
|