@eksml/xml 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +588 -0
- package/dist/converters/fromLossless.d.mts +14 -0
- package/dist/converters/fromLossless.d.mts.map +1 -0
- package/dist/converters/fromLossless.mjs +35 -0
- package/dist/converters/fromLossless.mjs.map +1 -0
- package/dist/converters/fromLossy.d.mts +18 -0
- package/dist/converters/fromLossy.d.mts.map +1 -0
- package/dist/converters/fromLossy.mjs +91 -0
- package/dist/converters/fromLossy.mjs.map +1 -0
- package/dist/converters/lossless.d.mts +39 -0
- package/dist/converters/lossless.d.mts.map +1 -0
- package/dist/converters/lossless.mjs +74 -0
- package/dist/converters/lossless.mjs.map +1 -0
- package/dist/converters/lossy.d.mts +42 -0
- package/dist/converters/lossy.d.mts.map +1 -0
- package/dist/converters/lossy.mjs +158 -0
- package/dist/converters/lossy.mjs.map +1 -0
- package/dist/htmlConstants-D6fsKbZ-.mjs +30 -0
- package/dist/htmlConstants-D6fsKbZ-.mjs.map +1 -0
- package/dist/parser-BfdEfWDg.d.mts +95 -0
- package/dist/parser-BfdEfWDg.d.mts.map +1 -0
- package/dist/parser-CYq309aR.mjs +479 -0
- package/dist/parser-CYq309aR.mjs.map +1 -0
- package/dist/parser.d.mts +2 -0
- package/dist/parser.mjs +2 -0
- package/dist/sax.d.mts +64 -0
- package/dist/sax.d.mts.map +1 -0
- package/dist/sax.mjs +70 -0
- package/dist/sax.mjs.map +1 -0
- package/dist/saxEngine-BDnD7ruG.mjs +750 -0
- package/dist/saxEngine-BDnD7ruG.mjs.map +1 -0
- package/dist/utilities/index.d.mts +88 -0
- package/dist/utilities/index.d.mts.map +1 -0
- package/dist/utilities/index.mjs +87 -0
- package/dist/utilities/index.mjs.map +1 -0
- package/dist/writer.d.mts +58 -0
- package/dist/writer.d.mts.map +1 -0
- package/dist/writer.mjs +357 -0
- package/dist/writer.mjs.map +1 -0
- package/dist/xmlParseStream.d.mts +138 -0
- package/dist/xmlParseStream.d.mts.map +1 -0
- package/dist/xmlParseStream.mjs +313 -0
- package/dist/xmlParseStream.mjs.map +1 -0
- package/package.json +100 -0
- package/src/converters/fromLossless.ts +80 -0
- package/src/converters/fromLossy.ts +180 -0
- package/src/converters/lossless.ts +116 -0
- package/src/converters/lossy.ts +274 -0
- package/src/parser.ts +728 -0
- package/src/sax.ts +157 -0
- package/src/saxEngine.ts +1157 -0
- package/src/utilities/escapeRegExp.ts +19 -0
- package/src/utilities/filter.ts +63 -0
- package/src/utilities/getElementById.ts +21 -0
- package/src/utilities/getElementsByClassName.ts +22 -0
- package/src/utilities/htmlConstants.ts +26 -0
- package/src/utilities/index.ts +7 -0
- package/src/utilities/isElementNode.ts +19 -0
- package/src/utilities/isTextNode.ts +19 -0
- package/src/utilities/toContentString.ts +23 -0
- package/src/writer.ts +650 -0
- package/src/xmlParseStream.ts +597 -0
package/src/parser.ts
ADDED
|
@@ -0,0 +1,728 @@
|
|
|
1
|
+
import { decodeXML, decodeHTML } from 'entities';
|
|
2
|
+
import { escapeRegExp } from '#src/utilities/escapeRegExp.ts';
|
|
3
|
+
import { filter } from '#src/utilities/filter.ts';
|
|
4
|
+
import {
|
|
5
|
+
HTML_VOID_ELEMENTS,
|
|
6
|
+
HTML_RAW_CONTENT_TAGS,
|
|
7
|
+
} from '#src/utilities/htmlConstants.ts';
|
|
8
|
+
// @generated:char-codes:begin
// UTF-16 code units of the punctuation characters the parser compares
// against, listed in ascending numeric order.
const BANG = 33; // !
const DQUOTE = 34; // "
const SQUOTE = 39; // '
const DASH = 45; // -
const SLASH = 47; // /
const COLON = 58; // :
const LT = 60; // <
const GT = 62; // >
const QUESTION = 63; // ?
const LBRACKET = 91; // [
const RBRACKET = 93; // ]
const UNDERSCORE = 95; // _
// @generated:char-codes:end
|
|
22
|
+
|
|
23
|
+
/**
 * One element of the parsed tree.
 */
export interface TNode {
  /** Tag name as read from the input. */
  tagName: string;
  /**
   * Attribute map, or `null` when the element carries no attributes.
   * Each entry is one of:
   * - a string for an attribute with a value (`<div id="test">` -> `{id: "test"}`)
   * - `null` for a bare attribute without a value (`<input disabled>` -> `{disabled: null}`)
   * - `""` for an explicitly empty value (`<input value="">` -> `{value: ""}`)
   */
  attributes: Record<string, string | null> | null;
  /** Child nodes: nested elements and text strings. */
  children: (TNode | string)[];
}
|
|
38
|
+
|
|
39
|
+
/**
 * A `TNode` that additionally carries a `pos` property; produced when the
 * `setPos` option is true.
 */
interface TNodeWithPos extends TNode {
  /** Value of the parser's position cursor when parsing finished. */
  pos: number;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Options for parsing XML
 */
export interface ParseOptions {
  /** Starting position in the string (defaults to `0`). */
  pos?: number;
  /**
   * Array of tag names that are self-closing (void elements) and don't need closing tags.
   * In XML mode (default), this defaults to `[]` — self-closing is detected only by `/>` syntax.
   * In HTML mode (`html: true`), this defaults to the standard HTML void elements.
   * Can be overridden explicitly regardless of mode.
   */
  selfClosingTags?: string[];
  /**
   * Array of tag names whose content should be treated as raw text, not parsed as XML/HTML.
   * The parser will scan for the matching `</tagName>` close tag and emit everything between
   * as a single text child node.
   *
   * In XML mode (default), this defaults to `[]`.
   * In HTML mode (`html: true`), this defaults to `["script", "style"]`.
   * Can be overridden explicitly regardless of mode.
   */
  rawContentTags?: string[];
  /**
   * Enable HTML parsing mode. When `true`, sets sensible defaults for:
   * - `selfClosingTags`: standard HTML void elements (area, base, br, col, embed, hr, img, input, link, meta, param, source, track, wbr)
   * - `rawContentTags`: elements whose content is raw text (script, style)
   *
   * These defaults can be overridden by explicitly passing `selfClosingTags` or `rawContentTags`.
   */
  html?: boolean;
  /** Keep XML comments in the output (as raw `<!-- ... -->` strings). */
  keepComments?: boolean;
  /** Trim whitespace from text nodes and discard whitespace-only text nodes */
  trimWhitespace?: boolean;
  /**
   * Strict mode: throw on malformed XML instead of recovering silently.
   * Catches unclosed comments, CDATA sections, processing instructions,
   * close tags, and open tags that reach end-of-input without closing.
   *
   * Note: a close tag that does not match the innermost open tag throws
   * whether or not strict mode is enabled.
   */
  strict?: boolean;
  /**
   * Decode XML/HTML entities in text content and attribute values.
   * When enabled, named entities (`&amp;`, `&lt;`, etc.), decimal character
   * references (`&#228;`), and hex character references (`&#xE4;`) are decoded.
   *
   * In HTML mode (`html: true`), the full set of HTML named entities is
   * supported (e.g. `&nbsp;`, `&copy;`, `&mdash;`). In XML mode, only the
   * five standard XML entities plus numeric references are decoded.
   *
   * CDATA sections are never decoded regardless of this setting.
   *
   * Defaults to `false` — entities are preserved as-is in the output.
   */
  entities?: boolean;
  /**
   * Attribute name to search for (used with attrValue).
   * Defaults to `id` when `attrValue` is given without a name.
   */
  attrName?: string;
  /**
   * Attribute value to search for. When set, only elements carrying a
   * quoted attribute `attrName="attrValue"` are parsed and returned.
   *
   * The value is passed through `escapeRegExp` before it is embedded in the
   * search expression, so it is presumably matched literally — do not rely
   * on regular-expression syntax being interpreted here.
   */
  attrValue?: string;
  /** Filter function to apply to nodes */
  filter?: (node: TNode, index: number, depth: number, path: string) => boolean;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Superset of `ParseOptions` with switches that are not part of the
 * public API.
 */
interface InternalParseOptions extends ParseOptions {
  /** When true, the returned object gets a `pos` property indicating where parsing stopped. */
  setPos?: boolean;
  /** When true, a single node is parsed instead of a list of nodes. */
  parseNode?: boolean;
}
|
|
116
|
+
|
|
117
|
+
// Lookup table indexed by ASCII code: an entry of 1 marks a character that
// terminates a name token, 0 marks a character that may continue one.
// Terminators: \t (9), \n (10), \r (13), space (32), / (47), = (61), > (62).
const NAME_END = new Uint8Array(128);
for (const terminator of '\t\n\r /=>') {
  NAME_END[terminator.charCodeAt(0)] = 1;
}
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Parse XML/HTML into a DOM Object with minimal validation and fault tolerance
|
|
130
|
+
* @param S - The XML string to parse
|
|
131
|
+
* @param options - Parsing options
|
|
132
|
+
* @returns Array of parsed nodes and text content
|
|
133
|
+
*/
|
|
134
|
+
export function parse(
|
|
135
|
+
S: string,
|
|
136
|
+
options?: ParseOptions | InternalParseOptions,
|
|
137
|
+
): (TNode | string)[] {
|
|
138
|
+
// The caller's object is used directly (no copy); a missing argument becomes {}.
const resolvedOptions = (options || {}) as InternalParseOptions;

// Cursor into S, shared by every helper declared below.
let pos = resolvedOptions.pos || 0;

// Coerce the optional flags to real booleans once, up front.
const keepComments = !!resolvedOptions.keepComments;
const trimWhitespace = !!resolvedOptions.trimWhitespace;
const strict = !!resolvedOptions.strict;
const htmlMode = !!resolvedOptions.html;

// Entity decoder: null unless `entities` is exactly `true`; HTML mode selects
// decodeHTML, every other mode decodeXML.
const decode =
  resolvedOptions.entities !== true ? null : htmlMode ? decodeHTML : decodeXML;

// An explicitly passed list always wins; otherwise HTML mode supplies its
// defaults and XML mode uses an empty list.
const selfClosingTagList: string[] =
  resolvedOptions.selfClosingTags ?? (htmlMode ? HTML_VOID_ELEMENTS : []);
const rawContentTagList: string[] =
  resolvedOptions.rawContentTags ?? (htmlMode ? HTML_RAW_CONTENT_TAGS : []);

// Sets are used for membership tests; an empty list is represented by null
// so callers can skip the lookup entirely.
const selfClosingSet: Set<string> | null =
  selfClosingTagList.length === 0 ? null : new Set(selfClosingTagList);
const rawContentSet: Set<string> | null =
  rawContentTagList.length === 0 ? null : new Set(rawContentTagList);
|
|
162
|
+
|
|
163
|
+
/**
 * Create (but do not throw) an Error whose message ends with the 1-based
 * line and column of a position in S.
 *
 * @param message - Text placed at the start of the error message
 * @param atPos - Position to report; the current cursor `pos` is used when omitted
 * @returns The Error, for the caller to throw
 */
function strictError(message: string, atPos?: number): Error {
  const offset = atPos === undefined ? pos : atPos;
  const preceding = S.substring(0, offset);

  // Line number = number of '\n' characters before the offset, plus one.
  let line = 1;
  for (
    let newline = preceding.indexOf('\n');
    newline !== -1;
    newline = preceding.indexOf('\n', newline + 1)
  ) {
    line++;
  }

  // Column = characters since the last '\n' (or since the start), plus one.
  // lastIndexOf yields -1 when there is no newline, which gives length + 1.
  const column = preceding.length - preceding.lastIndexOf('\n');

  return new Error(`${message} at line ${line}, column ${column}`);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Remove whitespace-only text entries from `children`, but only when the
 * array is a pure element container: it holds at least one element, at
 * least one whitespace-only string, and no string with visible content.
 * Arrays with mixed content (any non-whitespace text) are left exactly as
 * they are so their whitespace formatting survives.
 *
 * The array is modified in place; nothing is returned.
 */
function stripIgnorableWhitespace(children: (TNode | string)[]): void {
  let sawElement = false;
  let sawBlankText = false;

  // Classification pass. Any text with visible content means mixed
  // content, in which case nothing may be removed.
  for (let index = 0; index < children.length; index++) {
    const entry = children[index]!;
    if (typeof entry !== 'string') {
      sawElement = true;
    } else if (entry.trim().length === 0) {
      sawBlankText = true;
    } else {
      return;
    }
  }

  // Nothing to strip unless elements and blank text occur together.
  if (!sawElement || !sawBlankText) return;

  // Compaction pass: slide the kept entries to the front, then truncate.
  // Every string left at this point is whitespace-only.
  let kept = 0;
  for (let index = 0; index < children.length; index++) {
    const entry = children[index]!;
    if (typeof entry === 'string' && entry.trim().length === 0) continue;
    children[kept] = entry;
    kept++;
  }
  children.length = kept;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Parse sibling nodes starting at the cursor `pos` until the close tag of
 * `rootTagName` (or the end of the input) and return them.
 *
 * Nested elements are handled with an explicit stack instead of recursion
 * so that deeply nested documents cannot overflow the call stack.
 *
 * @param rootTagName - Tag whose content is being parsed; `''` for the document root
 * @returns The children of `rootTagName`: elements, text, CDATA text and,
 *   when `keepComments` is set, raw comment strings
 * @throws Error when a close tag does not match the innermost open tag
 *   (regardless of mode) and, in strict mode, when a comment, CDATA section,
 *   declaration, close tag, attribute value or element is left unclosed
 */
function parseChildren(rootTagName: string): (TNode | string)[] {
  // A frame is pushed when descending into an element. It stores the
  // PARENT's tag name and child list (restored on the way out) together
  // with the attributes of the element being entered (needed to build that
  // element's node once its close tag is reached).
  interface Frame {
    tagName: string;
    attributes: Record<string, string | null> | null;
    children: (TNode | string)[];
  }
  const stack: Frame[] = [];
  let currentTagName = rootTagName;
  let children: (TNode | string)[] = [];

  // Finalize every element that is still open, innermost first, attaching
  // each to its parent, and return the root-level child list. `strip`
  // selects whether ignorable whitespace is removed at every level.
  const unwindStack = (strip: boolean): (TNode | string)[] => {
    if (strip) stripIgnorableWhitespace(children);
    while (stack.length > 0) {
      const frame = stack.pop()!;
      const node: TNode = {
        tagName: currentTagName,
        attributes: frame.attributes,
        children,
      };
      currentTagName = frame.tagName;
      children = frame.children;
      children.push(node);
      if (strip) stripIgnorableWhitespace(children);
    }
    return children;
  };

  while (S[pos]) {
    if (S.charCodeAt(pos) === LT) {
      if (S.charCodeAt(pos + 1) === SLASH) {
        // ---- Close tag ----
        const closeStart = pos + 2;
        pos = S.indexOf('>', pos);

        if (pos === -1) {
          // "</name" with no '>' before the end of the input.
          if (strict) throw strictError('Unclosed close tag', closeStart - 2);
          pos = S.length;
          return unwindStack(true);
        }

        const closeTag = S.substring(closeStart, pos).trimEnd();
        if (closeTag !== currentTagName) {
          // Thrown in every mode, not only in strict mode.
          throw strictError(
            `Unexpected close tag </${closeTag}> (expected </${currentTagName}>)`,
          );
        }

        // Step over the '>' (pos cannot be -1 here, see the check above).
        pos += 1;

        stripIgnorableWhitespace(children);

        if (stack.length === 0) {
          // We've closed the root tag — return
          return children;
        }

        // Pop frame: finalize node and add to parent's children
        const frame = stack.pop()!;
        const node: TNode = {
          tagName: currentTagName,
          attributes: frame.attributes,
          children,
        };
        currentTagName = frame.tagName;
        children = frame.children;
        children.push(node);
        // A node whose tag name starts with '?' does not keep children:
        // they are promoted to siblings that follow it. A loop is used
        // instead of push(...spread) so that very large child lists cannot
        // exceed the engine's argument-count limit.
        if (node.tagName.charCodeAt(0) === QUESTION) {
          for (const promoted of node.children) children.push(promoted);
          node.children = [];
        }
        continue;
      } else if (S.charCodeAt(pos + 1) === BANG) {
        if (S.charCodeAt(pos + 2) === DASH) {
          // Comment: use indexOf("-->") for fast scanning
          const startCommentPos = pos;
          pos = S.indexOf('-->', pos + 3);
          if (pos === -1) {
            if (strict) throw strictError('Unclosed comment', startCommentPos);
            pos = S.length;
            if (keepComments) {
              children.push(S.substring(startCommentPos));
            }
          } else {
            pos += 2; // point to the '>'
            if (keepComments) {
              children.push(S.substring(startCommentPos, pos + 1));
            }
          }
        } else if (
          S.charCodeAt(pos + 2) === LBRACKET &&
          S.charCodeAt(pos + 8) === LBRACKET &&
          S.substring(pos + 3, pos + 8).toLowerCase() === 'cdata'
        ) {
          // CDATA: content is pushed verbatim and never entity-decoded.
          const cdataEndIndex = S.indexOf(']]>', pos);
          if (cdataEndIndex === -1) {
            if (strict) throw strictError('Unclosed CDATA section');
            children.push(S.substring(pos + 9));
            pos = S.length;
          } else {
            children.push(S.substring(pos + 9, cdataEndIndex));
            pos = cdataEndIndex + 3;
          }
          continue;
        } else {
          // Doctype / other <!...> declarations: parse as TNode.
          // Read the declaration keyword (e.g. "!DOCTYPE")
          pos += 2; // skip '<!'
          const keywordStart = pos - 1; // include the '!'
          while (pos < S.length) {
            const cc = S.charCodeAt(pos);
            if (cc <= 32 || cc === GT || cc === LBRACKET) break;
            pos++;
          }
          const declTagName = S.substring(keywordStart, pos);

          // Parse space-separated tokens as null-valued attributes
          let declAttributes: Record<string, string | null> | null = null;
          while (pos < S.length) {
            const cc = S.charCodeAt(pos);
            if (cc === GT || cc === LBRACKET) break;
            // Skip whitespace
            if (cc <= 32) {
              pos++;
              continue;
            }
            // Quoted token — the text between the quotes becomes the key
            if (cc === SQUOTE || cc === DQUOTE) {
              const closePos = S.indexOf(cc === SQUOTE ? "'" : '"', pos + 1);
              if (closePos === -1) {
                if (strict) throw strictError('Unclosed declaration');
                pos = S.length;
                break;
              }
              const token = S.substring(pos + 1, closePos);
              if (declAttributes === null) declAttributes = Object.create(null);
              declAttributes![token] = null;
              pos = closePos + 1;
              continue;
            }
            // Unquoted token
            const tokenStart = pos;
            while (pos < S.length) {
              const tc = S.charCodeAt(pos);
              if (tc <= 32 || tc === GT || tc === LBRACKET) break;
              pos++;
            }
            const token = S.substring(tokenStart, pos);
            if (declAttributes === null) declAttributes = Object.create(null);
            declAttributes![token] = null;
          }

          // Skip internal DTD subset ([...]) if present
          if (pos < S.length && S.charCodeAt(pos) === LBRACKET) {
            pos++; // skip '['
            let insideBracketSection = true;
            while (insideBracketSection && pos < S.length) {
              if (S.charCodeAt(pos) === RBRACKET) {
                insideBracketSection = false;
              } else {
                // Skip quoted strings inside internal DTD subset
                const quoteCharCode = S.charCodeAt(pos);
                if (quoteCharCode === SQUOTE || quoteCharCode === DQUOTE) {
                  pos = S.indexOf(
                    quoteCharCode === SQUOTE ? "'" : '"',
                    pos + 1,
                  );
                  if (pos === -1) {
                    pos = S.length;
                    break;
                  }
                }
              }
              pos++;
            }
            // Skip any whitespace between ] and >
            while (pos < S.length && S.charCodeAt(pos) <= 32) pos++;
          }

          if (strict && (pos >= S.length || S.charCodeAt(pos) !== GT))
            throw strictError('Unclosed declaration');

          children.push({
            tagName: declTagName,
            attributes: declAttributes,
            children: [],
          });
        }
        // Comments and declarations leave pos on their final '>'; step past it.
        pos++;
        continue;
      }

      // ---- Open tag (inline parseNode logic) ----
      pos++;
      const tagName = parseName();
      let attributes: Record<string, string | null> | null = null;

      // Parse attributes until the tag's '>' or the end of the input.
      while (S.charCodeAt(pos) !== GT && S[pos]) {
        const charCode = S.charCodeAt(pos);
        // Attribute name start: A-Z, a-z, _, :, or non-ASCII
        if (
          (charCode > 64 && charCode < 91) ||
          (charCode > 96 && charCode < 123) ||
          charCode === UNDERSCORE ||
          charCode === COLON ||
          charCode > 127
        ) {
          const name = parseName();
          // Advance to whatever follows the name: a quote (the value), the
          // start of the next attribute name, or the closing '>'.
          let code = S.charCodeAt(pos);
          while (
            code &&
            code !== SQUOTE &&
            code !== DQUOTE &&
            !(
              (code > 64 && code < 91) ||
              (code > 96 && code < 123) ||
              code === UNDERSCORE ||
              code === COLON ||
              code > 127
            ) &&
            code !== GT
          ) {
            pos++;
            code = S.charCodeAt(pos);
          }
          let value: string | null;
          if (code === SQUOTE || code === DQUOTE) {
            const valueStart = pos;
            value = parseString();
            if (pos === -1) {
              // Unterminated attribute string: the open tag runs to the end
              // of the input. Strict mode reports it; otherwise emit the
              // node with the attributes collected so far.
              if (strict) {
                throw strictError(`Unclosed tag <${tagName}>`, valueStart);
              }
              const node: TNode = { tagName, attributes, children: [] };
              children.push(node);
              // This recovery path does not strip ignorable whitespace.
              return unwindStack(false);
            }
            if (decode) value = decode(value);
          } else {
            // Attribute without a value; step back so the shared pos++
            // below re-examines the character that stopped the scan.
            value = null;
            pos--;
          }
          if (attributes === null) attributes = Object.create(null);
          attributes![name] = value;
        }
        pos++;
      }
      if (strict && !S[pos]) {
        throw strictError(`Unclosed tag <${tagName}>`);
      }

      // Determine if this node has children or is self-closing
      if (
        S.charCodeAt(pos - 1) !== SLASH &&
        S.charCodeAt(pos - 1) !== QUESTION &&
        tagName.charCodeAt(0) !== BANG
      ) {
        if (rawContentSet !== null && rawContentSet.has(tagName)) {
          // Raw content tag: everything up to the literal close tag is one
          // text child and is not parsed.
          const closeTagStr = '</' + tagName + '>';
          const start = pos + 1;
          pos = S.indexOf(closeTagStr, start);
          let rawChildren: (TNode | string)[];
          if (pos === -1) {
            // pos is -1 here, so the position must be passed explicitly or
            // the error would always report line 1, column 1.
            if (strict) throw strictError(`Unclosed tag <${tagName}>`, start);
            rawChildren = [S.substring(start)];
            pos = S.length;
          } else {
            rawChildren = [S.substring(start, pos)];
            pos += closeTagStr.length;
          }
          const node: TNode = { tagName, attributes, children: rawChildren };
          children.push(node);
          if (tagName.charCodeAt(0) === QUESTION) {
            for (const promoted of node.children) children.push(promoted);
            node.children = [];
          }
        } else if (selfClosingSet === null || !selfClosingSet.has(tagName)) {
          // Node has children — push frame and descend
          pos++;
          stack.push({ tagName: currentTagName, attributes, children });
          currentTagName = tagName;
          children = [];
        } else {
          // Self-closing tag (from selfClosingTags list)
          pos++;
          children.push({ tagName, attributes, children: [] });
        }
      } else {
        // Explicit self-closing (/>) or processing instruction (?>) or declaration
        pos++;
        children.push({ tagName, attributes, children: [] });
      }
    } else {
      // ---- Text ----
      let text = parseText();
      if (decode) text = decode(text);
      if (trimWhitespace) {
        const trimmed = text.trim();
        if (trimmed.length > 0) {
          children.push(trimmed);
        }
      } else {
        if (text.length > 0) {
          children.push(text);
        }
      }
      // parseText leaves pos on the last text character; step onto the '<'.
      pos++;
    }
  }

  // If we exit the loop for a named tag, input ended without a close tag
  if (strict && currentTagName !== '') {
    throw strictError(`Unclosed tag <${currentTagName}>`);
  }
  // Unwind any remaining stack frames (unclosed tags in non-strict mode)
  return unwindStack(true);
}
|
|
560
|
+
|
|
561
|
+
/**
 * Read the text run that starts at the cursor and extends up to (not
 * including) the next `<`, or to the end of S when there is none.
 *
 * On return `pos` is the index of the last character of the run, or
 * `S.length` when the run reached the end of the input.
 *
 * @returns The text run
 */
function parseText(): string {
  const begin = pos;
  const nextTag = S.indexOf('<', begin);
  pos = nextTag === -1 ? S.length : nextTag - 1;
  return S.substring(begin, pos + 1);
}
|
|
567
|
+
|
|
568
|
+
/**
 * Read a name token starting at the cursor. The token ends at the first
 * ASCII character flagged in `NAME_END` or at the end of the input;
 * non-ASCII characters never end it.
 *
 * On return `pos` is the index of the terminating character.
 *
 * @returns The name (empty when the cursor already sits on a terminator)
 */
function parseName(): string {
  const begin = pos;
  for (;;) {
    const code = S.charCodeAt(pos);
    // charCodeAt yields NaN once the cursor is outside the string.
    if (Number.isNaN(code)) break;
    if (code < 128 && NAME_END[code] !== 0) break;
    pos++;
  }
  return S.substring(begin, pos);
}
|
|
580
|
+
|
|
581
|
+
/**
 * Parse the single element whose `<` is at the cursor `pos`: tag name,
 * attributes and, unless the element is self-closing, its children.
 *
 * @returns The parsed element
 * @throws Error in strict mode when the open tag, one of its attribute
 *   values, or a raw-content element is left unclosed; errors thrown by
 *   `parseChildren` propagate as well
 */
function parseNode(): TNode {
  pos++; // step over '<'
  const tagName = parseName();
  // Defer attributes allocation until first attribute is found
  let attributes: Record<string, string | null> | null = null;
  let children: (TNode | string)[] = [];

  // Parse attributes until the tag's '>' or the end of the input.
  while (S.charCodeAt(pos) !== GT && S[pos]) {
    const charCode = S.charCodeAt(pos);
    // Attribute name start: A-Z, a-z, _, :, or non-ASCII
    if (
      (charCode > 64 && charCode < 91) ||
      (charCode > 96 && charCode < 123) ||
      charCode === UNDERSCORE ||
      charCode === COLON ||
      charCode > 127
    ) {
      const name = parseName();
      // Advance to whatever follows the name: a quote (the value), the
      // start of the next attribute name, or the closing '>'.
      let code = S.charCodeAt(pos);
      while (
        code &&
        code !== SQUOTE &&
        code !== DQUOTE &&
        !(
          (code > 64 && code < 91) ||
          (code > 96 && code < 123) ||
          code === UNDERSCORE ||
          code === COLON ||
          code > 127
        ) &&
        code !== GT
      ) {
        pos++;
        code = S.charCodeAt(pos);
      }
      let value: string | null;
      if (code === SQUOTE || code === DQUOTE) {
        const valueStart = pos;
        value = parseString();
        if (pos === -1) {
          // Unterminated attribute string: the open tag runs to the end of
          // the input. Strict mode reports it; otherwise return the node
          // with the attributes collected so far.
          if (strict) {
            throw strictError(`Unclosed tag <${tagName}>`, valueStart);
          }
          return { tagName, attributes, children };
        }
        if (decode) value = decode(value);
      } else {
        // Attribute without a value; step back so the shared pos++ below
        // re-examines the character that stopped the scan.
        value = null;
        pos--;
      }
      // Allocate attributes object lazily on first attribute
      if (attributes === null) attributes = Object.create(null);
      attributes![name] = value;
    }
    pos++;
  }
  if (strict && !S[pos]) {
    throw strictError(`Unclosed tag <${tagName}>`);
  }

  // Children are parsed unless the tag is self-closing: explicit />,
  // processing instruction ?>, or declaration <!...>
  if (
    S.charCodeAt(pos - 1) !== SLASH &&
    S.charCodeAt(pos - 1) !== QUESTION &&
    tagName.charCodeAt(0) !== BANG
  ) {
    if (rawContentSet !== null && rawContentSet.has(tagName)) {
      // Raw content tag: scan for the matching close tag and emit content as raw text
      const closeTag = '</' + tagName + '>';
      const start = pos + 1;
      pos = S.indexOf(closeTag, start);
      if (pos === -1) {
        // pos is -1 here, so the position must be passed explicitly or the
        // error would always report line 1, column 1.
        if (strict) throw strictError(`Unclosed tag <${tagName}>`, start);
        // Unclosed raw content tag: consume the rest of the string
        children = [S.substring(start)];
        pos = S.length;
      } else {
        children = [S.substring(start, pos)];
        pos += closeTag.length;
      }
    } else if (selfClosingSet === null || !selfClosingSet.has(tagName)) {
      pos++; // step over '>'
      children = parseChildren(tagName);
    } else {
      // Listed in selfClosingTags: no children, just step over '>'.
      pos++;
    }
  } else {
    pos++;
  }
  return { tagName, attributes, children };
}
|
|
670
|
+
|
|
671
|
+
/**
 * Read a quoted string whose opening quote is at the cursor. A single
 * quote at the cursor selects `'` as the delimiter; anything else selects `"`.
 *
 * On return `pos` is the index of the closing quote, or -1 when there is
 * none — callers must check for -1 before using the returned text.
 *
 * @returns The text between the quotes
 */
function parseString(): string {
  const quote = S.charCodeAt(pos) === SQUOTE ? "'" : '"';
  const valueStart = pos + 1;
  pos = S.indexOf(quote, valueStart);
  return S.substring(valueStart, pos);
}
|
|
677
|
+
|
|
678
|
+
// Search expression for attribute-filter mode, compiled once instead of on
// every loop iteration. The attribute name falls back to 'id'; the fallback
// is kept in a local so the caller's options object is never written to
// (assigning to it would throw for a frozen object in module code).
// The pattern is null when attrValue is missing or empty, in which case
// nothing can match.
const searchAttrName = resolvedOptions.attrName || 'id';
const attrPattern: RegExp | null = resolvedOptions.attrValue
  ? new RegExp(
      '\\s' +
        escapeRegExp(searchAttrName) +
        '\\s*=[\'"]' +
        escapeRegExp(resolvedOptions.attrValue) +
        '[\'"]',
    )
  : null;

/**
 * Find the first place in S where the searched attribute occurs with the
 * searched value in quotes.
 *
 * @returns Index of the whitespace character that precedes the attribute
 *   name, or -1 when there is no match
 */
function findElements(): number {
  if (attrPattern === null) return -1;
  return S.search(attrPattern);
}

let out: (TNode | string)[] | TNode;

if (resolvedOptions.attrValue !== undefined) {
  // Attribute-filter mode: collect every element carrying the attribute.
  const results: (TNode | string)[] = [];

  while ((pos = findElements()) !== -1) {
    // Back up to the '<' that opens the tag containing the match.
    pos = S.lastIndexOf('<', pos);
    if (pos !== -1) {
      results.push(parseNode());
    }
    // Continue the search in the not-yet-parsed tail. When pos is -1 (no
    // '<' before the match, or parseNode hit an unterminated attribute)
    // slice(-1) leaves a single character, which ends the loop on the next
    // search. NOTE: positions reported by strictError after this point are
    // relative to the sliced tail, not to the original input.
    S = S.slice(pos);
    pos = 0;
  }
  out = results;
} else if (resolvedOptions.parseNode) {
  out = parseNode();
} else {
  out = parseChildren('');
}

// The filter is only applied to list results, never to a single node.
if (resolvedOptions.filter && Array.isArray(out)) {
  out = filter(out, resolvedOptions.filter);
}

// setPos only affects single-node results: record where parsing stopped.
if (
  resolvedOptions.setPos &&
  typeof out === 'object' &&
  !Array.isArray(out)
) {
  (out as TNodeWithPos).pos = pos;
}

return out as (TNode | string)[];
|
|
728
|
+
}
|