@eksml/xml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +588 -0
  3. package/dist/converters/fromLossless.d.mts +14 -0
  4. package/dist/converters/fromLossless.d.mts.map +1 -0
  5. package/dist/converters/fromLossless.mjs +35 -0
  6. package/dist/converters/fromLossless.mjs.map +1 -0
  7. package/dist/converters/fromLossy.d.mts +18 -0
  8. package/dist/converters/fromLossy.d.mts.map +1 -0
  9. package/dist/converters/fromLossy.mjs +91 -0
  10. package/dist/converters/fromLossy.mjs.map +1 -0
  11. package/dist/converters/lossless.d.mts +39 -0
  12. package/dist/converters/lossless.d.mts.map +1 -0
  13. package/dist/converters/lossless.mjs +74 -0
  14. package/dist/converters/lossless.mjs.map +1 -0
  15. package/dist/converters/lossy.d.mts +42 -0
  16. package/dist/converters/lossy.d.mts.map +1 -0
  17. package/dist/converters/lossy.mjs +158 -0
  18. package/dist/converters/lossy.mjs.map +1 -0
  19. package/dist/htmlConstants-D6fsKbZ-.mjs +30 -0
  20. package/dist/htmlConstants-D6fsKbZ-.mjs.map +1 -0
  21. package/dist/parser-BfdEfWDg.d.mts +95 -0
  22. package/dist/parser-BfdEfWDg.d.mts.map +1 -0
  23. package/dist/parser-CYq309aR.mjs +479 -0
  24. package/dist/parser-CYq309aR.mjs.map +1 -0
  25. package/dist/parser.d.mts +2 -0
  26. package/dist/parser.mjs +2 -0
  27. package/dist/sax.d.mts +64 -0
  28. package/dist/sax.d.mts.map +1 -0
  29. package/dist/sax.mjs +70 -0
  30. package/dist/sax.mjs.map +1 -0
  31. package/dist/saxEngine-BDnD7ruG.mjs +750 -0
  32. package/dist/saxEngine-BDnD7ruG.mjs.map +1 -0
  33. package/dist/utilities/index.d.mts +88 -0
  34. package/dist/utilities/index.d.mts.map +1 -0
  35. package/dist/utilities/index.mjs +87 -0
  36. package/dist/utilities/index.mjs.map +1 -0
  37. package/dist/writer.d.mts +58 -0
  38. package/dist/writer.d.mts.map +1 -0
  39. package/dist/writer.mjs +357 -0
  40. package/dist/writer.mjs.map +1 -0
  41. package/dist/xmlParseStream.d.mts +138 -0
  42. package/dist/xmlParseStream.d.mts.map +1 -0
  43. package/dist/xmlParseStream.mjs +313 -0
  44. package/dist/xmlParseStream.mjs.map +1 -0
  45. package/package.json +100 -0
  46. package/src/converters/fromLossless.ts +80 -0
  47. package/src/converters/fromLossy.ts +180 -0
  48. package/src/converters/lossless.ts +116 -0
  49. package/src/converters/lossy.ts +274 -0
  50. package/src/parser.ts +728 -0
  51. package/src/sax.ts +157 -0
  52. package/src/saxEngine.ts +1157 -0
  53. package/src/utilities/escapeRegExp.ts +19 -0
  54. package/src/utilities/filter.ts +63 -0
  55. package/src/utilities/getElementById.ts +21 -0
  56. package/src/utilities/getElementsByClassName.ts +22 -0
  57. package/src/utilities/htmlConstants.ts +26 -0
  58. package/src/utilities/index.ts +7 -0
  59. package/src/utilities/isElementNode.ts +19 -0
  60. package/src/utilities/isTextNode.ts +19 -0
  61. package/src/utilities/toContentString.ts +23 -0
  62. package/src/writer.ts +650 -0
  63. package/src/xmlParseStream.ts +597 -0
package/src/parser.ts ADDED
@@ -0,0 +1,728 @@
1
+ import { decodeXML, decodeHTML } from 'entities';
2
+ import { escapeRegExp } from '#src/utilities/escapeRegExp.ts';
3
+ import { filter } from '#src/utilities/filter.ts';
4
+ import {
5
+ HTML_VOID_ELEMENTS,
6
+ HTML_RAW_CONTENT_TAGS,
7
+ } from '#src/utilities/htmlConstants.ts';
8
+ // @generated:char-codes:begin
9
+ const LT = 60; // '<' — opens tags, close tags, comments, CDATA sections and declarations
10
+ const GT = 62; // '>' — ends a tag or declaration
11
+ const SLASH = 47; // '/' — second char of a close tag `</`, last char before '>' in `/>`
12
+ const BANG = 33; // '!' — second char of `<!` (comment, CDATA, DOCTYPE-style declaration)
13
+ const QUESTION = 63; // '?' — first char of a processing-instruction name, last char before '>' in `?>`
14
+ const LBRACKET = 91; // '[' — `<![CDATA[` and the start of an internal DTD subset
15
+ const RBRACKET = 93; // ']' — end of an internal DTD subset
16
+ const SQUOTE = 39; // ''' — single-quoted attribute value / declaration token
17
+ const DQUOTE = 34; // '"' — double-quoted attribute value / declaration token
18
+ const DASH = 45; // '-' — third char of a comment opener `<!-`
19
+ const UNDERSCORE = 95; // '_' — accepted as the first char of an attribute name
20
+ const COLON = 58; // ':' — accepted as the first char of an attribute name
21
+ // @generated:char-codes:end
22
+
23
/**
 * One node of the tree produced by `parse`.
 *
 * Besides ordinary elements this shape is also used for processing
 * instructions (tag name starts with `?`) and `<!...>` declarations
 * (tag name starts with `!`).
 */
export interface TNode {
  /** Name as it appears after the opening `<`. */
  tagName: string;
  /**
   * Attribute map, or `null` when the tag carries no attributes at all.
   * Each entry is one of:
   * - a string: the attribute has a value (`<div id="test">` -> `{id: "test"}`)
   * - `null`: the attribute is present without a value (`<input disabled>` -> `{disabled: null}`)
   * - `""`: the attribute has an explicitly empty value (`<input value="">` -> `{value: ""}`)
   */
  attributes: { [name: string]: string | null } | null;
  /** Child nodes in document order; text is represented as plain strings. */
  children: Array<TNode | string>;
}
38
+
39
/**
 * A `TNode` that additionally records the parser cursor. `parse` attaches
 * `pos` to a single-node result when the internal `setPos` option is set.
 */
type TNodeWithPos = TNode & {
  /** Offset in the input at which parsing stopped. */
  pos: number;
};
45
+
46
/**
 * Options accepted by `parse`.
 */
export interface ParseOptions {
  /**
   * Offset in the input at which parsing starts (default `0`).
   * Not used in attribute-search mode (`attrValue` set), where the start
   * offset is derived from each match instead.
   */
  pos?: number;
  /**
   * Array of tag names that are self-closing (void elements) and don't need closing tags.
   * In XML mode (default), this defaults to `[]` — self-closing is detected only by `/>` syntax.
   * In HTML mode (`html: true`), this defaults to the standard HTML void elements.
   * Can be overridden explicitly regardless of mode.
   */
  selfClosingTags?: string[];
  /**
   * Array of tag names whose content should be treated as raw text, not parsed as XML/HTML.
   * The parser scans for the literal `</tagName>` close tag and emits everything in between
   * as a single text child. Raw content is never entity-decoded or trimmed.
   *
   * In XML mode (default), this defaults to `[]`.
   * In HTML mode (`html: true`), this defaults to `["script", "style"]`.
   * Can be overridden explicitly regardless of mode.
   */
  rawContentTags?: string[];
  /**
   * Enable HTML parsing mode. When `true`, sets sensible defaults for:
   * - `selfClosingTags`: standard HTML void elements (area, base, br, col, embed, hr, img, input, link, meta, param, source, track, wbr)
   * - `rawContentTags`: elements whose content is raw text (script, style)
   *
   * These defaults can be overridden by explicitly passing `selfClosingTags` or `rawContentTags`.
   * It also selects the HTML entity decoder when `entities` is enabled.
   */
  html?: boolean;
  /** Keep XML comments in the output; each comment is emitted as a string child including its `<!--` / `-->` delimiters. */
  keepComments?: boolean;
  /**
   * Trim whitespace from text nodes and discard text nodes that become empty.
   * CDATA and raw-content text are not trimmed.
   */
  trimWhitespace?: boolean;
  /**
   * Strict mode: throw on malformed XML instead of recovering silently.
   * Catches unclosed comments, CDATA sections, processing instructions,
   * close tags, and open tags that reach end-of-input without closing.
   *
   * Note: a close tag that does not match the currently open element
   * throws even when `strict` is off.
   */
  strict?: boolean;
  /**
   * Decode XML/HTML entities in text content and attribute values.
   * When enabled, named entities (`&amp;`, `&lt;`, etc.), decimal character
   * references (`&#228;`), and hex character references (`&#xe4;`) are decoded.
   *
   * In HTML mode (`html: true`), the full set of HTML named entities is
   * supported (e.g. `&nbsp;`, `&copy;`, `&mdash;`). In XML mode, only the
   * five standard XML entities plus numeric references are decoded.
   *
   * CDATA sections are never decoded regardless of this setting.
   *
   * Only the exact value `true` enables decoding. Defaults to `false` —
   * entities are preserved as-is in the output.
   */
  entities?: boolean;
  /**
   * Attribute name to search for (used together with `attrValue`).
   * Defaults to `"id"` when `attrValue` is given without a name.
   */
  attrName?: string;
  /**
   * Attribute value to search for. Setting this switches `parse` into
   * attribute-search mode: only elements carrying a quoted
   * `attrName="attrValue"` pair are parsed and returned.
   *
   * The value is passed through `escapeRegExp` before it is embedded in the
   * search pattern, so it is presumably matched literally rather than as a
   * regular expression (NOTE(review): confirm against `escapeRegExp`).
   * An empty string never matches.
   */
  attrValue?: string;
  /**
   * Predicate handed to the `filter` utility together with the parsed node
   * list; only applied when the parse result is an array.
   */
  filter?: (node: TNode, index: number, depth: number, path: string) => boolean;
}
108
+
109
/**
 * Superset of `ParseOptions` carrying switches that only callers inside the
 * package use; it is not exported.
 */
interface InternalParseOptions extends ParseOptions {
  /** Parse exactly one node at the start offset instead of a list of nodes. */
  parseNode?: boolean;
  /** Attach a `pos` property (where parsing stopped) to a single-node result. */
  setPos?: boolean;
}
116
+
117
// Lookup table indexed by ASCII code: 1 marks a character that terminates a
// name token — tab (9), line feed (10), carriage return (13), space (32),
// '/' (47), '=' (61) and '>' (62). Every other slot stays 0.
const NAME_END = new Uint8Array(128);
for (const terminator of [9, 10, 13, 32, 47, 61, 62]) {
  NAME_END[terminator] = 1;
}
127
+
128
+ /**
129
+ * Parse XML/HTML into a DOM Object with minimal validation and fault tolerance
130
+ * @param S - The XML string to parse
131
+ * @param options - Parsing options
132
+ * @returns Array of parsed nodes and text content
133
+ */
134
+ export function parse(
135
+ S: string,
136
+ options?: ParseOptions | InternalParseOptions,
137
+ ): (TNode | string)[] {
138
// Treat a missing options argument as "all defaults".
const resolvedOptions = (options || {}) as InternalParseOptions;

// Cursor into `S`; read and advanced by every nested helper below.
let pos = resolvedOptions.pos || 0;

// Boolean switches, coerced once so the hot loops test plain booleans.
const htmlMode = Boolean(resolvedOptions.html);
const strict = Boolean(resolvedOptions.strict);
const keepComments = Boolean(resolvedOptions.keepComments);
const trimWhitespace = Boolean(resolvedOptions.trimWhitespace);

// Entity decoding is opt-in (`entities: true` exactly); the decoder flavour
// follows the parsing mode. `null` means "leave entities untouched".
const decode =
  resolvedOptions.entities !== true
    ? null
    : htmlMode
      ? decodeHTML
      : decodeXML;

// Explicit lists win; otherwise HTML mode supplies its defaults and XML
// mode uses empty lists.
const selfClosingTagList: string[] =
  resolvedOptions.selfClosingTags ?? (htmlMode ? HTML_VOID_ELEMENTS : []);
const rawContentTagList: string[] =
  resolvedOptions.rawContentTags ?? (htmlMode ? HTML_RAW_CONTENT_TAGS : []);

// Sets give O(1) membership tests; an empty list becomes `null` so callers
// can skip the lookup entirely.
const toLookup = (names: string[]): Set<string> | null =>
  names.length === 0 ? null : new Set(names);
const selfClosingSet: Set<string> | null = toLookup(selfClosingTagList);
const rawContentSet: Set<string> | null = toLookup(rawContentTagList);
162
+
163
/**
 * Create (without throwing) an Error whose message is suffixed with the
 * 1-based line and column of an offset in `S`. Lines are delimited by `\n`.
 * @param message - Description of the problem.
 * @param atPos - Offset to report; the current cursor `pos` when omitted.
 * @returns The Error object; callers `throw` it themselves.
 */
function strictError(message: string, atPos?: number): Error {
  const offset = atPos === undefined ? pos : atPos;
  const consumed = S.substring(0, offset);

  // Line number = 1 + number of line feeds before the offset.
  let line = 1;
  for (
    let newline = consumed.indexOf('\n');
    newline !== -1;
    newline = consumed.indexOf('\n', newline + 1)
  ) {
    line++;
  }

  // Column = characters since the last line feed, 1-based. With no line
  // feed `lastIndexOf` yields -1, which conveniently gives `length + 1`.
  const column = consumed.length - consumed.lastIndexOf('\n');

  return new Error(`${message} at line ${line}, column ${column}`);
}
172
+
173
/**
 * Drop whitespace-only text nodes from a pure element container.
 *
 * The array is only touched when it holds at least one element node, at
 * least one whitespace-only string, and no string with visible content.
 * Mixed content (any non-blank text) and text-only lists are left exactly
 * as they are so meaningful whitespace survives.
 *
 * @param children - Child list to compact; mutated in place.
 */
function stripIgnorableWhitespace(children: (TNode | string)[]): void {
  let sawElement = false;
  let sawBlankText = false;

  for (let i = 0; i < children.length; i++) {
    const child = children[i]!;
    if (typeof child !== 'string') {
      sawElement = true;
    } else if (child.trim().length === 0) {
      sawBlankText = true;
    } else {
      // Real text present: this is mixed content, nothing to strip.
      return;
    }
  }

  if (!sawElement || !sawBlankText) return;

  // At this point every string in the list is blank, so keeping only the
  // element nodes is the whole job. A write cursor avoids repeated splices.
  let kept = 0;
  for (let i = 0; i < children.length; i++) {
    const child = children[i]!;
    if (typeof child !== 'string') {
      children[kept++] = child;
    }
  }
  children.length = kept;
}
208
+
209
/**
 * Parse a run of sibling nodes starting at the cursor `pos`.
 *
 * Tree building is iterative: every open tag that can have children pushes
 * a frame on an explicit stack, so deeply nested input cannot overflow the
 * call stack.
 *
 * @param rootTagName - Name of the element whose content is being parsed;
 *   pass `''` for document level. Parsing returns as soon as the matching
 *   close tag has been consumed.
 * @returns The nodes and text found, in document order.
 * @throws Error when a close tag does not match the open element (always),
 *   and — in strict mode only — on unclosed comments, CDATA sections,
 *   declarations, tags, close tags and unterminated attribute values.
 */
function parseChildren(rootTagName: string): (TNode | string)[] {
  // A frame remembers the *parent* context (its tag name and child list)
  // together with the attributes of the element that was just opened.
  interface Frame {
    tagName: string;
    attributes: Record<string, string | null> | null;
    children: (TNode | string)[];
  }
  const stack: Frame[] = [];
  let currentTagName = rootTagName;
  let children: (TNode | string)[] = [];

  /** Accepted first character of an attribute name: A-Z, a-z, `_`, `:`, non-ASCII. */
  const isNameStart = (code: number): boolean =>
    (code > 64 && code < 91) ||
    (code > 96 && code < 123) ||
    code === UNDERSCORE ||
    code === COLON ||
    code > 127;

  /**
   * Recovery helper: close every element that is still open, attaching each
   * one to its parent, and return the outermost child list.
   * `strip` controls whether ignorable whitespace is removed on the way out.
   */
  const closeOpenFrames = (strip: boolean): (TNode | string)[] => {
    if (strip) stripIgnorableWhitespace(children);
    while (stack.length > 0) {
      const frame = stack.pop()!;
      const node: TNode = {
        tagName: currentTagName,
        attributes: frame.attributes,
        children,
      };
      currentTagName = frame.tagName;
      children = frame.children;
      children.push(node);
      if (strip) stripIgnorableWhitespace(children);
    }
    return children;
  };

  while (S[pos]) {
    if (S.charCodeAt(pos) !== LT) {
      // ---- Text ----
      let text = parseText();
      if (decode) text = decode(text);
      if (trimWhitespace) text = text.trim();
      if (text.length > 0) children.push(text);
      pos++; // parseText leaves the cursor on the last text character
      continue;
    }

    const next = S.charCodeAt(pos + 1);

    if (next === SLASH) {
      // ---- Close tag ----
      const closeStart = pos + 2;
      pos = S.indexOf('>', pos);

      if (pos === -1) {
        if (strict) throw strictError('Unclosed close tag', closeStart - 2);
        pos = S.length;
        return closeOpenFrames(true);
      }

      const closeTag = S.substring(closeStart, pos).trimEnd();
      if (closeTag !== currentTagName) {
        // Thrown regardless of strict mode.
        throw strictError(
          `Unexpected close tag </${closeTag}> (expected </${currentTagName}>)`,
        );
      }

      pos++; // step past '>'
      stripIgnorableWhitespace(children);

      if (stack.length === 0) {
        // The element this call was started for is closed.
        return children;
      }

      // Finalize the element and hand it to its parent.
      const frame = stack.pop()!;
      const node: TNode = {
        tagName: currentTagName,
        attributes: frame.attributes,
        children,
      };
      currentTagName = frame.tagName;
      children = frame.children;
      children.push(node);
      // A processing instruction keeps no children: whatever was parsed
      // "inside" it is promoted to siblings that follow it.
      if (node.tagName.charCodeAt(0) === QUESTION) {
        for (const promoted of node.children) children.push(promoted);
        node.children = [];
      }
      continue;
    }

    if (next === BANG) {
      if (S.charCodeAt(pos + 2) === DASH) {
        // ---- Comment ----
        const startCommentPos = pos;
        pos = S.indexOf('-->', pos + 3);
        if (pos === -1) {
          if (strict) throw strictError('Unclosed comment', startCommentPos);
          pos = S.length;
          if (keepComments) children.push(S.substring(startCommentPos));
        } else {
          pos += 2; // point to the '>'
          if (keepComments) {
            children.push(S.substring(startCommentPos, pos + 1));
          }
        }
      } else if (
        S.charCodeAt(pos + 2) === LBRACKET &&
        S.charCodeAt(pos + 8) === LBRACKET &&
        S.substring(pos + 3, pos + 8).toLowerCase() === 'cdata'
      ) {
        // ---- CDATA: content is emitted verbatim, never decoded ----
        const cdataEndIndex = S.indexOf(']]>', pos);
        if (cdataEndIndex === -1) {
          if (strict) throw strictError('Unclosed CDATA section');
          children.push(S.substring(pos + 9));
          pos = S.length;
        } else {
          children.push(S.substring(pos + 9, cdataEndIndex));
          pos = cdataEndIndex + 3;
        }
        continue;
      } else {
        // ---- DOCTYPE / other <!...> declaration, emitted as a TNode ----
        pos += 2; // skip '<!'
        const keywordStart = pos - 1; // the tag name includes the '!'
        while (pos < S.length) {
          const cc = S.charCodeAt(pos);
          if (cc <= 32 || cc === GT || cc === LBRACKET) break;
          pos++;
        }
        const declTagName = S.substring(keywordStart, pos);

        // Each whitespace-separated token becomes a null-valued attribute.
        let declAttributes: Record<string, string | null> | null = null;
        while (pos < S.length) {
          const cc = S.charCodeAt(pos);
          if (cc === GT || cc === LBRACKET) break;
          if (cc <= 32) {
            pos++;
            continue;
          }
          if (cc === SQUOTE || cc === DQUOTE) {
            // Quoted token: the key is the text between the quotes
            // (the quotes themselves are not part of the key).
            const closePos = S.indexOf(cc === SQUOTE ? "'" : '"', pos + 1);
            if (closePos === -1) {
              if (strict) throw strictError('Unclosed declaration');
              pos = S.length;
              break;
            }
            const token = S.substring(pos + 1, closePos);
            if (declAttributes === null) declAttributes = Object.create(null);
            declAttributes![token] = null;
            pos = closePos + 1;
            continue;
          }
          // Unquoted token
          const tokenStart = pos;
          while (pos < S.length) {
            const tc = S.charCodeAt(pos);
            if (tc <= 32 || tc === GT || tc === LBRACKET) break;
            pos++;
          }
          const token = S.substring(tokenStart, pos);
          if (declAttributes === null) declAttributes = Object.create(null);
          declAttributes![token] = null;
        }

        // Skip an internal DTD subset ([...]) if present; its content is
        // not represented in the output.
        if (pos < S.length && S.charCodeAt(pos) === LBRACKET) {
          pos++; // skip '['
          let insideBracketSection = true;
          while (insideBracketSection && pos < S.length) {
            if (S.charCodeAt(pos) === RBRACKET) {
              insideBracketSection = false;
            } else {
              // Jump over quoted strings so a ']' inside them is ignored.
              const quoteCharCode = S.charCodeAt(pos);
              if (quoteCharCode === SQUOTE || quoteCharCode === DQUOTE) {
                pos = S.indexOf(
                  quoteCharCode === SQUOTE ? "'" : '"',
                  pos + 1,
                );
                if (pos === -1) {
                  pos = S.length;
                  break;
                }
              }
            }
            pos++;
          }
          // Skip any whitespace between ']' and '>'
          while (pos < S.length && S.charCodeAt(pos) <= 32) pos++;
        }

        if (strict && (pos >= S.length || S.charCodeAt(pos) !== GT))
          throw strictError('Unclosed declaration');

        children.push({
          tagName: declTagName,
          attributes: declAttributes,
          children: [],
        } as TNode);
      }
      pos++; // step past the closing '>'
      continue;
    }

    // ---- Open tag ----
    pos++;
    const tagName = parseName();
    let attributes: Record<string, string | null> | null = null;

    while (S.charCodeAt(pos) !== GT && S[pos]) {
      if (isNameStart(S.charCodeAt(pos))) {
        const name = parseName();
        // Advance to whatever follows the name: a quote (value), the start
        // of the next attribute name, or '>'.
        let code = S.charCodeAt(pos);
        while (
          code &&
          code !== SQUOTE &&
          code !== DQUOTE &&
          !isNameStart(code) &&
          code !== GT
        ) {
          pos++;
          code = S.charCodeAt(pos);
        }
        let value: string | null;
        if (code === SQUOTE || code === DQUOTE) {
          const quotePos = pos;
          value = parseString();
          if (pos === -1) {
            // The closing quote is missing, so the tag never ends.
            if (strict) {
              throw strictError(
                `Unterminated attribute value in <${tagName}>`,
                quotePos,
              );
            }
            // Lenient recovery: emit the tag with the attributes collected
            // so far. This path has never stripped ignorable whitespace
            // while unwinding; that behaviour is preserved.
            children.push({ tagName, attributes, children: [] });
            return closeOpenFrames(false);
          }
          if (decode) value = decode(value);
        } else {
          value = null;
          pos--; // compensate for the pos++ below; nothing was consumed
        }
        if (attributes === null) attributes = Object.create(null);
        attributes![name] = value;
      }
      pos++;
    }
    if (strict && !S[pos]) {
      throw strictError(`Unclosed tag <${tagName}>`);
    }

    // The character just before '>' decides whether the tag closes itself
    // (`/>` or `?>`); declarations never have children either.
    const beforeEnd = S.charCodeAt(pos - 1);
    const closedInline =
      beforeEnd === SLASH ||
      beforeEnd === QUESTION ||
      tagName.charCodeAt(0) === BANG;

    if (closedInline) {
      pos++;
      children.push({ tagName, attributes, children: [] });
    } else if (rawContentSet !== null && rawContentSet.has(tagName)) {
      // Raw content tag: everything up to the literal close tag is one
      // text child, taken verbatim.
      const closeTagStr = '</' + tagName + '>';
      const start = pos + 1;
      pos = S.indexOf(closeTagStr, start);
      let rawText: string;
      if (pos === -1) {
        if (strict) throw strictError(`Unclosed tag <${tagName}>`);
        rawText = S.substring(start);
        pos = S.length;
      } else {
        rawText = S.substring(start, pos);
        pos += closeTagStr.length;
      }
      if (tagName.charCodeAt(0) === QUESTION) {
        // Processing instructions keep no children: promote the raw text
        // to a sibling that follows the node.
        children.push({ tagName, attributes, children: [] }, rawText);
      } else {
        children.push({ tagName, attributes, children: [rawText] });
      }
    } else if (selfClosingSet === null || !selfClosingSet.has(tagName)) {
      // Element with content: remember the parent context and descend.
      pos++;
      stack.push({ tagName: currentTagName, attributes, children });
      currentTagName = tagName;
      children = [];
    } else {
      // Void element from the selfClosingTags list.
      pos++;
      children.push({ tagName, attributes, children: [] });
    }
  }

  // Input ended while a named element was still open.
  if (strict && currentTagName !== '') {
    throw strictError(`Unclosed tag <${currentTagName}>`);
  }
  return closeOpenFrames(true);
}
560
+
561
/**
 * Read text from the cursor up to (not including) the next `<`, or to the
 * end of input when there is none.
 *
 * On return `pos` rests on the last character of the text (one before the
 * `<`), or equals `S.length` when no `<` was found; the caller advances it.
 * @returns The raw, undecoded text.
 */
function parseText(): string {
  const begin = pos;
  const nextTag = S.indexOf('<', begin);
  pos = nextTag === -1 ? S.length : nextTag - 1;
  return S.substring(begin, pos + 1);
}
567
+
568
/**
 * Read a name token (tag or attribute name) starting at the cursor.
 *
 * Scanning stops at the first ASCII character flagged in `NAME_END` or at
 * the end of input; non-ASCII characters are always part of the name.
 * On return `pos` rests on the terminating character.
 * @returns The name, possibly empty.
 */
function parseName(): string {
  const begin = pos;
  for (;;) {
    const code = S.charCodeAt(pos);
    // charCodeAt yields NaN once the cursor is outside the string.
    if (Number.isNaN(code)) break;
    if (code < 128 && NAME_END[code] !== 0) break;
    pos++;
  }
  return S.substring(begin, pos);
}
580
+
581
/**
 * Parse a single element whose `<` is at the cursor, including its
 * attributes and (unless it closes itself) its content.
 *
 * On return `pos` is just past the element, or `-1` when recovery stopped
 * inside an unterminated attribute value in lenient mode.
 *
 * @returns The parsed node.
 * @throws Error in strict mode when the tag, a raw-content element, or an
 *   attribute value is not terminated before end of input.
 */
function parseNode(): TNode {
  /** Accepted first character of an attribute name: A-Z, a-z, `_`, `:`, non-ASCII. */
  const isNameStart = (code: number): boolean =>
    (code > 64 && code < 91) ||
    (code > 96 && code < 123) ||
    code === UNDERSCORE ||
    code === COLON ||
    code > 127;

  pos++; // skip '<'
  const tagName = parseName();
  // The attribute map is allocated lazily, on the first attribute found.
  let attributes: Record<string, string | null> | null = null;

  while (S.charCodeAt(pos) !== GT && S[pos]) {
    if (isNameStart(S.charCodeAt(pos))) {
      const name = parseName();
      // Advance to whatever follows the name: a quote (value), the start of
      // the next attribute name, or '>'.
      let code = S.charCodeAt(pos);
      while (
        code &&
        code !== SQUOTE &&
        code !== DQUOTE &&
        !isNameStart(code) &&
        code !== GT
      ) {
        pos++;
        code = S.charCodeAt(pos);
      }
      let value: string | null;
      if (code === SQUOTE || code === DQUOTE) {
        const quotePos = pos;
        value = parseString();
        if (pos === -1) {
          // The closing quote is missing, so the tag never ends.
          if (strict) {
            throw strictError(
              `Unterminated attribute value in <${tagName}>`,
              quotePos,
            );
          }
          // Lenient recovery: return what was collected so far.
          return { tagName, attributes, children: [] };
        }
        if (decode) value = decode(value);
      } else {
        value = null;
        pos--; // compensate for the pos++ below; nothing was consumed
      }
      if (attributes === null) attributes = Object.create(null);
      attributes![name] = value;
    }
    pos++;
  }
  if (strict && !S[pos]) {
    throw strictError(`Unclosed tag <${tagName}>`);
  }

  // `/>`, `?>` and `<!...>` tags have no content.
  const beforeEnd = S.charCodeAt(pos - 1);
  if (
    beforeEnd === SLASH ||
    beforeEnd === QUESTION ||
    tagName.charCodeAt(0) === BANG
  ) {
    pos++;
    return { tagName, attributes, children: [] };
  }

  if (rawContentSet !== null && rawContentSet.has(tagName)) {
    // Raw content tag: everything up to the literal close tag is a single
    // text child, taken verbatim.
    const closeTag = '</' + tagName + '>';
    const start = pos + 1;
    pos = S.indexOf(closeTag, start);
    let rawText: string;
    if (pos === -1) {
      if (strict) throw strictError(`Unclosed tag <${tagName}>`);
      // Unclosed raw content tag: consume the rest of the input.
      rawText = S.substring(start);
      pos = S.length;
    } else {
      rawText = S.substring(start, pos);
      pos += closeTag.length;
    }
    return { tagName, attributes, children: [rawText] };
  }

  pos++; // step past '>'
  const isVoid = selfClosingSet !== null && selfClosingSet.has(tagName);
  const children: (TNode | string)[] = isVoid ? [] : parseChildren(tagName);
  return { tagName, attributes, children };
}
670
+
671
/**
 * Read a quoted value; the cursor must be on the opening quote.
 *
 * A single quote at the cursor selects `'` as the terminator, anything
 * else selects `"`. On return `pos` rests on the closing quote, or is `-1`
 * when the value is unterminated — callers must check for that, and the
 * returned string is not meaningful in that case.
 * @returns The text between the quotes, undecoded.
 */
function parseString(): string {
  const terminator = S.charCodeAt(pos) === SQUOTE ? "'" : '"';
  const from = pos + 1;
  pos = S.indexOf(terminator, from);
  return S.substring(from, pos);
}
677
+
678
// ---- Attribute-search support -------------------------------------------
// The attribute name falls back to "id". It is kept in a local so the
// caller's options object is never modified.
const searchAttrName = resolvedOptions.attrName || 'id';
const searchAttrValue = resolvedOptions.attrValue;
// Compiled once up front (only in attribute-search mode, and only for a
// non-empty value) instead of on every loop iteration. Matches
// `<whitespace>name<optional whitespace>=<quote>value<quote>`.
const attrSearchPattern: RegExp | null =
  searchAttrValue !== undefined && searchAttrValue
    ? new RegExp(
        '\\s' +
          escapeRegExp(searchAttrName) +
          '\\s*=[\'"]' +
          escapeRegExp(searchAttrValue) +
          '[\'"]',
      )
    : null;

/**
 * Find the next occurrence of the searched attribute in the current `S`.
 * @returns Index of the whitespace preceding the attribute name, or `-1`
 *   when there is no (further) match or no usable search value.
 */
function findElements(): number {
  if (attrSearchPattern === null) return -1;
  const matchResult = attrSearchPattern.exec(S);
  return matchResult ? matchResult.index : -1;
}

let out: (TNode | string)[] | TNode;

if (resolvedOptions.attrValue !== undefined) {
  // Attribute-search mode: parse only the elements carrying the attribute.
  const results: (TNode | string)[] = [];

  while ((pos = findElements()) !== -1) {
    // Back up to the '<' of the tag that owns the matched attribute.
    pos = S.lastIndexOf('<', pos);
    if (pos === -1) break; // match is not inside any tag — nothing to parse
    results.push(parseNode());
    if (pos === -1) break; // parseNode gave up on an unterminated attribute
    // Drop the consumed prefix so the next search starts after this node.
    // The nested helpers read `S`, so the parameter itself is re-pointed.
    S = S.slice(pos);
    pos = 0;
  }
  out = results;
} else if (resolvedOptions.parseNode) {
  out = parseNode();
} else {
  out = parseChildren('');
}

// The filter only applies to node lists, not to a single-node result.
if (resolvedOptions.filter && Array.isArray(out)) {
  out = filter(out, resolvedOptions.filter);
}

// Internal callers can ask for the stop position on a single-node result.
if (
  resolvedOptions.setPos &&
  typeof out === 'object' &&
  !Array.isArray(out)
) {
  (out as TNodeWithPos).pos = pos;
}

return out as (TNode | string)[];
728
+ }