@eksml/xml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +588 -0
  3. package/dist/converters/fromLossless.d.mts +14 -0
  4. package/dist/converters/fromLossless.d.mts.map +1 -0
  5. package/dist/converters/fromLossless.mjs +35 -0
  6. package/dist/converters/fromLossless.mjs.map +1 -0
  7. package/dist/converters/fromLossy.d.mts +18 -0
  8. package/dist/converters/fromLossy.d.mts.map +1 -0
  9. package/dist/converters/fromLossy.mjs +91 -0
  10. package/dist/converters/fromLossy.mjs.map +1 -0
  11. package/dist/converters/lossless.d.mts +39 -0
  12. package/dist/converters/lossless.d.mts.map +1 -0
  13. package/dist/converters/lossless.mjs +74 -0
  14. package/dist/converters/lossless.mjs.map +1 -0
  15. package/dist/converters/lossy.d.mts +42 -0
  16. package/dist/converters/lossy.d.mts.map +1 -0
  17. package/dist/converters/lossy.mjs +158 -0
  18. package/dist/converters/lossy.mjs.map +1 -0
  19. package/dist/htmlConstants-D6fsKbZ-.mjs +30 -0
  20. package/dist/htmlConstants-D6fsKbZ-.mjs.map +1 -0
  21. package/dist/parser-BfdEfWDg.d.mts +95 -0
  22. package/dist/parser-BfdEfWDg.d.mts.map +1 -0
  23. package/dist/parser-CYq309aR.mjs +479 -0
  24. package/dist/parser-CYq309aR.mjs.map +1 -0
  25. package/dist/parser.d.mts +2 -0
  26. package/dist/parser.mjs +2 -0
  27. package/dist/sax.d.mts +64 -0
  28. package/dist/sax.d.mts.map +1 -0
  29. package/dist/sax.mjs +70 -0
  30. package/dist/sax.mjs.map +1 -0
  31. package/dist/saxEngine-BDnD7ruG.mjs +750 -0
  32. package/dist/saxEngine-BDnD7ruG.mjs.map +1 -0
  33. package/dist/utilities/index.d.mts +88 -0
  34. package/dist/utilities/index.d.mts.map +1 -0
  35. package/dist/utilities/index.mjs +87 -0
  36. package/dist/utilities/index.mjs.map +1 -0
  37. package/dist/writer.d.mts +58 -0
  38. package/dist/writer.d.mts.map +1 -0
  39. package/dist/writer.mjs +357 -0
  40. package/dist/writer.mjs.map +1 -0
  41. package/dist/xmlParseStream.d.mts +138 -0
  42. package/dist/xmlParseStream.d.mts.map +1 -0
  43. package/dist/xmlParseStream.mjs +313 -0
  44. package/dist/xmlParseStream.mjs.map +1 -0
  45. package/package.json +100 -0
  46. package/src/converters/fromLossless.ts +80 -0
  47. package/src/converters/fromLossy.ts +180 -0
  48. package/src/converters/lossless.ts +116 -0
  49. package/src/converters/lossy.ts +274 -0
  50. package/src/parser.ts +728 -0
  51. package/src/sax.ts +157 -0
  52. package/src/saxEngine.ts +1157 -0
  53. package/src/utilities/escapeRegExp.ts +19 -0
  54. package/src/utilities/filter.ts +63 -0
  55. package/src/utilities/getElementById.ts +21 -0
  56. package/src/utilities/getElementsByClassName.ts +22 -0
  57. package/src/utilities/htmlConstants.ts +26 -0
  58. package/src/utilities/index.ts +7 -0
  59. package/src/utilities/isElementNode.ts +19 -0
  60. package/src/utilities/isTextNode.ts +19 -0
  61. package/src/utilities/toContentString.ts +23 -0
  62. package/src/writer.ts +650 -0
  63. package/src/xmlParseStream.ts +597 -0
@@ -0,0 +1,1157 @@
1
+ /**
2
+ * saxEngine — a high-performance, synchronous, event-based streaming XML parser.
3
+ *
4
+ * This is an internal module used by `createSaxParser` and `XmlParseStream`.
5
+ * It is not part of the public API.
6
+ *
7
+ * Architecture: single-pass state machine with batch scanning. Each character is
8
+ * consumed exactly once. Within a chunk, hot-path states (text, tag names,
9
+ * attribute names/values, close tags) scan ahead with indexOf / charCodeAt loops
10
+ * to extract tokens via a single substring() rather than per-character +=.
11
+ */
12
+
13
// @generated:char-codes:begin
// UTF-16 code units compared against `charCodeAt()` results by the state machine.

// Markup delimiters
const GT = 62; // >
const SLASH = 47; // /
const BANG = 33; // !
const QUESTION = 63; // ?
const EQ = 61; // =
const DASH = 45; // -
const LBRACKET = 91; // [
const RBRACKET = 93; // ]

// Attribute value quotes
const SQUOTE = 39; // '
const DQUOTE = 34; // "

// Whitespace
const SPACE = 32; // (space)
const TAB = 9; // \t
const LF = 10; // \n
const CR = 13; // \r

// Letters matched one at a time while recognising `<![CDATA[`
const UPPER_C = 67; // C
const UPPER_D = 68; // D
const UPPER_A = 65; // A
const UPPER_T = 84; // T
// @generated:char-codes:end
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Public types
36
+ // ---------------------------------------------------------------------------
37
+
38
/**
 * Attribute map handed to `onOpenTag` and `onDoctype`.
 *
 * Keys are attribute names. A value is the attribute's string value, or
 * `null` when the name appeared without a value (a boolean attribute such as
 * `disabled`, or a DOCTYPE token).
 */
export type Attributes = { [name: string]: string | null };
40
+
41
/**
 * Callbacks invoked by the SAX engine as it recognises tokens.
 * Every handler is optional; events without a handler are dropped.
 */
export interface SaxEngineHandlers {
  /**
   * Called once an opening tag, including all of its attributes, has been
   * read. Also called for self-closing tags (`<x/>`).
   */
  onOpenTag?: (tagName: string, attributes: Attributes) => void;
  /**
   * Called for a closing tag (the name is whitespace-trimmed). Also called
   * immediately after `onOpenTag` for self-closing and void tags.
   */
  onCloseTag?: (tagName: string) => void;
  /**
   * Called with the text found between tags. The text is trimmed of leading
   * and trailing whitespace, and whitespace-only text is not reported.
   */
  onText?: (text: string) => void;
  /** Called with the content of a CDATA section. */
  onCdata?: (data: string) => void;
  /** Called with a complete comment, delimiters included (`<!-- ... -->`). */
  onComment?: (comment: string) => void;
  /**
   * Called for a processing instruction such as `<?xml ... ?>`. `name` is the
   * text up to the first whitespace; `body` is the trimmed remainder (or `''`).
   */
  onProcessingInstruction?: (name: string, body: string) => void;
  /**
   * Called for declarations such as `<!DOCTYPE html>` or
   * `<!DOCTYPE svg PUBLIC "..." "...">`. `tagName` is the keyword prefixed
   * with `!`; the remaining tokens arrive as null-valued attribute keys.
   */
  onDoctype?: (tagName: string, attributes: Attributes) => void;
}
58
+
59
/**
 * Configuration accepted by `saxEngine()`: the event handlers plus a few
 * parsing switches.
 */
export interface SaxEngineOptions extends SaxEngineHandlers {
  /**
   * Names of void (self-closing) tags. A close event is emitted for these
   * right after their open event. Defaults to `[]`.
   */
  selfClosingTags?: string[];
  /**
   * Names of tags whose content is treated as raw text rather than parsed
   * as markup. Defaults to `[]`.
   */
  rawContentTags?: string[];
  /**
   * Upper bound, in characters, on any internal buffer (text, attribute
   * values, comments, CDATA, raw text). Exceeding it throws a `RangeError`.
   * Defaults to `undefined`, meaning no limit.
   */
  maxBufferSize?: number;
}
72
+
73
/** Handle returned by `saxEngine()` for pushing input into the parser. */
export interface SaxEngineParser {
  /** Supply the next chunk of XML text. */
  write(chunk: string): void;
  /** Mark the end of input so that any data still buffered is flushed. */
  close(): void;
}
80
+
81
+ // ---------------------------------------------------------------------------
82
+ // Parser states
83
+ // ---------------------------------------------------------------------------
84
/**
 * Numeric identifiers for every state of the parsing state machine.
 * Kept as a plain `as const` object so each member is a literal type.
 */
const State = {
  // Character data between tags
  TEXT: 0,

  // Opening tags and their attributes
  TAG_OPEN: 1,
  OPEN_TAG_NAME: 2,
  OPEN_TAG_BODY: 3,
  ATTR_NAME: 4,
  ATTR_AFTER_NAME: 5,
  ATTR_AFTER_EQ: 6,
  ATTR_VALUE_DQ: 7,
  ATTR_VALUE_SQ: 8,
  ATTR_VALUE_UQ: 9,

  // Closing and self-closing tags
  CLOSE_TAG: 10,
  SELF_CLOSING: 11,

  // Comments: `<!-- ... -->`
  COMMENT_1: 12,
  COMMENT: 13,
  COMMENT_END1: 14,
  COMMENT_END2: 15,

  // CDATA: one state per character of the `<![CDATA[` opener, then the body
  CDATA_1: 16,
  CDATA_2: 17,
  CDATA_3: 18,
  CDATA_4: 19,
  CDATA_5: 20,
  CDATA_6: 21,
  CDATA: 22,
  CDATA_END1: 23,
  CDATA_END2: 24,

  // Processing instructions: `<? ... ?>`
  PI: 25,
  PI_END: 26,

  // `<! ... >` declarations
  DOCTYPE: 27,
  DOCTYPE_BRACKET: 28,
  BANG_START: 29,

  // Raw-content tags
  RAW_TEXT: 30,
  RAW_END_1: 31,
  RAW_END_2: 32,
  RAW_END_3: 33,
} as const;

/** Union of all numeric state values. */
type State = (typeof State)[keyof typeof State];
121
+
122
+ // ---------------------------------------------------------------------------
123
+ // Implementation
124
+ // ---------------------------------------------------------------------------
125
+
126
+ export function saxEngine(options: SaxEngineOptions = {}): SaxEngineParser {
127
+ const {
128
+ onOpenTag,
129
+ onCloseTag,
130
+ onText,
131
+ onCdata,
132
+ onComment,
133
+ onProcessingInstruction,
134
+ onDoctype,
135
+ selfClosingTags = [],
136
+ rawContentTags = [],
137
+ maxBufferSize,
138
+ } = options;
139
+
140
+ const voidSet: Set<string> | null =
141
+ selfClosingTags.length > 0 ? new Set(selfClosingTags) : null;
142
+ const rawSet: Set<string> | null =
143
+ rawContentTags.length > 0 ? new Set(rawContentTags) : null;
144
+
145
+ let state: State = State.TEXT;
146
+ let text = '';
147
+ let tagName = '';
148
+ let attributeName = '';
149
+ let attributeValue = '';
150
+ let attributes: Attributes = Object.create(null);
151
+ let special = '';
152
+ let rawTag = '';
153
+ let rawText = '';
154
+ let rawCloseTagMatchIndex = 0;
155
+ let rawCloseTagTrailing = '';
156
+
157
+ // --- Emit helpers ---
158
+
159
/**
 * Strip leading and trailing XML whitespace (space, tab, LF, CR) from `input`.
 *
 * Only those four characters are removed, unlike `String.prototype.trim`.
 * When nothing needs trimming the original string is returned as-is, so no
 * new string is allocated.
 */
function trimWhitespace(input: string): string {
  const lastIndex = input.length - 1;
  let head = 0;
  let tail = lastIndex;

  // Advance past leading whitespace.
  for (; head <= tail; head++) {
    const code = input.charCodeAt(head);
    if (!(code === SPACE || code === TAB || code === LF || code === CR)) break;
  }

  // Retreat past trailing whitespace, never crossing `head`.
  for (; tail >= head; tail--) {
    const code = input.charCodeAt(tail);
    if (!(code === SPACE || code === TAB || code === LF || code === CR)) break;
  }

  if (head === 0 && tail === lastIndex) return input;
  return input.substring(head, tail + 1);
}
188
+
189
/**
 * Flush the pending text buffer.
 *
 * If a text handler is registered, it receives the trimmed buffer unless the
 * result is empty. The buffer is cleared afterwards in every case.
 */
function emitText(): void {
  if (text === '') return;
  const content = onText ? trimWhitespace(text) : '';
  if (onText && content !== '') onText(content);
  text = '';
}
197
+
198
/**
 * Parse an accumulated declaration body and report it through `onDoctype`.
 *
 * `body` is the text between `<!` and `>` with any internal DTD subset
 * already left out, e.g. `DOCTYPE html` or `DOCTYPE svg PUBLIC "..." "..."`.
 *
 * - The first whitespace-delimited word is the keyword. It is prefixed with
 *   `!` to form the reported tag name (e.g. `!DOCTYPE`).
 * - Every following token becomes a key of the attributes record with a
 *   `null` value. Quoted tokens are stored without their quotes. An unclosed
 *   quote takes the rest of the body as its token.
 *
 * Does nothing when no `onDoctype` handler is registered.
 *
 * @param body Declaration text following `<!`, without the closing `>`.
 */
function emitDoctype(body: string): void {
  // Guard instead of a non-null assertion: without a handler there is nothing
  // to report, so skip the parsing work rather than fail at the call site.
  const handler = onDoctype;
  if (!handler) return;

  const isWhitespace = (code: number): boolean =>
    code === SPACE || code === TAB || code === LF || code === CR;

  const bodyLength = body.length;
  let i = 0;

  // Read the declaration keyword (e.g. "DOCTYPE").
  while (i < bodyLength && !isWhitespace(body.charCodeAt(i))) i++;
  const tagName = '!' + body.substring(0, i);

  // Null-prototype record so token text can never collide with Object members.
  const attributes: Attributes = Object.create(null);
  while (i < bodyLength) {
    const charCode = body.charCodeAt(i);

    if (isWhitespace(charCode)) {
      i++;
      continue;
    }

    // Quoted token: the key is the content without the surrounding quotes.
    if (charCode === DQUOTE || charCode === SQUOTE) {
      const closeIndex = body.indexOf(charCode === DQUOTE ? '"' : "'", i + 1);
      if (closeIndex === -1) {
        // Unclosed quote: take the rest of the body as the token.
        attributes[body.substring(i + 1)] = null;
        break;
      }
      attributes[body.substring(i + 1, closeIndex)] = null;
      i = closeIndex + 1;
      continue;
    }

    // Unquoted token: runs until the next whitespace character.
    const tokenStart = i;
    while (i < bodyLength && !isWhitespace(body.charCodeAt(i))) i++;
    attributes[body.substring(tokenStart, i)] = null;
  }

  handler(tagName, attributes);
}
270
+
271
/**
 * Finish an opening tag whose `>` has just been consumed.
 *
 * Emits the open-tag event. A void tag then gets an immediate close event.
 * Otherwise, a raw-content tag resets the raw-text bookkeeping and switches
 * the state machine to `RAW_TEXT`.
 */
function finishOpenTag(): void {
  if (onOpenTag) {
    onOpenTag(tagName, attributes);
  }

  const isVoidTag = voidSet !== null && voidSet.has(tagName);
  if (isVoidTag) {
    if (onCloseTag) {
      onCloseTag(tagName);
    }
    return;
  }

  const isRawTag = rawSet !== null && rawSet.has(tagName);
  if (!isRawTag) return;

  rawTag = tagName;
  rawText = '';
  rawCloseTagMatchIndex = 0;
  state = State.RAW_TEXT;
}
283
+
284
+ // --- Inline scan helpers ---
285
+ // These scan ahead within a chunk and return the end index.
286
+ // If the token isn't complete in this chunk, they return -1.
287
+
288
/**
 * Report whether `charCode` terminates a tag name, i.e. whether it is
 * `>`, `/`, `=`, or one of the whitespace characters (space, tab, LF, CR).
 */
function isNameEnd(charCode: number): boolean {
  switch (charCode) {
    case GT:
    case SLASH:
    case EQ:
    case SPACE:
    case TAB:
    case LF:
    case CR:
      return true;
    default:
      return false;
  }
}
303
+
304
+ function processChunk(chunk: string): void {
305
+ const chunkLength = chunk.length;
306
+ let i = 0;
307
+
308
+ while (i < chunkLength) {
309
+ switch (state) {
310
+ // ==================================================================
311
+ // TEXT
312
+ // ==================================================================
313
+ case State.TEXT: {
314
+ const lessThanIndex = chunk.indexOf('<', i);
315
+ if (lessThanIndex === -1) {
316
+ text += i === 0 ? chunk : chunk.substring(i);
317
+ i = chunkLength;
318
+ } else {
319
+ if (lessThanIndex > i) text += chunk.substring(i, lessThanIndex);
320
+ emitText();
321
+ state = State.TAG_OPEN;
322
+ i = lessThanIndex + 1;
323
+ }
324
+ continue;
325
+ }
326
+
327
+ // ==================================================================
328
+ // TAG_OPEN
329
+ // ==================================================================
330
+ case State.TAG_OPEN: {
331
+ const charCode = chunk.charCodeAt(i);
332
+ if (charCode === SLASH) {
333
+ state = State.CLOSE_TAG;
334
+ tagName = '';
335
+ i++;
336
+ } else if (charCode === BANG) {
337
+ state = State.BANG_START;
338
+ special = '';
339
+ i++;
340
+ } else if (charCode === QUESTION) {
341
+ state = State.PI;
342
+ special = '';
343
+ i++;
344
+ } else {
345
+ state = State.OPEN_TAG_NAME;
346
+ tagName = '';
347
+ attributes = Object.create(null);
348
+ }
349
+ continue;
350
+ }
351
+
352
+ // ==================================================================
353
+ // OPEN_TAG_NAME — batch scan for end of name
354
+ // ==================================================================
355
+ case State.OPEN_TAG_NAME: {
356
+ // Scan ahead for end of tag name
357
+ let j = i;
358
+ while (j < chunkLength) {
359
+ const charCode = chunk.charCodeAt(j);
360
+ if (isNameEnd(charCode)) break;
361
+ j++;
362
+ }
363
+ // Accumulate what we scanned
364
+ if (j > i) tagName += chunk.substring(i, j);
365
+ if (j >= chunkLength) {
366
+ // Tag name continues in next chunk
367
+ i = chunkLength;
368
+ continue;
369
+ }
370
+ // We hit a terminator
371
+ const charCode = chunk.charCodeAt(j);
372
+ if (charCode === GT) {
373
+ state = State.TEXT;
374
+ i = j + 1;
375
+ finishOpenTag();
376
+ } else if (charCode === SLASH) {
377
+ state = State.SELF_CLOSING;
378
+ i = j + 1;
379
+ } else {
380
+ // whitespace — enter body
381
+ state = State.OPEN_TAG_BODY;
382
+ i = j + 1;
383
+ }
384
+ continue;
385
+ }
386
+
387
+ // ==================================================================
388
+ // OPEN_TAG_BODY
389
+ // ==================================================================
390
+ case State.OPEN_TAG_BODY: {
391
+ const charCode = chunk.charCodeAt(i);
392
+ if (charCode === GT) {
393
+ state = State.TEXT;
394
+ i++;
395
+ finishOpenTag();
396
+ } else if (charCode === SLASH) {
397
+ state = State.SELF_CLOSING;
398
+ i++;
399
+ } else if (
400
+ charCode === SPACE ||
401
+ charCode === TAB ||
402
+ charCode === LF ||
403
+ charCode === CR
404
+ ) {
405
+ i++;
406
+ } else {
407
+ state = State.ATTR_NAME;
408
+ attributeName = '';
409
+ // don't advance — first char of attr name
410
+ }
411
+ continue;
412
+ }
413
+
414
+ // ==================================================================
415
+ // ATTR_NAME — batch scan for end of attr name
416
+ // ==================================================================
417
+ case State.ATTR_NAME: {
418
+ let j = i;
419
+ while (j < chunkLength) {
420
+ const charCode = chunk.charCodeAt(j);
421
+ if (
422
+ charCode === EQ ||
423
+ charCode === GT ||
424
+ charCode === SLASH ||
425
+ charCode === SPACE ||
426
+ charCode === TAB ||
427
+ charCode === LF ||
428
+ charCode === CR
429
+ )
430
+ break;
431
+ j++;
432
+ }
433
+ if (j > i) attributeName += chunk.substring(i, j);
434
+ if (j >= chunkLength) {
435
+ i = chunkLength;
436
+ continue;
437
+ }
438
+
439
+ const charCode = chunk.charCodeAt(j);
440
+ if (charCode === EQ) {
441
+ state = State.ATTR_AFTER_EQ;
442
+ i = j + 1;
443
+ } else if (charCode === GT) {
444
+ attributes[attributeName] = null;
445
+ state = State.TEXT;
446
+ i = j + 1;
447
+ finishOpenTag();
448
+ } else if (charCode === SLASH) {
449
+ attributes[attributeName] = null;
450
+ state = State.SELF_CLOSING;
451
+ i = j + 1;
452
+ } else {
453
+ // whitespace
454
+ state = State.ATTR_AFTER_NAME;
455
+ i = j + 1;
456
+ }
457
+ continue;
458
+ }
459
+
460
+ // ==================================================================
461
+ // ATTR_AFTER_NAME
462
+ // ==================================================================
463
+ case State.ATTR_AFTER_NAME: {
464
+ const charCode = chunk.charCodeAt(i);
465
+ if (charCode === EQ) {
466
+ state = State.ATTR_AFTER_EQ;
467
+ i++;
468
+ } else if (
469
+ charCode === SPACE ||
470
+ charCode === TAB ||
471
+ charCode === LF ||
472
+ charCode === CR
473
+ ) {
474
+ i++;
475
+ } else if (charCode === GT) {
476
+ attributes[attributeName] = null;
477
+ state = State.TEXT;
478
+ i++;
479
+ finishOpenTag();
480
+ } else if (charCode === SLASH) {
481
+ attributes[attributeName] = null;
482
+ state = State.SELF_CLOSING;
483
+ i++;
484
+ } else {
485
+ // New attribute — boolean (no value)
486
+ attributes[attributeName] = null;
487
+ state = State.ATTR_NAME;
488
+ attributeName = '';
489
+ }
490
+ continue;
491
+ }
492
+
493
+ // ==================================================================
494
+ // ATTR_AFTER_EQ
495
+ // ==================================================================
496
+ case State.ATTR_AFTER_EQ: {
497
+ const charCode = chunk.charCodeAt(i);
498
+ if (charCode === DQUOTE) {
499
+ state = State.ATTR_VALUE_DQ;
500
+ attributeValue = '';
501
+ i++;
502
+ } else if (charCode === SQUOTE) {
503
+ state = State.ATTR_VALUE_SQ;
504
+ attributeValue = '';
505
+ i++;
506
+ } else if (
507
+ charCode === SPACE ||
508
+ charCode === TAB ||
509
+ charCode === LF ||
510
+ charCode === CR
511
+ ) {
512
+ i++;
513
+ } else if (charCode === GT) {
514
+ attributes[attributeName] = '';
515
+ state = State.TEXT;
516
+ i++;
517
+ finishOpenTag();
518
+ } else {
519
+ state = State.ATTR_VALUE_UQ;
520
+ attributeValue = '';
521
+ // don't advance — first char of value
522
+ }
523
+ continue;
524
+ }
525
+
526
+ // ==================================================================
527
+ // ATTR_VALUE_DQ — batch scan for closing "
528
+ // ==================================================================
529
+ case State.ATTR_VALUE_DQ: {
530
+ const quoteIndex = chunk.indexOf('"', i);
531
+ if (quoteIndex === -1) {
532
+ attributeValue += i === 0 ? chunk : chunk.substring(i);
533
+ i = chunkLength;
534
+ } else {
535
+ if (quoteIndex > i)
536
+ attributeValue += chunk.substring(i, quoteIndex);
537
+ attributes[attributeName] = attributeValue;
538
+ state = State.OPEN_TAG_BODY;
539
+ i = quoteIndex + 1;
540
+ }
541
+ continue;
542
+ }
543
+
544
+ // ==================================================================
545
+ // ATTR_VALUE_SQ — batch scan for closing '
546
+ // ==================================================================
547
+ case State.ATTR_VALUE_SQ: {
548
+ const quoteIndex = chunk.indexOf("'", i);
549
+ if (quoteIndex === -1) {
550
+ attributeValue += i === 0 ? chunk : chunk.substring(i);
551
+ i = chunkLength;
552
+ } else {
553
+ if (quoteIndex > i)
554
+ attributeValue += chunk.substring(i, quoteIndex);
555
+ attributes[attributeName] = attributeValue;
556
+ state = State.OPEN_TAG_BODY;
557
+ i = quoteIndex + 1;
558
+ }
559
+ continue;
560
+ }
561
+
562
+ // ==================================================================
563
+ // ATTR_VALUE_UQ — batch scan for end of unquoted value
564
+ // ==================================================================
565
+ case State.ATTR_VALUE_UQ: {
566
+ let j = i;
567
+ while (j < chunkLength) {
568
+ const charCode = chunk.charCodeAt(j);
569
+ if (
570
+ charCode === SPACE ||
571
+ charCode === TAB ||
572
+ charCode === LF ||
573
+ charCode === CR ||
574
+ charCode === GT ||
575
+ charCode === SLASH
576
+ )
577
+ break;
578
+ j++;
579
+ }
580
+ if (j > i) attributeValue += chunk.substring(i, j);
581
+ if (j >= chunkLength) {
582
+ i = chunkLength;
583
+ continue;
584
+ }
585
+
586
+ const charCode = chunk.charCodeAt(j);
587
+ attributes[attributeName] = attributeValue;
588
+ if (charCode === GT) {
589
+ state = State.TEXT;
590
+ i = j + 1;
591
+ finishOpenTag();
592
+ } else if (charCode === SLASH) {
593
+ state = State.SELF_CLOSING;
594
+ i = j + 1;
595
+ } else {
596
+ state = State.OPEN_TAG_BODY;
597
+ i = j + 1;
598
+ }
599
+ continue;
600
+ }
601
+
602
+ // ==================================================================
603
+ // CLOSE_TAG — batch scan for >
604
+ // ==================================================================
605
+ case State.CLOSE_TAG: {
606
+ const greaterThanIndex = chunk.indexOf('>', i);
607
+ if (greaterThanIndex === -1) {
608
+ tagName += i === 0 ? chunk : chunk.substring(i);
609
+ i = chunkLength;
610
+ } else {
611
+ if (greaterThanIndex > i)
612
+ tagName += chunk.substring(i, greaterThanIndex);
613
+ if (onCloseTag) onCloseTag(trimWhitespace(tagName));
614
+ state = State.TEXT;
615
+ i = greaterThanIndex + 1;
616
+ }
617
+ continue;
618
+ }
619
+
620
+ // ==================================================================
621
+ // SELF_CLOSING
622
+ // ==================================================================
623
+ case State.SELF_CLOSING: {
624
+ const charCode = chunk.charCodeAt(i);
625
+ if (charCode === GT) {
626
+ state = State.TEXT;
627
+ i++;
628
+ if (onOpenTag) onOpenTag(tagName, attributes);
629
+ if (onCloseTag) onCloseTag(tagName);
630
+ } else {
631
+ state = State.OPEN_TAG_BODY;
632
+ }
633
+ continue;
634
+ }
635
+
636
+ // ==================================================================
637
+ // BANG_START
638
+ // ==================================================================
639
+ case State.BANG_START: {
640
+ const charCode = chunk.charCodeAt(i);
641
+ if (charCode === DASH) {
642
+ state = State.COMMENT_1;
643
+ special = '<!-';
644
+ i++;
645
+ } else if (charCode === LBRACKET) {
646
+ state = State.CDATA_1;
647
+ i++;
648
+ } else {
649
+ state = State.DOCTYPE;
650
+ special = '';
651
+ // don't advance — first char of declaration body
652
+ }
653
+ continue;
654
+ }
655
+
656
+ // ==================================================================
657
+ // COMMENT_1
658
+ // ==================================================================
659
+ case State.COMMENT_1: {
660
+ const charCode = chunk.charCodeAt(i);
661
+ if (charCode === DASH) {
662
+ state = State.COMMENT;
663
+ special = '<!--';
664
+ i++;
665
+ } else {
666
+ // Not a comment (malformed <!-X...>) — fall through to DOCTYPE.
667
+ // We've consumed "<!-"; the body after "<!" is "-" plus remainder.
668
+ special = '-';
669
+ state = State.DOCTYPE;
670
+ // don't advance — current char is part of the body
671
+ }
672
+ continue;
673
+ }
674
+
675
+ // ==================================================================
676
+ // COMMENT — batch scan for -->
677
+ // ==================================================================
678
+ case State.COMMENT: {
679
+ // Batch: scan for '-' which might start '-->'
680
+ const dashIndex = chunk.indexOf('-', i);
681
+ if (dashIndex === -1) {
682
+ special += i === 0 ? chunk : chunk.substring(i);
683
+ i = chunkLength;
684
+ } else {
685
+ if (dashIndex > i) special += chunk.substring(i, dashIndex);
686
+ special += '-';
687
+ state = State.COMMENT_END1;
688
+ i = dashIndex + 1;
689
+ }
690
+ continue;
691
+ }
692
+
693
+ // ==================================================================
694
+ // COMMENT_END1
695
+ // ==================================================================
696
+ case State.COMMENT_END1: {
697
+ const charCode = chunk.charCodeAt(i);
698
+ if (charCode === DASH) {
699
+ state = State.COMMENT_END2;
700
+ special += '-';
701
+ i++;
702
+ } else {
703
+ state = State.COMMENT;
704
+ special += chunk[i];
705
+ i++;
706
+ }
707
+ continue;
708
+ }
709
+
710
+ // ==================================================================
711
+ // COMMENT_END2
712
+ // ==================================================================
713
+ case State.COMMENT_END2: {
714
+ const charCode = chunk.charCodeAt(i);
715
+ if (charCode === GT) {
716
+ special += '>';
717
+ if (onComment) onComment(special);
718
+ special = '';
719
+ state = State.TEXT;
720
+ i++;
721
+ } else if (charCode === DASH) {
722
+ special += '-';
723
+ i++;
724
+ } else {
725
+ state = State.COMMENT;
726
+ special += chunk[i];
727
+ i++;
728
+ }
729
+ continue;
730
+ }
731
+
732
+ // ==================================================================
733
+ // CDATA handshake states
734
+ // These match the sequence <![CDATA[ char by char. On mismatch,
735
+ // fall through to DOCTYPE with the consumed prefix in `special`.
736
+ // ==================================================================
737
+ case State.CDATA_1: {
738
+ // expecting C after <![
739
+ if (chunk.charCodeAt(i) === UPPER_C) {
740
+ state = State.CDATA_2;
741
+ i++;
742
+ } else {
743
+ special = '[';
744
+ state = State.DOCTYPE;
745
+ }
746
+ continue;
747
+ }
748
+ case State.CDATA_2: {
749
+ // expecting D
750
+ if (chunk.charCodeAt(i) === UPPER_D) {
751
+ state = State.CDATA_3;
752
+ i++;
753
+ } else {
754
+ special = '[C';
755
+ state = State.DOCTYPE;
756
+ }
757
+ continue;
758
+ }
759
+ case State.CDATA_3: {
760
+ // expecting A
761
+ if (chunk.charCodeAt(i) === UPPER_A) {
762
+ state = State.CDATA_4;
763
+ i++;
764
+ } else {
765
+ special = '[CD';
766
+ state = State.DOCTYPE;
767
+ }
768
+ continue;
769
+ }
770
+ case State.CDATA_4: {
771
+ // expecting T
772
+ if (chunk.charCodeAt(i) === UPPER_T) {
773
+ state = State.CDATA_5;
774
+ i++;
775
+ } else {
776
+ special = '[CDA';
777
+ state = State.DOCTYPE;
778
+ }
779
+ continue;
780
+ }
781
+ case State.CDATA_5: {
782
+ // expecting A
783
+ if (chunk.charCodeAt(i) === UPPER_A) {
784
+ state = State.CDATA_6;
785
+ i++;
786
+ } else {
787
+ special = '[CDAT';
788
+ state = State.DOCTYPE;
789
+ }
790
+ continue;
791
+ }
792
+ case State.CDATA_6: {
793
+ // expecting [
794
+ if (chunk.charCodeAt(i) === LBRACKET) {
795
+ state = State.CDATA;
796
+ special = '';
797
+ i++;
798
+ } else {
799
+ special = '[CDATA';
800
+ state = State.DOCTYPE;
801
+ }
802
+ continue;
803
+ }
804
+
805
+ // ==================================================================
806
+ // CDATA — batch scan for ]
807
+ // ==================================================================
808
+ case State.CDATA: {
809
+ const bracketIndex = chunk.indexOf(']', i);
810
+ if (bracketIndex === -1) {
811
+ special += i === 0 ? chunk : chunk.substring(i);
812
+ i = chunkLength;
813
+ } else {
814
+ if (bracketIndex > i) special += chunk.substring(i, bracketIndex);
815
+ state = State.CDATA_END1;
816
+ i = bracketIndex + 1;
817
+ }
818
+ continue;
819
+ }
820
+
821
+ // ==================================================================
822
+ // CDATA_END1
823
+ // ==================================================================
824
+ case State.CDATA_END1: {
825
+ const charCode = chunk.charCodeAt(i);
826
+ if (charCode === RBRACKET) {
827
+ state = State.CDATA_END2;
828
+ i++;
829
+ } else {
830
+ special += ']' + chunk[i];
831
+ state = State.CDATA;
832
+ i++;
833
+ }
834
+ continue;
835
+ }
836
+
837
+ // ==================================================================
838
+ // CDATA_END2
839
+ // ==================================================================
840
+ case State.CDATA_END2: {
841
+ const charCode = chunk.charCodeAt(i);
842
+ if (charCode === GT) {
843
+ if (onCdata) onCdata(special);
844
+ special = '';
845
+ state = State.TEXT;
846
+ i++;
847
+ } else if (charCode === RBRACKET) {
848
+ special += ']';
849
+ i++;
850
+ } else {
851
+ special += ']]' + chunk[i];
852
+ state = State.CDATA;
853
+ i++;
854
+ }
855
+ continue;
856
+ }
857
+
858
+ // ==================================================================
859
+ // PI — batch scan for ?
860
+ // ==================================================================
861
+ case State.PI: {
862
+ const questionMarkIndex = chunk.indexOf('?', i);
863
+ if (questionMarkIndex === -1) {
864
+ special += i === 0 ? chunk : chunk.substring(i);
865
+ i = chunkLength;
866
+ } else {
867
+ if (questionMarkIndex > i)
868
+ special += chunk.substring(i, questionMarkIndex);
869
+ state = State.PI_END;
870
+ i = questionMarkIndex + 1;
871
+ }
872
+ continue;
873
+ }
874
+
875
+ // ==================================================================
876
+ // PI_END
877
+ // ==================================================================
878
+ case State.PI_END: {
879
+ const charCode = chunk.charCodeAt(i);
880
+ if (charCode === GT) {
881
+ if (onProcessingInstruction) {
882
+ const inner = special;
883
+ let whitespaceIndex = -1;
884
+ for (let j = 0; j < inner.length; j++) {
885
+ const innerCharCode = inner.charCodeAt(j);
886
+ if (
887
+ innerCharCode === SPACE ||
888
+ innerCharCode === TAB ||
889
+ innerCharCode === LF ||
890
+ innerCharCode === CR
891
+ ) {
892
+ whitespaceIndex = j;
893
+ break;
894
+ }
895
+ }
896
+ if (whitespaceIndex === -1) {
897
+ onProcessingInstruction(inner, '');
898
+ } else {
899
+ const instructionName = inner.substring(0, whitespaceIndex);
900
+ let bodyStartIndex = whitespaceIndex + 1;
901
+ while (bodyStartIndex < inner.length) {
902
+ const bodyCharCode = inner.charCodeAt(bodyStartIndex);
903
+ if (
904
+ bodyCharCode !== SPACE &&
905
+ bodyCharCode !== TAB &&
906
+ bodyCharCode !== LF &&
907
+ bodyCharCode !== CR
908
+ )
909
+ break;
910
+ bodyStartIndex++;
911
+ }
912
+ let bodyEndIndex = inner.length - 1;
913
+ while (bodyEndIndex >= bodyStartIndex) {
914
+ const bodyCharCode = inner.charCodeAt(bodyEndIndex);
915
+ if (
916
+ bodyCharCode !== SPACE &&
917
+ bodyCharCode !== TAB &&
918
+ bodyCharCode !== LF &&
919
+ bodyCharCode !== CR
920
+ )
921
+ break;
922
+ bodyEndIndex--;
923
+ }
924
+ onProcessingInstruction(
925
+ instructionName,
926
+ bodyStartIndex <= bodyEndIndex
927
+ ? inner.substring(bodyStartIndex, bodyEndIndex + 1)
928
+ : '',
929
+ );
930
+ }
931
+ }
932
+ special = '';
933
+ state = State.TEXT;
934
+ i++;
935
+ } else {
936
+ special += '?';
937
+ state = State.PI;
938
+ // don't advance — re-check this char for '?' again
939
+ }
940
+ continue;
941
+ }
942
+
943
+ // ==================================================================
944
+ // DOCTYPE — accumulate body, parse tokens on >
945
+ // ==================================================================
946
+ case State.DOCTYPE: {
947
+ const charCode = chunk.charCodeAt(i);
948
+ if (charCode === GT) {
949
+ if (onDoctype) {
950
+ emitDoctype(special);
951
+ }
952
+ special = '';
953
+ state = State.TEXT;
954
+ i++;
955
+ } else if (charCode === LBRACKET) {
956
+ state = State.DOCTYPE_BRACKET;
957
+ i++;
958
+ } else {
959
+ // Batch scan: find the next > or [ to avoid per-char accumulation
960
+ let j = i;
961
+ while (j < chunkLength) {
962
+ const scanCharCode = chunk.charCodeAt(j);
963
+ if (scanCharCode === GT || scanCharCode === LBRACKET) break;
964
+ j++;
965
+ }
966
+ special += chunk.substring(i, j);
967
+ i = j;
968
+ // If j < chunkLength, the next iteration will handle > or [
969
+ }
970
+ continue;
971
+ }
972
+
973
+ // ==================================================================
974
+ // DOCTYPE_BRACKET
975
+ // ==================================================================
976
+ case State.DOCTYPE_BRACKET: {
977
+ const charCode = chunk.charCodeAt(i);
978
+ if (charCode === RBRACKET) {
979
+ state = State.DOCTYPE;
980
+ i++;
981
+ } else {
982
+ i++;
983
+ }
984
+ continue;
985
+ }
986
+
987
+ // ==================================================================
988
+ // RAW_TEXT — batch scan for <
989
+ // ==================================================================
990
+ case State.RAW_TEXT: {
991
+ const lessThanIndex = chunk.indexOf('<', i);
992
+ if (lessThanIndex === -1) {
993
+ rawText += i === 0 ? chunk : chunk.substring(i);
994
+ i = chunkLength;
995
+ } else {
996
+ if (lessThanIndex > i) rawText += chunk.substring(i, lessThanIndex);
997
+ state = State.RAW_END_1;
998
+ i = lessThanIndex + 1;
999
+ }
1000
+ continue;
1001
+ }
1002
+
1003
+ // ==================================================================
1004
+ // RAW_END_1
1005
+ // ==================================================================
1006
+ case State.RAW_END_1: {
1007
+ const charCode = chunk.charCodeAt(i);
1008
+ if (charCode === SLASH) {
1009
+ state = State.RAW_END_2;
1010
+ rawCloseTagMatchIndex = 0;
1011
+ i++;
1012
+ } else {
1013
+ rawText += '<';
1014
+ state = State.RAW_TEXT;
1015
+ // don't advance — re-process this char in RAW_TEXT
1016
+ }
1017
+ continue;
1018
+ }
1019
+
1020
+ // ==================================================================
1021
+ // RAW_END_2 — matching close tag name
1022
+ // ==================================================================
1023
+ case State.RAW_END_2: {
1024
+ if (rawCloseTagMatchIndex < rawTag.length) {
1025
+ if (chunk[i] === rawTag[rawCloseTagMatchIndex]) {
1026
+ rawCloseTagMatchIndex++;
1027
+ i++;
1028
+ } else {
1029
+ rawText += '</' + rawTag.substring(0, rawCloseTagMatchIndex);
1030
+ state = State.RAW_TEXT;
1031
+ // don't advance — re-process this char
1032
+ }
1033
+ } else {
1034
+ const charCode = chunk.charCodeAt(i);
1035
+ if (charCode === GT) {
1036
+ if (onText && rawText.length > 0) onText(rawText);
1037
+ if (onCloseTag) onCloseTag(rawTag);
1038
+ rawText = '';
1039
+ rawTag = '';
1040
+ state = State.TEXT;
1041
+ i++;
1042
+ } else if (
1043
+ charCode === SPACE ||
1044
+ charCode === TAB ||
1045
+ charCode === LF ||
1046
+ charCode === CR
1047
+ ) {
1048
+ rawCloseTagTrailing = chunk[i]!;
1049
+ state = State.RAW_END_3;
1050
+ i++;
1051
+ } else {
1052
+ rawText += '</' + rawTag;
1053
+ state = State.RAW_TEXT;
1054
+ // don't advance
1055
+ }
1056
+ }
1057
+ continue;
1058
+ }
1059
+
1060
+ // ==================================================================
1061
+ // RAW_END_3
1062
+ // ==================================================================
1063
+ case State.RAW_END_3: {
1064
+ const charCode = chunk.charCodeAt(i);
1065
+ if (charCode === GT) {
1066
+ if (onText && rawText.length > 0) onText(rawText);
1067
+ if (onCloseTag) onCloseTag(rawTag);
1068
+ rawText = '';
1069
+ rawTag = '';
1070
+ rawCloseTagTrailing = '';
1071
+ state = State.TEXT;
1072
+ i++;
1073
+ } else if (
1074
+ charCode === SPACE ||
1075
+ charCode === TAB ||
1076
+ charCode === LF ||
1077
+ charCode === CR
1078
+ ) {
1079
+ rawCloseTagTrailing += chunk[i];
1080
+ i++;
1081
+ } else {
1082
+ rawText += '</' + rawTag + rawCloseTagTrailing;
1083
+ rawCloseTagTrailing = '';
1084
+ state = State.RAW_TEXT;
1085
+ // don't advance
1086
+ }
1087
+ continue;
1088
+ }
1089
+
1090
+ default:
1091
+ i++;
1092
+ continue;
1093
+ }
1094
+ }
1095
+ }
1096
+
1097
+ return {
1098
/**
 * Feed the next piece of input to the tokenizer.
 *
 * An empty chunk is ignored. Otherwise the chunk is handed to
 * `processChunk`, and afterwards — only when `maxBufferSize` is defined —
 * the pending `text`, `attributeValue`, `special` and `rawText`
 * accumulators are checked against that limit.
 *
 * @param chunk - Input string to process.
 * @throws RangeError when one of the accumulators is longer than
 *   `maxBufferSize` after the chunk has been processed. The message names
 *   the first offending buffer in the order text, attribute value,
 *   special, raw text.
 */
write(chunk: string): void {
  if (chunk.length === 0) return;

  processChunk(chunk);

  // No limit configured: nothing to enforce.
  if (maxBufferSize === undefined) return;

  // Same precedence as the checks are listed: the first buffer over the
  // limit is the one reported.
  let overflowedBuffer: string | undefined;
  if (text.length > maxBufferSize) {
    overflowedBuffer = 'text';
  } else if (attributeValue.length > maxBufferSize) {
    overflowedBuffer = 'attribute value';
  } else if (special.length > maxBufferSize) {
    overflowedBuffer = 'special';
  } else if (rawText.length > maxBufferSize) {
    overflowedBuffer = 'raw text';
  }

  if (overflowedBuffer !== undefined) {
    throw new RangeError(
      `Buffer overflow: ${overflowedBuffer} buffer exceeded maxBufferSize (${maxBufferSize})`,
    );
  }
},
1121
+
1122
/**
 * Signal end of input and flush whatever is still pending.
 *
 * - In `TEXT` state, `emitText()` is called.
 * - In `RAW_END_3` (the full raw close-tag name followed by whitespace was
 *   seen) the buffered raw text is reported through `onText` (if
 *   non-empty) and the raw tag is closed through `onCloseTag`.
 * - In `RAW_TEXT`, `RAW_END_1` or `RAW_END_2`, the characters of an
 *   unfinished close tag are appended back onto the raw text first, then
 *   the raw text and close tag are reported in the same way.
 * - In any other state nothing is emitted.
 *
 * Finally the text, tag-name, attribute and special accumulators are
 * cleared and the state is set back to `TEXT`.
 */
close(): void {
  switch (state) {
    case State.TEXT:
      emitText();
      break;

    case State.RAW_END_3:
      // Entire close-tag name plus trailing whitespace was matched:
      // accept it as a genuine close tag.
      if (onText && rawText.length > 0) onText(rawText);
      if (onCloseTag) onCloseTag(rawTag);
      rawText = '';
      rawTag = '';
      rawCloseTagTrailing = '';
      break;

    case State.RAW_TEXT:
    case State.RAW_END_1:
    case State.RAW_END_2:
      // Restore the already-consumed start of an unfinished close tag so
      // it is reported as part of the raw text.
      if (state === State.RAW_END_1) {
        rawText += '<';
      } else if (state === State.RAW_END_2) {
        rawText += '</' + rawTag.substring(0, rawCloseTagMatchIndex);
      }
      if (onText && rawText.length > 0) onText(rawText);
      if (onCloseTag) onCloseTag(rawTag);
      rawText = '';
      rawTag = '';
      break;

    default:
      // Any other state: pending content is dropped without emitting.
      break;
  }

  // Unconditional reset of the remaining accumulators and the state.
  state = State.TEXT;
  text = '';
  tagName = '';
  attributeName = '';
  attributeValue = '';
  special = '';
},
1156
+ };
1157
+ }