@eksml/xml 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +588 -0
- package/dist/converters/fromLossless.d.mts +14 -0
- package/dist/converters/fromLossless.d.mts.map +1 -0
- package/dist/converters/fromLossless.mjs +35 -0
- package/dist/converters/fromLossless.mjs.map +1 -0
- package/dist/converters/fromLossy.d.mts +18 -0
- package/dist/converters/fromLossy.d.mts.map +1 -0
- package/dist/converters/fromLossy.mjs +91 -0
- package/dist/converters/fromLossy.mjs.map +1 -0
- package/dist/converters/lossless.d.mts +39 -0
- package/dist/converters/lossless.d.mts.map +1 -0
- package/dist/converters/lossless.mjs +74 -0
- package/dist/converters/lossless.mjs.map +1 -0
- package/dist/converters/lossy.d.mts +42 -0
- package/dist/converters/lossy.d.mts.map +1 -0
- package/dist/converters/lossy.mjs +158 -0
- package/dist/converters/lossy.mjs.map +1 -0
- package/dist/htmlConstants-D6fsKbZ-.mjs +30 -0
- package/dist/htmlConstants-D6fsKbZ-.mjs.map +1 -0
- package/dist/parser-BfdEfWDg.d.mts +95 -0
- package/dist/parser-BfdEfWDg.d.mts.map +1 -0
- package/dist/parser-CYq309aR.mjs +479 -0
- package/dist/parser-CYq309aR.mjs.map +1 -0
- package/dist/parser.d.mts +2 -0
- package/dist/parser.mjs +2 -0
- package/dist/sax.d.mts +64 -0
- package/dist/sax.d.mts.map +1 -0
- package/dist/sax.mjs +70 -0
- package/dist/sax.mjs.map +1 -0
- package/dist/saxEngine-BDnD7ruG.mjs +750 -0
- package/dist/saxEngine-BDnD7ruG.mjs.map +1 -0
- package/dist/utilities/index.d.mts +88 -0
- package/dist/utilities/index.d.mts.map +1 -0
- package/dist/utilities/index.mjs +87 -0
- package/dist/utilities/index.mjs.map +1 -0
- package/dist/writer.d.mts +58 -0
- package/dist/writer.d.mts.map +1 -0
- package/dist/writer.mjs +357 -0
- package/dist/writer.mjs.map +1 -0
- package/dist/xmlParseStream.d.mts +138 -0
- package/dist/xmlParseStream.d.mts.map +1 -0
- package/dist/xmlParseStream.mjs +313 -0
- package/dist/xmlParseStream.mjs.map +1 -0
- package/package.json +100 -0
- package/src/converters/fromLossless.ts +80 -0
- package/src/converters/fromLossy.ts +180 -0
- package/src/converters/lossless.ts +116 -0
- package/src/converters/lossy.ts +274 -0
- package/src/parser.ts +728 -0
- package/src/sax.ts +157 -0
- package/src/saxEngine.ts +1157 -0
- package/src/utilities/escapeRegExp.ts +19 -0
- package/src/utilities/filter.ts +63 -0
- package/src/utilities/getElementById.ts +21 -0
- package/src/utilities/getElementsByClassName.ts +22 -0
- package/src/utilities/htmlConstants.ts +26 -0
- package/src/utilities/index.ts +7 -0
- package/src/utilities/isElementNode.ts +19 -0
- package/src/utilities/isTextNode.ts +19 -0
- package/src/utilities/toContentString.ts +23 -0
- package/src/writer.ts +650 -0
- package/src/xmlParseStream.ts +597 -0
package/src/saxEngine.ts
ADDED
|
@@ -0,0 +1,1157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* saxEngine — a high-performance, synchronous, event-based streaming XML parser.
|
|
3
|
+
*
|
|
4
|
+
* This is an internal module used by `createSaxParser` and `XmlParseStream`.
|
|
5
|
+
* It is not part of the public API.
|
|
6
|
+
*
|
|
7
|
+
* Architecture: single-pass state machine with batch scanning. Each character is
|
|
8
|
+
* consumed exactly once. Within a chunk, hot-path states (text, tag names,
|
|
9
|
+
* attribute names/values, close tags) scan ahead with indexOf / charCodeAt loops
|
|
10
|
+
* to extract tokens via a single substring() rather than per-character +=.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// @generated:char-codes:begin
|
|
14
|
+
// Numeric character codes for the ASCII characters the parser compares
// against `charCodeAt()` results.
const GT = 62; // >
const SLASH = 47; // /
const BANG = 33; // !
const QUESTION = 63; // ?
const EQ = 61; // =
const LBRACKET = 91; // [
const RBRACKET = 93; // ]
const SQUOTE = 39; // '
const DQUOTE = 34; // "
const TAB = 9; // \t
const LF = 10; // \n
const CR = 13; // \r
const SPACE = 32; // (space)
const DASH = 45; // -
// Upper-case letters used to match the literal `CDATA` keyword.
const UPPER_C = 67; // C
const UPPER_D = 68; // D
const UPPER_A = 65; // A
const UPPER_T = 84; // T
|
|
32
|
+
// @generated:char-codes:end
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Public types
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/**
 * Attributes record emitted with opentag and doctype events.
 *
 * Keys are attribute names. A value is `null` when the attribute was written
 * without a value (e.g. `<input disabled>`) and the empty string when `=` is
 * followed by `>` with no value in between (e.g. `<a href=>`).
 */
export type Attributes = Record<string, string | null>;
|
|
40
|
+
|
|
41
|
+
/** Event handlers for the SAX engine. All callbacks are optional. */
export interface SaxEngineHandlers {
  /**
   * Fired when an opening tag and its attributes have been fully parsed.
   * For a self-closed tag (`<x/>`) it is followed immediately by `onCloseTag`.
   */
  onOpenTag?: (tagName: string, attributes: Attributes) => void;
  /**
   * Fired when a closing tag is encountered; the name is passed with
   * surrounding whitespace removed. Also fired right after `onOpenTag` for
   * self-closed tags and for tags listed in `selfClosingTags`.
   */
  onCloseTag?: (tagName: string) => void;
  /** Fired for text content between tags (trimmed; not fired for whitespace-only text). */
  onText?: (text: string) => void;
  /** Fired for CDATA sections with the content between `<![CDATA[` and `]]>`. */
  onCdata?: (data: string) => void;
  /** Fired for comments (the full `<!-- ... -->` string). */
  onComment?: (comment: string) => void;
  /** Fired for processing instructions (`<?xml ... ?>`). */
  onProcessingInstruction?: (name: string, body: string) => void;
  /**
   * Fired for DOCTYPE declarations (`<!DOCTYPE html>`, `<!DOCTYPE svg PUBLIC "..." "...">`).
   * `tagName` is `!` followed by the declaration keyword (e.g. `"!DOCTYPE"`).
   * Every following token becomes a key of `attributes` with a `null` value;
   * quoted tokens are stored without their quotes.
   */
  onDoctype?: (tagName: string, attributes: Attributes) => void;
}
|
|
58
|
+
|
|
59
|
+
/** Options for the SAX engine: the event handlers plus parsing configuration. */
export interface SaxEngineOptions extends SaxEngineHandlers {
  /**
   * Tag names that are self-closing (void). Default `[]`.
   * An open tag with one of these names is followed immediately by an
   * `onCloseTag` event for the same name.
   */
  selfClosingTags?: string[];
  /**
   * Tag names whose content is raw text. Default `[]`.
   * After the open tag of one of these names the parser switches to its
   * raw-text state instead of parsing the content as markup.
   */
  rawContentTags?: string[];
  /**
   * Maximum allowed size (in characters) for any internal buffer (text,
   * attribute values, comments, CDATA, raw text). When a buffer exceeds
   * this limit a `RangeError` is thrown. Default `undefined` (no limit).
   */
  maxBufferSize?: number;
}
|
|
72
|
+
|
|
73
|
+
/** The parser instance returned by `saxEngine()`. */
export interface SaxEngineParser {
  /**
   * Feed a chunk of XML to the parser.
   * NOTE(review): presumably may be called repeatedly with consecutive
   * chunks, with tokens allowed to span chunk boundaries — confirm against
   * the implementation.
   */
  write(chunk: string): void;
  /** Signal end-of-input and flush any remaining buffered data. */
  close(): void;
}
|
|
80
|
+
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
// Parser states
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
/**
 * Numeric identifiers for the states of the parser's state machine.
 * The values are consecutive integers starting at 0.
 */
const State = {
  // Outside markup: accumulating character data until the next `<`.
  TEXT: 0,
  // Just consumed `<`; the next character selects close tag, `<!`, `<?` or open tag.
  TAG_OPEN: 1,
  // Scanning the element name of an open tag.
  OPEN_TAG_NAME: 2,
  // Inside an open tag, between attributes.
  OPEN_TAG_BODY: 3,
  // Scanning an attribute name.
  ATTR_NAME: 4,
  // Whitespace seen after an attribute name; waiting for `=` or the next token.
  ATTR_AFTER_NAME: 5,
  // Consumed `=`; waiting for the start of the value.
  ATTR_AFTER_EQ: 6,
  // Inside a double-quoted, single-quoted, or unquoted attribute value.
  ATTR_VALUE_DQ: 7,
  ATTR_VALUE_SQ: 8,
  ATTR_VALUE_UQ: 9,
  // After `</`, scanning up to the closing `>`.
  CLOSE_TAG: 10,
  // Saw `/` inside an open tag; `>` completes a self-closed tag.
  SELF_CLOSING: 11,
  // Saw `<!-`; a second `-` starts a comment.
  COMMENT_1: 12,
  // Inside a comment body.
  COMMENT: 13,
  // Inside a comment after one `-` / after `--` (possible terminator).
  COMMENT_END1: 14,
  COMMENT_END2: 15,
  // Matching the characters of `<![CDATA[` one at a time after `<![`.
  CDATA_1: 16,
  CDATA_2: 17,
  CDATA_3: 18,
  CDATA_4: 19,
  CDATA_5: 20,
  CDATA_6: 21,
  // Inside CDATA content.
  CDATA: 22,
  // Inside CDATA after one `]` / after `]]` (possible terminator).
  CDATA_END1: 23,
  CDATA_END2: 24,
  // Entered after `<?` (processing instruction).
  PI: 25,
  // NOTE(review): presumably reached after `?` inside a PI — confirm.
  PI_END: 26,
  // Declaration body after `<!` that is neither a comment nor CDATA.
  DOCTYPE: 27,
  // NOTE(review): presumably inside a `[...]` internal subset — confirm.
  DOCTYPE_BRACKET: 28,
  // Just consumed `<!`; the next character selects comment, CDATA or declaration.
  BANG_START: 29,
  // Inside the content of a raw-content tag.
  RAW_TEXT: 30,
  // NOTE(review): presumably steps of matching the raw tag's closing tag — confirm.
  RAW_END_1: 31,
  RAW_END_2: 32,
  RAW_END_3: 33,
} as const;
/** Union of all numeric state values. */
type State = (typeof State)[keyof typeof State];
|
|
121
|
+
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// Implementation
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
export function saxEngine(options: SaxEngineOptions = {}): SaxEngineParser {
|
|
127
|
+
const {
|
|
128
|
+
onOpenTag,
|
|
129
|
+
onCloseTag,
|
|
130
|
+
onText,
|
|
131
|
+
onCdata,
|
|
132
|
+
onComment,
|
|
133
|
+
onProcessingInstruction,
|
|
134
|
+
onDoctype,
|
|
135
|
+
selfClosingTags = [],
|
|
136
|
+
rawContentTags = [],
|
|
137
|
+
maxBufferSize,
|
|
138
|
+
} = options;
|
|
139
|
+
|
|
140
|
+
const voidSet: Set<string> | null =
|
|
141
|
+
selfClosingTags.length > 0 ? new Set(selfClosingTags) : null;
|
|
142
|
+
const rawSet: Set<string> | null =
|
|
143
|
+
rawContentTags.length > 0 ? new Set(rawContentTags) : null;
|
|
144
|
+
|
|
145
|
+
let state: State = State.TEXT;
|
|
146
|
+
let text = '';
|
|
147
|
+
let tagName = '';
|
|
148
|
+
let attributeName = '';
|
|
149
|
+
let attributeValue = '';
|
|
150
|
+
let attributes: Attributes = Object.create(null);
|
|
151
|
+
let special = '';
|
|
152
|
+
let rawTag = '';
|
|
153
|
+
let rawText = '';
|
|
154
|
+
let rawCloseTagMatchIndex = 0;
|
|
155
|
+
let rawCloseTagTrailing = '';
|
|
156
|
+
|
|
157
|
+
// --- Emit helpers ---
|
|
158
|
+
|
|
159
|
+
/**
 * Strip leading and trailing XML whitespace (space, tab, LF, CR) from `input`.
 *
 * Only those four characters are treated as whitespace. When nothing needs
 * to be removed the original string is returned without copying.
 */
function trimWhitespace(input: string): string {
  const length = input.length;

  // Advance past leading whitespace.
  let from = 0;
  for (; from < length; from++) {
    const code = input.charCodeAt(from);
    const isBlank =
      code === SPACE || code === TAB || code === LF || code === CR;
    if (!isBlank) break;
  }

  // Retreat past trailing whitespace; `to` is exclusive.
  let to = length;
  for (; to > from; to--) {
    const code = input.charCodeAt(to - 1);
    const isBlank =
      code === SPACE || code === TAB || code === LF || code === CR;
    if (!isBlank) break;
  }

  if (from === 0 && to === length) return input;
  return input.substring(from, to);
}
|
|
188
|
+
|
|
189
|
+
/**
 * Flush the accumulated text buffer.
 *
 * If an `onText` handler is registered, the buffer is trimmed and delivered
 * to it unless only whitespace remains. The buffer is cleared afterwards in
 * every case.
 */
function emitText(): void {
  if (text.length > 0) {
    if (onText) {
      const content = trimWhitespace(text);
      if (content.length !== 0) onText(content);
    }
    text = '';
  }
}
|
|
197
|
+
|
|
198
|
+
/** True for the four whitespace characters that separate DOCTYPE tokens. */
function isDoctypeWhitespace(charCode: number): boolean {
  return (
    charCode === SPACE ||
    charCode === TAB ||
    charCode === LF ||
    charCode === CR
  );
}

/**
 * Parse the accumulated DOCTYPE body (everything between `<!` and `>`,
 * excluding internal DTD subsets) and emit an onDoctype event.
 *
 * The body string starts with the declaration keyword (e.g. "DOCTYPE html ...")
 * after the `!`. We prepend `!` to form the tag name (e.g. "!DOCTYPE"), then
 * parse the remaining whitespace-separated tokens as null-valued attributes.
 * Quoted strings are unquoted and stored as attribute keys; an unterminated
 * quoted string takes the rest of the body as its token.
 *
 * Does nothing when no `onDoctype` handler is registered.
 *
 * @param body Declaration text following `<!`, without the closing `>`.
 */
function emitDoctype(body: string): void {
  // Guard instead of asserting non-null: without a listener there is nothing
  // to report, and the tokenizing below would be wasted work.
  if (!onDoctype) return;

  const bodyLength = body.length;
  let i = 0;

  // Read the declaration keyword (e.g. "DOCTYPE"): everything up to the
  // first whitespace character.
  while (i < bodyLength && !isDoctypeWhitespace(body.charCodeAt(i))) i++;
  // Named distinctly from the parser's own `tagName` / `attributes` state so
  // the locals do not shadow it.
  const declarationName = '!' + body.substring(0, i);

  // Parse whitespace-separated tokens as null-valued attributes.
  const declarationTokens: Attributes = Object.create(null);
  while (i < bodyLength) {
    const charCode = body.charCodeAt(i);

    if (isDoctypeWhitespace(charCode)) {
      i++;
      continue;
    }

    // Quoted token — capture content without quotes as the key.
    if (charCode === DQUOTE || charCode === SQUOTE) {
      const quoteChar = charCode === DQUOTE ? '"' : "'";
      const closeIndex = body.indexOf(quoteChar, i + 1);
      if (closeIndex === -1) {
        // Unclosed quote — take rest as token.
        declarationTokens[body.substring(i + 1)] = null;
        break;
      }
      declarationTokens[body.substring(i + 1, closeIndex)] = null;
      i = closeIndex + 1;
      continue;
    }

    // Unquoted token — scan until whitespace.
    const tokenStart = i;
    while (i < bodyLength && !isDoctypeWhitespace(body.charCodeAt(i))) i++;
    declarationTokens[body.substring(tokenStart, i)] = null;
  }

  onDoctype(declarationName, declarationTokens);
}
|
|
270
|
+
|
|
271
|
+
/**
 * Complete an open tag once its closing `>` has been consumed.
 *
 * Emits the open-tag event, then applies the per-tag configuration: a void
 * tag gets its close-tag event right away, while a raw-content tag resets
 * the raw-text bookkeeping and moves the parser into the RAW_TEXT state.
 * A tag in both lists is treated as void. Any other tag leaves the state
 * untouched.
 */
function finishOpenTag(): void {
  if (onOpenTag) onOpenTag(tagName, attributes);

  const isVoidTag = voidSet !== null && voidSet.has(tagName);
  if (isVoidTag) {
    if (onCloseTag) onCloseTag(tagName);
    return;
  }

  if (rawSet === null || !rawSet.has(tagName)) return;

  // Start collecting raw content for this tag from a clean slate.
  rawTag = tagName;
  rawText = '';
  rawCloseTagMatchIndex = 0;
  state = State.RAW_TEXT;
}
|
|
283
|
+
|
|
284
|
+
// --- Inline scan helpers ---
|
|
285
|
+
// These scan ahead within a chunk and return the end index.
|
|
286
|
+
// If the token isn't complete in this chunk, they return -1.
|
|
287
|
+
|
|
288
|
+
/**
 * Tells whether `charCode` terminates a tag name, i.e. whether it is
 * `>`, `/`, `=`, or one of the whitespace characters (space, tab, LF, CR).
 */
function isNameEnd(charCode: number): boolean {
  switch (charCode) {
    case GT:
    case SLASH:
    case EQ:
    case SPACE:
    case TAB:
    case LF:
    case CR:
      return true;
    default:
      return false;
  }
}
|
|
303
|
+
|
|
304
|
+
function processChunk(chunk: string): void {
|
|
305
|
+
const chunkLength = chunk.length;
|
|
306
|
+
let i = 0;
|
|
307
|
+
|
|
308
|
+
while (i < chunkLength) {
|
|
309
|
+
switch (state) {
|
|
310
|
+
// ==================================================================
|
|
311
|
+
// TEXT
|
|
312
|
+
// ==================================================================
|
|
313
|
+
case State.TEXT: {
|
|
314
|
+
const lessThanIndex = chunk.indexOf('<', i);
|
|
315
|
+
if (lessThanIndex === -1) {
|
|
316
|
+
text += i === 0 ? chunk : chunk.substring(i);
|
|
317
|
+
i = chunkLength;
|
|
318
|
+
} else {
|
|
319
|
+
if (lessThanIndex > i) text += chunk.substring(i, lessThanIndex);
|
|
320
|
+
emitText();
|
|
321
|
+
state = State.TAG_OPEN;
|
|
322
|
+
i = lessThanIndex + 1;
|
|
323
|
+
}
|
|
324
|
+
continue;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
// ==================================================================
|
|
328
|
+
// TAG_OPEN
|
|
329
|
+
// ==================================================================
|
|
330
|
+
case State.TAG_OPEN: {
|
|
331
|
+
const charCode = chunk.charCodeAt(i);
|
|
332
|
+
if (charCode === SLASH) {
|
|
333
|
+
state = State.CLOSE_TAG;
|
|
334
|
+
tagName = '';
|
|
335
|
+
i++;
|
|
336
|
+
} else if (charCode === BANG) {
|
|
337
|
+
state = State.BANG_START;
|
|
338
|
+
special = '';
|
|
339
|
+
i++;
|
|
340
|
+
} else if (charCode === QUESTION) {
|
|
341
|
+
state = State.PI;
|
|
342
|
+
special = '';
|
|
343
|
+
i++;
|
|
344
|
+
} else {
|
|
345
|
+
state = State.OPEN_TAG_NAME;
|
|
346
|
+
tagName = '';
|
|
347
|
+
attributes = Object.create(null);
|
|
348
|
+
}
|
|
349
|
+
continue;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
// ==================================================================
|
|
353
|
+
// OPEN_TAG_NAME — batch scan for end of name
|
|
354
|
+
// ==================================================================
|
|
355
|
+
case State.OPEN_TAG_NAME: {
|
|
356
|
+
// Scan ahead for end of tag name
|
|
357
|
+
let j = i;
|
|
358
|
+
while (j < chunkLength) {
|
|
359
|
+
const charCode = chunk.charCodeAt(j);
|
|
360
|
+
if (isNameEnd(charCode)) break;
|
|
361
|
+
j++;
|
|
362
|
+
}
|
|
363
|
+
// Accumulate what we scanned
|
|
364
|
+
if (j > i) tagName += chunk.substring(i, j);
|
|
365
|
+
if (j >= chunkLength) {
|
|
366
|
+
// Tag name continues in next chunk
|
|
367
|
+
i = chunkLength;
|
|
368
|
+
continue;
|
|
369
|
+
}
|
|
370
|
+
// We hit a terminator
|
|
371
|
+
const charCode = chunk.charCodeAt(j);
|
|
372
|
+
if (charCode === GT) {
|
|
373
|
+
state = State.TEXT;
|
|
374
|
+
i = j + 1;
|
|
375
|
+
finishOpenTag();
|
|
376
|
+
} else if (charCode === SLASH) {
|
|
377
|
+
state = State.SELF_CLOSING;
|
|
378
|
+
i = j + 1;
|
|
379
|
+
} else {
|
|
380
|
+
// whitespace — enter body
|
|
381
|
+
state = State.OPEN_TAG_BODY;
|
|
382
|
+
i = j + 1;
|
|
383
|
+
}
|
|
384
|
+
continue;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
// ==================================================================
|
|
388
|
+
// OPEN_TAG_BODY
|
|
389
|
+
// ==================================================================
|
|
390
|
+
case State.OPEN_TAG_BODY: {
|
|
391
|
+
const charCode = chunk.charCodeAt(i);
|
|
392
|
+
if (charCode === GT) {
|
|
393
|
+
state = State.TEXT;
|
|
394
|
+
i++;
|
|
395
|
+
finishOpenTag();
|
|
396
|
+
} else if (charCode === SLASH) {
|
|
397
|
+
state = State.SELF_CLOSING;
|
|
398
|
+
i++;
|
|
399
|
+
} else if (
|
|
400
|
+
charCode === SPACE ||
|
|
401
|
+
charCode === TAB ||
|
|
402
|
+
charCode === LF ||
|
|
403
|
+
charCode === CR
|
|
404
|
+
) {
|
|
405
|
+
i++;
|
|
406
|
+
} else {
|
|
407
|
+
state = State.ATTR_NAME;
|
|
408
|
+
attributeName = '';
|
|
409
|
+
// don't advance — first char of attr name
|
|
410
|
+
}
|
|
411
|
+
continue;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
// ==================================================================
|
|
415
|
+
// ATTR_NAME — batch scan for end of attr name
|
|
416
|
+
// ==================================================================
|
|
417
|
+
case State.ATTR_NAME: {
|
|
418
|
+
let j = i;
|
|
419
|
+
while (j < chunkLength) {
|
|
420
|
+
const charCode = chunk.charCodeAt(j);
|
|
421
|
+
if (
|
|
422
|
+
charCode === EQ ||
|
|
423
|
+
charCode === GT ||
|
|
424
|
+
charCode === SLASH ||
|
|
425
|
+
charCode === SPACE ||
|
|
426
|
+
charCode === TAB ||
|
|
427
|
+
charCode === LF ||
|
|
428
|
+
charCode === CR
|
|
429
|
+
)
|
|
430
|
+
break;
|
|
431
|
+
j++;
|
|
432
|
+
}
|
|
433
|
+
if (j > i) attributeName += chunk.substring(i, j);
|
|
434
|
+
if (j >= chunkLength) {
|
|
435
|
+
i = chunkLength;
|
|
436
|
+
continue;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
const charCode = chunk.charCodeAt(j);
|
|
440
|
+
if (charCode === EQ) {
|
|
441
|
+
state = State.ATTR_AFTER_EQ;
|
|
442
|
+
i = j + 1;
|
|
443
|
+
} else if (charCode === GT) {
|
|
444
|
+
attributes[attributeName] = null;
|
|
445
|
+
state = State.TEXT;
|
|
446
|
+
i = j + 1;
|
|
447
|
+
finishOpenTag();
|
|
448
|
+
} else if (charCode === SLASH) {
|
|
449
|
+
attributes[attributeName] = null;
|
|
450
|
+
state = State.SELF_CLOSING;
|
|
451
|
+
i = j + 1;
|
|
452
|
+
} else {
|
|
453
|
+
// whitespace
|
|
454
|
+
state = State.ATTR_AFTER_NAME;
|
|
455
|
+
i = j + 1;
|
|
456
|
+
}
|
|
457
|
+
continue;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// ==================================================================
|
|
461
|
+
// ATTR_AFTER_NAME
|
|
462
|
+
// ==================================================================
|
|
463
|
+
case State.ATTR_AFTER_NAME: {
|
|
464
|
+
const charCode = chunk.charCodeAt(i);
|
|
465
|
+
if (charCode === EQ) {
|
|
466
|
+
state = State.ATTR_AFTER_EQ;
|
|
467
|
+
i++;
|
|
468
|
+
} else if (
|
|
469
|
+
charCode === SPACE ||
|
|
470
|
+
charCode === TAB ||
|
|
471
|
+
charCode === LF ||
|
|
472
|
+
charCode === CR
|
|
473
|
+
) {
|
|
474
|
+
i++;
|
|
475
|
+
} else if (charCode === GT) {
|
|
476
|
+
attributes[attributeName] = null;
|
|
477
|
+
state = State.TEXT;
|
|
478
|
+
i++;
|
|
479
|
+
finishOpenTag();
|
|
480
|
+
} else if (charCode === SLASH) {
|
|
481
|
+
attributes[attributeName] = null;
|
|
482
|
+
state = State.SELF_CLOSING;
|
|
483
|
+
i++;
|
|
484
|
+
} else {
|
|
485
|
+
// New attribute — boolean (no value)
|
|
486
|
+
attributes[attributeName] = null;
|
|
487
|
+
state = State.ATTR_NAME;
|
|
488
|
+
attributeName = '';
|
|
489
|
+
}
|
|
490
|
+
continue;
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
// ==================================================================
|
|
494
|
+
// ATTR_AFTER_EQ
|
|
495
|
+
// ==================================================================
|
|
496
|
+
case State.ATTR_AFTER_EQ: {
|
|
497
|
+
const charCode = chunk.charCodeAt(i);
|
|
498
|
+
if (charCode === DQUOTE) {
|
|
499
|
+
state = State.ATTR_VALUE_DQ;
|
|
500
|
+
attributeValue = '';
|
|
501
|
+
i++;
|
|
502
|
+
} else if (charCode === SQUOTE) {
|
|
503
|
+
state = State.ATTR_VALUE_SQ;
|
|
504
|
+
attributeValue = '';
|
|
505
|
+
i++;
|
|
506
|
+
} else if (
|
|
507
|
+
charCode === SPACE ||
|
|
508
|
+
charCode === TAB ||
|
|
509
|
+
charCode === LF ||
|
|
510
|
+
charCode === CR
|
|
511
|
+
) {
|
|
512
|
+
i++;
|
|
513
|
+
} else if (charCode === GT) {
|
|
514
|
+
attributes[attributeName] = '';
|
|
515
|
+
state = State.TEXT;
|
|
516
|
+
i++;
|
|
517
|
+
finishOpenTag();
|
|
518
|
+
} else {
|
|
519
|
+
state = State.ATTR_VALUE_UQ;
|
|
520
|
+
attributeValue = '';
|
|
521
|
+
// don't advance — first char of value
|
|
522
|
+
}
|
|
523
|
+
continue;
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
// ==================================================================
|
|
527
|
+
// ATTR_VALUE_DQ — batch scan for closing "
|
|
528
|
+
// ==================================================================
|
|
529
|
+
case State.ATTR_VALUE_DQ: {
|
|
530
|
+
const quoteIndex = chunk.indexOf('"', i);
|
|
531
|
+
if (quoteIndex === -1) {
|
|
532
|
+
attributeValue += i === 0 ? chunk : chunk.substring(i);
|
|
533
|
+
i = chunkLength;
|
|
534
|
+
} else {
|
|
535
|
+
if (quoteIndex > i)
|
|
536
|
+
attributeValue += chunk.substring(i, quoteIndex);
|
|
537
|
+
attributes[attributeName] = attributeValue;
|
|
538
|
+
state = State.OPEN_TAG_BODY;
|
|
539
|
+
i = quoteIndex + 1;
|
|
540
|
+
}
|
|
541
|
+
continue;
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
// ==================================================================
|
|
545
|
+
// ATTR_VALUE_SQ — batch scan for closing '
|
|
546
|
+
// ==================================================================
|
|
547
|
+
case State.ATTR_VALUE_SQ: {
|
|
548
|
+
const quoteIndex = chunk.indexOf("'", i);
|
|
549
|
+
if (quoteIndex === -1) {
|
|
550
|
+
attributeValue += i === 0 ? chunk : chunk.substring(i);
|
|
551
|
+
i = chunkLength;
|
|
552
|
+
} else {
|
|
553
|
+
if (quoteIndex > i)
|
|
554
|
+
attributeValue += chunk.substring(i, quoteIndex);
|
|
555
|
+
attributes[attributeName] = attributeValue;
|
|
556
|
+
state = State.OPEN_TAG_BODY;
|
|
557
|
+
i = quoteIndex + 1;
|
|
558
|
+
}
|
|
559
|
+
continue;
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
// ==================================================================
|
|
563
|
+
// ATTR_VALUE_UQ — batch scan for end of unquoted value
|
|
564
|
+
// ==================================================================
|
|
565
|
+
case State.ATTR_VALUE_UQ: {
|
|
566
|
+
let j = i;
|
|
567
|
+
while (j < chunkLength) {
|
|
568
|
+
const charCode = chunk.charCodeAt(j);
|
|
569
|
+
if (
|
|
570
|
+
charCode === SPACE ||
|
|
571
|
+
charCode === TAB ||
|
|
572
|
+
charCode === LF ||
|
|
573
|
+
charCode === CR ||
|
|
574
|
+
charCode === GT ||
|
|
575
|
+
charCode === SLASH
|
|
576
|
+
)
|
|
577
|
+
break;
|
|
578
|
+
j++;
|
|
579
|
+
}
|
|
580
|
+
if (j > i) attributeValue += chunk.substring(i, j);
|
|
581
|
+
if (j >= chunkLength) {
|
|
582
|
+
i = chunkLength;
|
|
583
|
+
continue;
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
const charCode = chunk.charCodeAt(j);
|
|
587
|
+
attributes[attributeName] = attributeValue;
|
|
588
|
+
if (charCode === GT) {
|
|
589
|
+
state = State.TEXT;
|
|
590
|
+
i = j + 1;
|
|
591
|
+
finishOpenTag();
|
|
592
|
+
} else if (charCode === SLASH) {
|
|
593
|
+
state = State.SELF_CLOSING;
|
|
594
|
+
i = j + 1;
|
|
595
|
+
} else {
|
|
596
|
+
state = State.OPEN_TAG_BODY;
|
|
597
|
+
i = j + 1;
|
|
598
|
+
}
|
|
599
|
+
continue;
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
// ==================================================================
|
|
603
|
+
// CLOSE_TAG — batch scan for >
|
|
604
|
+
// ==================================================================
|
|
605
|
+
case State.CLOSE_TAG: {
|
|
606
|
+
const greaterThanIndex = chunk.indexOf('>', i);
|
|
607
|
+
if (greaterThanIndex === -1) {
|
|
608
|
+
tagName += i === 0 ? chunk : chunk.substring(i);
|
|
609
|
+
i = chunkLength;
|
|
610
|
+
} else {
|
|
611
|
+
if (greaterThanIndex > i)
|
|
612
|
+
tagName += chunk.substring(i, greaterThanIndex);
|
|
613
|
+
if (onCloseTag) onCloseTag(trimWhitespace(tagName));
|
|
614
|
+
state = State.TEXT;
|
|
615
|
+
i = greaterThanIndex + 1;
|
|
616
|
+
}
|
|
617
|
+
continue;
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
// ==================================================================
|
|
621
|
+
// SELF_CLOSING
|
|
622
|
+
// ==================================================================
|
|
623
|
+
case State.SELF_CLOSING: {
|
|
624
|
+
const charCode = chunk.charCodeAt(i);
|
|
625
|
+
if (charCode === GT) {
|
|
626
|
+
state = State.TEXT;
|
|
627
|
+
i++;
|
|
628
|
+
if (onOpenTag) onOpenTag(tagName, attributes);
|
|
629
|
+
if (onCloseTag) onCloseTag(tagName);
|
|
630
|
+
} else {
|
|
631
|
+
state = State.OPEN_TAG_BODY;
|
|
632
|
+
}
|
|
633
|
+
continue;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
// ==================================================================
|
|
637
|
+
// BANG_START
|
|
638
|
+
// ==================================================================
|
|
639
|
+
case State.BANG_START: {
|
|
640
|
+
const charCode = chunk.charCodeAt(i);
|
|
641
|
+
if (charCode === DASH) {
|
|
642
|
+
state = State.COMMENT_1;
|
|
643
|
+
special = '<!-';
|
|
644
|
+
i++;
|
|
645
|
+
} else if (charCode === LBRACKET) {
|
|
646
|
+
state = State.CDATA_1;
|
|
647
|
+
i++;
|
|
648
|
+
} else {
|
|
649
|
+
state = State.DOCTYPE;
|
|
650
|
+
special = '';
|
|
651
|
+
// don't advance — first char of declaration body
|
|
652
|
+
}
|
|
653
|
+
continue;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
// ==================================================================
|
|
657
|
+
// COMMENT_1
|
|
658
|
+
// ==================================================================
|
|
659
|
+
case State.COMMENT_1: {
|
|
660
|
+
const charCode = chunk.charCodeAt(i);
|
|
661
|
+
if (charCode === DASH) {
|
|
662
|
+
state = State.COMMENT;
|
|
663
|
+
special = '<!--';
|
|
664
|
+
i++;
|
|
665
|
+
} else {
|
|
666
|
+
// Not a comment (malformed <!-X...>) — fall through to DOCTYPE.
|
|
667
|
+
// We've consumed "<!-"; the body after "<!" is "-" plus remainder.
|
|
668
|
+
special = '-';
|
|
669
|
+
state = State.DOCTYPE;
|
|
670
|
+
// don't advance — current char is part of the body
|
|
671
|
+
}
|
|
672
|
+
continue;
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
// ==================================================================
|
|
676
|
+
// COMMENT — batch scan for -->
|
|
677
|
+
// ==================================================================
|
|
678
|
+
case State.COMMENT: {
|
|
679
|
+
// Batch: scan for '-' which might start '-->'
|
|
680
|
+
const dashIndex = chunk.indexOf('-', i);
|
|
681
|
+
if (dashIndex === -1) {
|
|
682
|
+
special += i === 0 ? chunk : chunk.substring(i);
|
|
683
|
+
i = chunkLength;
|
|
684
|
+
} else {
|
|
685
|
+
if (dashIndex > i) special += chunk.substring(i, dashIndex);
|
|
686
|
+
special += '-';
|
|
687
|
+
state = State.COMMENT_END1;
|
|
688
|
+
i = dashIndex + 1;
|
|
689
|
+
}
|
|
690
|
+
continue;
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
// ==================================================================
|
|
694
|
+
// COMMENT_END1
|
|
695
|
+
// ==================================================================
|
|
696
|
+
case State.COMMENT_END1: {
|
|
697
|
+
const charCode = chunk.charCodeAt(i);
|
|
698
|
+
if (charCode === DASH) {
|
|
699
|
+
state = State.COMMENT_END2;
|
|
700
|
+
special += '-';
|
|
701
|
+
i++;
|
|
702
|
+
} else {
|
|
703
|
+
state = State.COMMENT;
|
|
704
|
+
special += chunk[i];
|
|
705
|
+
i++;
|
|
706
|
+
}
|
|
707
|
+
continue;
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
// ==================================================================
|
|
711
|
+
// COMMENT_END2
|
|
712
|
+
// ==================================================================
|
|
713
|
+
case State.COMMENT_END2: {
|
|
714
|
+
const charCode = chunk.charCodeAt(i);
|
|
715
|
+
if (charCode === GT) {
|
|
716
|
+
special += '>';
|
|
717
|
+
if (onComment) onComment(special);
|
|
718
|
+
special = '';
|
|
719
|
+
state = State.TEXT;
|
|
720
|
+
i++;
|
|
721
|
+
} else if (charCode === DASH) {
|
|
722
|
+
special += '-';
|
|
723
|
+
i++;
|
|
724
|
+
} else {
|
|
725
|
+
state = State.COMMENT;
|
|
726
|
+
special += chunk[i];
|
|
727
|
+
i++;
|
|
728
|
+
}
|
|
729
|
+
continue;
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
// ==================================================================
|
|
733
|
+
// CDATA handshake states
|
|
734
|
+
// These match the sequence <![CDATA[ char by char. On mismatch,
|
|
735
|
+
// fall through to DOCTYPE with the consumed prefix in `special`.
|
|
736
|
+
// ==================================================================
|
|
737
|
+
case State.CDATA_1: {
|
|
738
|
+
// expecting C after <![
|
|
739
|
+
if (chunk.charCodeAt(i) === UPPER_C) {
|
|
740
|
+
state = State.CDATA_2;
|
|
741
|
+
i++;
|
|
742
|
+
} else {
|
|
743
|
+
special = '[';
|
|
744
|
+
state = State.DOCTYPE;
|
|
745
|
+
}
|
|
746
|
+
continue;
|
|
747
|
+
}
|
|
748
|
+
case State.CDATA_2: {
|
|
749
|
+
// expecting D
|
|
750
|
+
if (chunk.charCodeAt(i) === UPPER_D) {
|
|
751
|
+
state = State.CDATA_3;
|
|
752
|
+
i++;
|
|
753
|
+
} else {
|
|
754
|
+
special = '[C';
|
|
755
|
+
state = State.DOCTYPE;
|
|
756
|
+
}
|
|
757
|
+
continue;
|
|
758
|
+
}
|
|
759
|
+
case State.CDATA_3: {
|
|
760
|
+
// expecting A
|
|
761
|
+
if (chunk.charCodeAt(i) === UPPER_A) {
|
|
762
|
+
state = State.CDATA_4;
|
|
763
|
+
i++;
|
|
764
|
+
} else {
|
|
765
|
+
special = '[CD';
|
|
766
|
+
state = State.DOCTYPE;
|
|
767
|
+
}
|
|
768
|
+
continue;
|
|
769
|
+
}
|
|
770
|
+
case State.CDATA_4: {
|
|
771
|
+
// expecting T
|
|
772
|
+
if (chunk.charCodeAt(i) === UPPER_T) {
|
|
773
|
+
state = State.CDATA_5;
|
|
774
|
+
i++;
|
|
775
|
+
} else {
|
|
776
|
+
special = '[CDA';
|
|
777
|
+
state = State.DOCTYPE;
|
|
778
|
+
}
|
|
779
|
+
continue;
|
|
780
|
+
}
|
|
781
|
+
case State.CDATA_5: {
|
|
782
|
+
// expecting A
|
|
783
|
+
if (chunk.charCodeAt(i) === UPPER_A) {
|
|
784
|
+
state = State.CDATA_6;
|
|
785
|
+
i++;
|
|
786
|
+
} else {
|
|
787
|
+
special = '[CDAT';
|
|
788
|
+
state = State.DOCTYPE;
|
|
789
|
+
}
|
|
790
|
+
continue;
|
|
791
|
+
}
|
|
792
|
+
case State.CDATA_6: {
|
|
793
|
+
// expecting [
|
|
794
|
+
if (chunk.charCodeAt(i) === LBRACKET) {
|
|
795
|
+
state = State.CDATA;
|
|
796
|
+
special = '';
|
|
797
|
+
i++;
|
|
798
|
+
} else {
|
|
799
|
+
special = '[CDATA';
|
|
800
|
+
state = State.DOCTYPE;
|
|
801
|
+
}
|
|
802
|
+
continue;
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
// ==================================================================
|
|
806
|
+
// CDATA — batch scan for ]
|
|
807
|
+
// ==================================================================
|
|
808
|
+
case State.CDATA: {
|
|
809
|
+
const bracketIndex = chunk.indexOf(']', i);
|
|
810
|
+
if (bracketIndex === -1) {
|
|
811
|
+
special += i === 0 ? chunk : chunk.substring(i);
|
|
812
|
+
i = chunkLength;
|
|
813
|
+
} else {
|
|
814
|
+
if (bracketIndex > i) special += chunk.substring(i, bracketIndex);
|
|
815
|
+
state = State.CDATA_END1;
|
|
816
|
+
i = bracketIndex + 1;
|
|
817
|
+
}
|
|
818
|
+
continue;
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
// ==================================================================
|
|
822
|
+
// CDATA_END1
|
|
823
|
+
// ==================================================================
|
|
824
|
+
case State.CDATA_END1: {
|
|
825
|
+
const charCode = chunk.charCodeAt(i);
|
|
826
|
+
if (charCode === RBRACKET) {
|
|
827
|
+
state = State.CDATA_END2;
|
|
828
|
+
i++;
|
|
829
|
+
} else {
|
|
830
|
+
special += ']' + chunk[i];
|
|
831
|
+
state = State.CDATA;
|
|
832
|
+
i++;
|
|
833
|
+
}
|
|
834
|
+
continue;
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
// ==================================================================
|
|
838
|
+
// CDATA_END2
|
|
839
|
+
// ==================================================================
|
|
840
|
+
case State.CDATA_END2: {
|
|
841
|
+
const charCode = chunk.charCodeAt(i);
|
|
842
|
+
if (charCode === GT) {
|
|
843
|
+
if (onCdata) onCdata(special);
|
|
844
|
+
special = '';
|
|
845
|
+
state = State.TEXT;
|
|
846
|
+
i++;
|
|
847
|
+
} else if (charCode === RBRACKET) {
|
|
848
|
+
special += ']';
|
|
849
|
+
i++;
|
|
850
|
+
} else {
|
|
851
|
+
special += ']]' + chunk[i];
|
|
852
|
+
state = State.CDATA;
|
|
853
|
+
i++;
|
|
854
|
+
}
|
|
855
|
+
continue;
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
// ==================================================================
|
|
859
|
+
// PI — batch scan for ?
|
|
860
|
+
// ==================================================================
|
|
861
|
+
case State.PI: {
|
|
862
|
+
const questionMarkIndex = chunk.indexOf('?', i);
|
|
863
|
+
if (questionMarkIndex === -1) {
|
|
864
|
+
special += i === 0 ? chunk : chunk.substring(i);
|
|
865
|
+
i = chunkLength;
|
|
866
|
+
} else {
|
|
867
|
+
if (questionMarkIndex > i)
|
|
868
|
+
special += chunk.substring(i, questionMarkIndex);
|
|
869
|
+
state = State.PI_END;
|
|
870
|
+
i = questionMarkIndex + 1;
|
|
871
|
+
}
|
|
872
|
+
continue;
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
// ==================================================================
|
|
876
|
+
// PI_END
|
|
877
|
+
// ==================================================================
|
|
878
|
+
case State.PI_END: {
|
|
879
|
+
const charCode = chunk.charCodeAt(i);
|
|
880
|
+
if (charCode === GT) {
|
|
881
|
+
if (onProcessingInstruction) {
|
|
882
|
+
const inner = special;
|
|
883
|
+
let whitespaceIndex = -1;
|
|
884
|
+
for (let j = 0; j < inner.length; j++) {
|
|
885
|
+
const innerCharCode = inner.charCodeAt(j);
|
|
886
|
+
if (
|
|
887
|
+
innerCharCode === SPACE ||
|
|
888
|
+
innerCharCode === TAB ||
|
|
889
|
+
innerCharCode === LF ||
|
|
890
|
+
innerCharCode === CR
|
|
891
|
+
) {
|
|
892
|
+
whitespaceIndex = j;
|
|
893
|
+
break;
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
if (whitespaceIndex === -1) {
|
|
897
|
+
onProcessingInstruction(inner, '');
|
|
898
|
+
} else {
|
|
899
|
+
const instructionName = inner.substring(0, whitespaceIndex);
|
|
900
|
+
let bodyStartIndex = whitespaceIndex + 1;
|
|
901
|
+
while (bodyStartIndex < inner.length) {
|
|
902
|
+
const bodyCharCode = inner.charCodeAt(bodyStartIndex);
|
|
903
|
+
if (
|
|
904
|
+
bodyCharCode !== SPACE &&
|
|
905
|
+
bodyCharCode !== TAB &&
|
|
906
|
+
bodyCharCode !== LF &&
|
|
907
|
+
bodyCharCode !== CR
|
|
908
|
+
)
|
|
909
|
+
break;
|
|
910
|
+
bodyStartIndex++;
|
|
911
|
+
}
|
|
912
|
+
let bodyEndIndex = inner.length - 1;
|
|
913
|
+
while (bodyEndIndex >= bodyStartIndex) {
|
|
914
|
+
const bodyCharCode = inner.charCodeAt(bodyEndIndex);
|
|
915
|
+
if (
|
|
916
|
+
bodyCharCode !== SPACE &&
|
|
917
|
+
bodyCharCode !== TAB &&
|
|
918
|
+
bodyCharCode !== LF &&
|
|
919
|
+
bodyCharCode !== CR
|
|
920
|
+
)
|
|
921
|
+
break;
|
|
922
|
+
bodyEndIndex--;
|
|
923
|
+
}
|
|
924
|
+
onProcessingInstruction(
|
|
925
|
+
instructionName,
|
|
926
|
+
bodyStartIndex <= bodyEndIndex
|
|
927
|
+
? inner.substring(bodyStartIndex, bodyEndIndex + 1)
|
|
928
|
+
: '',
|
|
929
|
+
);
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
special = '';
|
|
933
|
+
state = State.TEXT;
|
|
934
|
+
i++;
|
|
935
|
+
} else {
|
|
936
|
+
special += '?';
|
|
937
|
+
state = State.PI;
|
|
938
|
+
// don't advance — re-check this char for '?' again
|
|
939
|
+
}
|
|
940
|
+
continue;
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
// ==================================================================
|
|
944
|
+
// DOCTYPE — accumulate body, parse tokens on >
|
|
945
|
+
// ==================================================================
|
|
946
|
+
case State.DOCTYPE: {
|
|
947
|
+
const charCode = chunk.charCodeAt(i);
|
|
948
|
+
if (charCode === GT) {
|
|
949
|
+
if (onDoctype) {
|
|
950
|
+
emitDoctype(special);
|
|
951
|
+
}
|
|
952
|
+
special = '';
|
|
953
|
+
state = State.TEXT;
|
|
954
|
+
i++;
|
|
955
|
+
} else if (charCode === LBRACKET) {
|
|
956
|
+
state = State.DOCTYPE_BRACKET;
|
|
957
|
+
i++;
|
|
958
|
+
} else {
|
|
959
|
+
// Batch scan: find the next > or [ to avoid per-char accumulation
|
|
960
|
+
let j = i;
|
|
961
|
+
while (j < chunkLength) {
|
|
962
|
+
const scanCharCode = chunk.charCodeAt(j);
|
|
963
|
+
if (scanCharCode === GT || scanCharCode === LBRACKET) break;
|
|
964
|
+
j++;
|
|
965
|
+
}
|
|
966
|
+
special += chunk.substring(i, j);
|
|
967
|
+
i = j;
|
|
968
|
+
// If j < chunkLength, the next iteration will handle > or [
|
|
969
|
+
}
|
|
970
|
+
continue;
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
// ==================================================================
|
|
974
|
+
// DOCTYPE_BRACKET
|
|
975
|
+
// ==================================================================
|
|
976
|
+
case State.DOCTYPE_BRACKET: {
|
|
977
|
+
const charCode = chunk.charCodeAt(i);
|
|
978
|
+
if (charCode === RBRACKET) {
|
|
979
|
+
state = State.DOCTYPE;
|
|
980
|
+
i++;
|
|
981
|
+
} else {
|
|
982
|
+
i++;
|
|
983
|
+
}
|
|
984
|
+
continue;
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
// ==================================================================
|
|
988
|
+
// RAW_TEXT — batch scan for <
|
|
989
|
+
// ==================================================================
|
|
990
|
+
case State.RAW_TEXT: {
|
|
991
|
+
const lessThanIndex = chunk.indexOf('<', i);
|
|
992
|
+
if (lessThanIndex === -1) {
|
|
993
|
+
rawText += i === 0 ? chunk : chunk.substring(i);
|
|
994
|
+
i = chunkLength;
|
|
995
|
+
} else {
|
|
996
|
+
if (lessThanIndex > i) rawText += chunk.substring(i, lessThanIndex);
|
|
997
|
+
state = State.RAW_END_1;
|
|
998
|
+
i = lessThanIndex + 1;
|
|
999
|
+
}
|
|
1000
|
+
continue;
|
|
1001
|
+
}
|
|
1002
|
+
|
|
1003
|
+
// ==================================================================
|
|
1004
|
+
// RAW_END_1
|
|
1005
|
+
// ==================================================================
|
|
1006
|
+
case State.RAW_END_1: {
|
|
1007
|
+
const charCode = chunk.charCodeAt(i);
|
|
1008
|
+
if (charCode === SLASH) {
|
|
1009
|
+
state = State.RAW_END_2;
|
|
1010
|
+
rawCloseTagMatchIndex = 0;
|
|
1011
|
+
i++;
|
|
1012
|
+
} else {
|
|
1013
|
+
rawText += '<';
|
|
1014
|
+
state = State.RAW_TEXT;
|
|
1015
|
+
// don't advance — re-process this char in RAW_TEXT
|
|
1016
|
+
}
|
|
1017
|
+
continue;
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
// ==================================================================
|
|
1021
|
+
// RAW_END_2 — matching close tag name
|
|
1022
|
+
// ==================================================================
|
|
1023
|
+
case State.RAW_END_2: {
|
|
1024
|
+
if (rawCloseTagMatchIndex < rawTag.length) {
|
|
1025
|
+
if (chunk[i] === rawTag[rawCloseTagMatchIndex]) {
|
|
1026
|
+
rawCloseTagMatchIndex++;
|
|
1027
|
+
i++;
|
|
1028
|
+
} else {
|
|
1029
|
+
rawText += '</' + rawTag.substring(0, rawCloseTagMatchIndex);
|
|
1030
|
+
state = State.RAW_TEXT;
|
|
1031
|
+
// don't advance — re-process this char
|
|
1032
|
+
}
|
|
1033
|
+
} else {
|
|
1034
|
+
const charCode = chunk.charCodeAt(i);
|
|
1035
|
+
if (charCode === GT) {
|
|
1036
|
+
if (onText && rawText.length > 0) onText(rawText);
|
|
1037
|
+
if (onCloseTag) onCloseTag(rawTag);
|
|
1038
|
+
rawText = '';
|
|
1039
|
+
rawTag = '';
|
|
1040
|
+
state = State.TEXT;
|
|
1041
|
+
i++;
|
|
1042
|
+
} else if (
|
|
1043
|
+
charCode === SPACE ||
|
|
1044
|
+
charCode === TAB ||
|
|
1045
|
+
charCode === LF ||
|
|
1046
|
+
charCode === CR
|
|
1047
|
+
) {
|
|
1048
|
+
rawCloseTagTrailing = chunk[i]!;
|
|
1049
|
+
state = State.RAW_END_3;
|
|
1050
|
+
i++;
|
|
1051
|
+
} else {
|
|
1052
|
+
rawText += '</' + rawTag;
|
|
1053
|
+
state = State.RAW_TEXT;
|
|
1054
|
+
// don't advance
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
continue;
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
// ==================================================================
|
|
1061
|
+
// RAW_END_3
|
|
1062
|
+
// ==================================================================
|
|
1063
|
+
case State.RAW_END_3: {
|
|
1064
|
+
const charCode = chunk.charCodeAt(i);
|
|
1065
|
+
if (charCode === GT) {
|
|
1066
|
+
if (onText && rawText.length > 0) onText(rawText);
|
|
1067
|
+
if (onCloseTag) onCloseTag(rawTag);
|
|
1068
|
+
rawText = '';
|
|
1069
|
+
rawTag = '';
|
|
1070
|
+
rawCloseTagTrailing = '';
|
|
1071
|
+
state = State.TEXT;
|
|
1072
|
+
i++;
|
|
1073
|
+
} else if (
|
|
1074
|
+
charCode === SPACE ||
|
|
1075
|
+
charCode === TAB ||
|
|
1076
|
+
charCode === LF ||
|
|
1077
|
+
charCode === CR
|
|
1078
|
+
) {
|
|
1079
|
+
rawCloseTagTrailing += chunk[i];
|
|
1080
|
+
i++;
|
|
1081
|
+
} else {
|
|
1082
|
+
rawText += '</' + rawTag + rawCloseTagTrailing;
|
|
1083
|
+
rawCloseTagTrailing = '';
|
|
1084
|
+
state = State.RAW_TEXT;
|
|
1085
|
+
// don't advance
|
|
1086
|
+
}
|
|
1087
|
+
continue;
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
default:
|
|
1091
|
+
i++;
|
|
1092
|
+
continue;
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1097
|
+
return {
|
|
1098
|
+
/**
 * Feeds one chunk of input to the tokenizer and then enforces the
 * optional `maxBufferSize` limit.
 *
 * Empty chunks are ignored. The limit is checked only after the whole
 * chunk has been processed, against the `text`, `attributeValue`,
 * `special` and `rawText` accumulators (in that order); the first one
 * found over the limit is the one named in the error.
 *
 * @param chunk - The next piece of the document.
 * @throws RangeError if one of the accumulators is longer than
 *   `maxBufferSize` once the chunk has been consumed.
 */
write(chunk: string): void {
  if (chunk.length === 0) return;
  processChunk(chunk);

  // No limit configured: nothing to enforce.
  const limit = maxBufferSize;
  if (limit === undefined) return;

  // Classify once; precedence matches the order the buffers are listed.
  let overflowed: string | undefined;
  if (text.length > limit) {
    overflowed = 'text';
  } else if (attributeValue.length > limit) {
    overflowed = 'attribute value';
  } else if (special.length > limit) {
    overflowed = 'special';
  } else if (rawText.length > limit) {
    overflowed = 'raw text';
  }
  if (overflowed === undefined) return;

  throw new RangeError(
    `Buffer overflow: ${overflowed} buffer exceeded maxBufferSize (${limit})`,
  );
},
|
|
1121
|
+
|
|
1122
|
+
/**
 * Signals end of input and returns the tokenizer to its initial state.
 *
 * - In `TEXT` state, pending text is flushed through `emitText()`.
 * - Inside a raw-text element (`RAW_TEXT`, `RAW_END_1`, `RAW_END_2`,
 *   `RAW_END_3`), the buffered raw text is reported via `onText` (when
 *   non-empty) and the element is closed via `onCloseTag`. A dangling
 *   `<` or a partially matched `</name` prefix is put back into the raw
 *   text first; a fully matched name followed by whitespace
 *   (`RAW_END_3`) is treated as a valid close tag and is not.
 * - In any other state no callback is invoked here.
 *
 * Afterwards the `text`, `tagName`, `attributeName`, `attributeValue`
 * and `special` buffers are cleared and the state is set to `TEXT`.
 */
close(): void {
  // Reports buffered raw text, closes the raw element, clears its buffers.
  const finishRawElement = (): void => {
    if (onText && rawText.length > 0) onText(rawText);
    if (onCloseTag) onCloseTag(rawTag);
    rawText = '';
    rawTag = '';
  };

  switch (state) {
    case State.TEXT:
      emitText();
      break;

    case State.RAW_END_3:
      // Full tag name matched + trailing whitespace: valid close tag.
      finishRawElement();
      rawCloseTagTrailing = '';
      break;

    case State.RAW_END_1:
      // A lone '<' had been consumed; it belongs to the raw text.
      rawText += '<';
      finishRawElement();
      break;

    case State.RAW_END_2:
      // Restore the partially matched close-tag prefix as raw text.
      rawText += '</' + rawTag.substring(0, rawCloseTagMatchIndex);
      finishRawElement();
      break;

    case State.RAW_TEXT:
      finishRawElement();
      break;

    default:
      // Any other unfinished construct produces no event.
      break;
  }

  // Common reset, regardless of where the input stopped.
  text = '';
  tagName = '';
  attributeName = '';
  attributeValue = '';
  special = '';
  state = State.TEXT;
},
|
|
1156
|
+
};
|
|
1157
|
+
}
|