@nodable/flexible-xml-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,522 @@
1
+ import { isName } from './util.js';
2
+ import { ParseError, ErrorCode } from './ParseError.js';
3
+
4
/**
 * Build a ParseError stamped with the source cursor's current position.
 *
 * @param {object} source - parser source; `line`, `cols` and `startIndex` are read.
 * @param {string} message - error message.
 * @param {string} code - one of the ErrorCode values.
 * @returns {ParseError}
 */
function docTypeError(source, message, code) {
  return new ParseError(message, code, {
    line: source.line,
    col: source.cols,
    index: source.startIndex,
  });
}

/**
 * Consume exactly one character of a DOCTYPE sub-tag.
 *
 * @param {object} source - parser source.
 * @param {string} endMessage - message for the UNEXPECTED_END error raised
 *   when no character is available.
 * @returns {string} the character that was consumed.
 * @throws {ParseError} UNEXPECTED_END when the buffer is exhausted.
 */
function readDocTypeChar(source, endMessage) {
  if (!source.canRead()) {
    throw docTypeError(source, endMessage, ErrorCode.UNEXPECTED_END);
  }
  const ch = source.readStr(1);
  source.updateBufferBoundary(1);
  return ch;
}

/**
 * Consume the remaining letters of a declaration keyword (e.g. "TITY" for
 * ENTITY) and verify them.
 *
 * @param {object} source - parser source.
 * @param {string} rest - the letters still expected.
 * @param {string} label - full keyword name, used only in error messages.
 * @throws {ParseError} UNEXPECTED_END when fewer than `rest.length` characters
 *   are buffered; INVALID_TAG when the letters do not match.
 */
function expectDocTypeKeyword(source, rest, label) {
  // canRead(k) is used throughout this file as "k + 1 characters are available".
  if (!source.canRead(rest.length - 1)) {
    throw docTypeError(
      source,
      `Unexpected end of source reading DOCTYPE ${label} keyword`,
      ErrorCode.UNEXPECTED_END
    );
  }
  const found = source.readStr(rest.length);
  source.updateBufferBoundary(rest.length);
  if (found !== rest) {
    throw docTypeError(source, `Invalid DOCTYPE ${label} expression`, ErrorCode.INVALID_TAG);
  }
}

/**
 * Read one "<!...>" declaration of the internal subset. The leading '<' has
 * already been consumed by the caller.
 *
 * @param {object} parser - parser state (`source`, optional `options.doctypeOptions`).
 * @param {object} entities - prototype-less map that receives declared entities.
 * @param {number} entityCount - number of entities registered so far.
 * @returns {number} the (possibly incremented) entity count.
 * @throws {ParseError} UNEXPECTED_END, INVALID_TAG or ENTITY_MAX_COUNT.
 */
function readDocTypeDeclaration(parser, entities, entityCount) {
  const source = parser.source;

  const bang = readDocTypeChar(source, `Unexpected end of source reading DOCTYPE sub-tag`);
  if (bang !== "!") {
    throw docTypeError(
      source,
      `Invalid DOCTYPE body tag starting with "<${bang}"`,
      ErrorCode.INVALID_TAG
    );
  }

  const typeChar = readDocTypeChar(source, `Unexpected end of source reading DOCTYPE sub-tag type`);

  if (typeChar === "-") {
    // <!-- comment -->
    const secondDash = readDocTypeChar(source, `Unexpected end of source reading DOCTYPE comment`);
    if (secondDash !== "-") {
      throw docTypeError(source, "Invalid comment in DOCTYPE", ErrorCode.INVALID_TAG);
    }
    source.readUpto("-->");
    return entityCount;
  }

  if (typeChar === "A") {
    expectDocTypeKeyword(source, "TTLIST", "ATTLIST");
    readAttlistExp(parser);
    return entityCount;
  }

  if (typeChar === "N") {
    expectDocTypeKeyword(source, "OTATION", "NOTATION");
    readNotationExp(parser);
    return entityCount;
  }

  if (typeChar !== "E") {
    throw docTypeError(source, `Invalid DOCTYPE sub-tag "<!${typeChar}"`, ErrorCode.INVALID_TAG);
  }

  // ENTITY or ELEMENT: one more character tells them apart.
  const typeChar2 = readDocTypeChar(source, `Unexpected end of source reading DOCTYPE E-type sub-tag`);

  if (typeChar2 === "L") {
    expectDocTypeKeyword(source, "EMENT", "ELEMENT");
    readElementExp(parser);
    return entityCount;
  }

  if (typeChar2 !== "N") {
    throw docTypeError(source, `Invalid DOCTYPE sub-tag "<!E${typeChar2}"`, ErrorCode.INVALID_TAG);
  }

  expectDocTypeKeyword(source, "TITY", "ENTITY");
  const [entityName, entityValue] = readEntityExp(parser);

  // Entities whose value references another entity are neither stored nor counted.
  if (entityValue.indexOf("&") !== -1) {
    return entityCount;
  }

  const ep = parser.options?.doctypeOptions;
  if (ep?.maxEntityCount && entityCount >= ep.maxEntityCount) {
    throw docTypeError(
      source,
      `Entity count (${entityCount + 1}) exceeds maximum allowed (${ep.maxEntityCount})`,
      ErrorCode.ENTITY_MAX_COUNT
    );
  }

  // Escape the regex metacharacters in this set before building the matcher.
  const escaped = entityName.replace(/[.\-+*:]/g, '\\$&');
  entities[entityName] = {
    regx: RegExp(`&${escaped};`, "g"),
    val: entityValue
  };
  return entityCount + 1;
}

/**
 * Read a DOCTYPE declaration and collect the general entities declared in its
 * internal subset. The caller has already consumed "<!D".
 *
 * Quoted literals in the DOCTYPE header (the SYSTEM / PUBLIC identifiers) are
 * skipped as opaque text, so a '[' or '>' inside such a literal is not
 * mistaken for the start of the internal subset or the end of the DOCTYPE.
 *
 * @param {object} parser - parser state; `parser.source` is the input cursor
 *   and `parser.options?.doctypeOptions.maxEntityCount` optionally caps the
 *   number of stored entities.
 * @returns {Object<string, {regx: RegExp, val: string}>} prototype-less map
 *   from entity name to its replacement value and a global `&name;` matcher.
 * @throws {ParseError} UNEXPECTED_END when the buffer ends before the DOCTYPE
 *   is complete, INVALID_TAG for malformed declarations, ENTITY_MAX_COUNT when
 *   the configured entity limit is exceeded.
 */
export function readDocType(parser) {
  const source = parser.source;
  source.markTokenStart(1);

  // "<!D" is already consumed by the caller; "OCTYPE" must follow.
  if (!source.canRead(5)) {
    throw docTypeError(
      source,
      `Unexpected end of source reading DOCTYPE preamble`,
      ErrorCode.UNEXPECTED_END
    );
  }
  const preamble = source.readStr(6);
  source.updateBufferBoundary(6);
  if (preamble !== "OCTYPE") {
    throw docTypeError(
      source,
      `Invalid DOCTYPE expression at ${source.line}:${source.cols}`,
      ErrorCode.INVALID_TAG
    );
  }

  const entities = Object.create(null);
  let entityCount = 0;
  let hasBody = false;   // '[' seen: inside or past the internal subset
  let bodyDone = false;  // ']' seen: internal subset closed
  let openQuote = "";    // quote char of the header literal being skipped, if any

  while (source.canRead()) {
    // Remember where this character starts. If a declaration below runs out
    // of input we put the cursor back here and re-throw. Per the original
    // author's notes, the outer feed() then rewinds to the level-0 mark set
    // by the caller, which is why markTokenStart(0) must NOT be called here.
    const subTagStart = source.startIndex;
    const ch = source.readCh();

    if (openQuote) {
      // Inside a quoted header literal: everything up to the matching quote is data.
      if (ch === openQuote) openQuote = "";
      continue;
    }
    if (!hasBody && (ch === '"' || ch === "'")) {
      openQuote = ch;
      continue;
    }

    if (ch === '<' && hasBody && !bodyDone) {
      try {
        entityCount = readDocTypeDeclaration(parser, entities, entityCount);
      } catch (err) {
        if (err.code === ErrorCode.UNEXPECTED_END) {
          // Restore the cursor to the '<' that started this declaration.
          source.startIndex = subTagStart;
        }
        // Always re-throw: UNEXPECTED_END asks for more data, anything else
        // is a genuine parse failure.
        throw err;
      }
    } else if (ch === '[') {
      hasBody = true;
    } else if (ch === ']') {
      bodyDone = true;
    } else if (ch === '>') {
      if (!hasBody || bodyDone) {
        return entities;
      }
      // A '>' inside the still-open internal subset does not close the DOCTYPE.
    }
    // Whitespace, the root name and unquoted identifier keywords are skipped.
  }

  throw docTypeError(source, "Unclosed DOCTYPE", ErrorCode.UNEXPECTED_END);
}
228
+
229
+ // ---------------------------------------------------------------------------
230
+ // Sub-expression readers
231
+ // ---------------------------------------------------------------------------
232
+
233
/**
 * Parse the remainder of an ENTITY declaration; "<!ENTITY" is already consumed.
 *
 * Every out-of-input condition is reported as UNEXPECTED_END so the caller can
 * restore the cursor and retry once more data is buffered.
 *
 * @param {object} parser - parser state; reads `parser.source` and
 *   `parser.options?.doctypeOptions.maxEntitySize`.
 * @returns {[string, string]} [entityName, entityValue]
 * @throws {ParseError} UNEXPECTED_END, INVALID_TAG (external / parameter
 *   entities), ENTITY_MAX_SIZE, plus whatever validateEntityName raises.
 */
function readEntityExp(parser) {
  const src = parser.source;
  // Error factory bound to the cursor's position at the moment of the call.
  const failure = (message, code) =>
    new ParseError(message, code, { line: src.line, col: src.cols, index: src.startIndex });
  const nameTerminators = new Set([' ', '\t', '\n', '\r', '"', "'"]);

  skipSourceWhitespace(src);
  if (!src.canRead()) {
    throw failure(`Unexpected end of source reading entity name`, ErrorCode.UNEXPECTED_END);
  }

  // Walk the name; the terminating character is consumed but not counted.
  const nameStart = src.startIndex;
  let nameLength = 0;
  while (src.canRead()) {
    if (nameTerminators.has(src.readCh())) break;
    nameLength += 1;
  }
  const entityName = src.readStr(nameLength, nameStart);

  // Nothing left after the name: more input is needed.
  if (!src.canRead()) {
    throw failure(
      `Unexpected end of source reading entity name "${entityName}"`,
      ErrorCode.UNEXPECTED_END
    );
  }

  validateEntityName(entityName);
  skipSourceWhitespace(src);

  if (!src.canRead()) {
    throw failure(
      `Unexpected end of source after entity name "${entityName}"`,
      ErrorCode.UNEXPECTED_END
    );
  }

  // The SYSTEM keyword is only checked when six characters can be peeked.
  const isExternal = src.canRead(5) && src.readStr(6).toUpperCase() === "SYSTEM";
  if (isExternal) {
    throw failure("External entities are not supported", ErrorCode.INVALID_TAG);
  }

  if (src.readStr(1) === "%") {
    throw failure("Parameter entities are not supported", ErrorCode.INVALID_TAG);
  }

  // The opening quote of the value must be available.
  if (!src.canRead()) {
    throw failure(
      `Unexpected end of source reading entity value for "${entityName}"`,
      ErrorCode.UNEXPECTED_END
    );
  }

  const [entityValue] = readIdentifierVal(src, "entity");

  const limits = parser.options?.doctypeOptions;
  if (limits?.maxEntitySize && entityValue.length > limits.maxEntitySize) {
    throw failure(
      `Entity "${entityName}" size (${entityValue.length}) exceeds maximum allowed size (${limits.maxEntitySize})`,
      ErrorCode.ENTITY_MAX_SIZE
    );
  }

  // Consume through the closing ">" of the declaration.
  src.readUptoChar(">");

  return [entityName, entityValue];
}
318
+
319
/**
 * Parse the remainder of an ELEMENT declaration; "<!ELEMENT" is already consumed.
 *
 * The content model itself is skipped rather than interpreted: the cursor is
 * advanced past the closing ">" of the declaration.
 *
 * @param {object} parser - parser state; only `parser.source` is used.
 * @returns {{elementName: string, contentModel?: string}} the element name;
 *   `contentModel: ""` is present only when the model starts with "E" or "A"
 *   but is not the EMPTY / ANY keyword.
 * @throws {ParseError} UNEXPECTED_END when input runs out, INVALID_TAG when
 *   the element name is rejected by isName.
 */
function readElementExp(parser) {
  const src = parser.source;
  // Error factory bound to the cursor's position at the moment of the call.
  const failure = (message, code) =>
    new ParseError(message, code, { line: src.line, col: src.cols, index: src.startIndex });

  skipSourceWhitespace(src);
  if (!src.canRead()) {
    throw failure(`Unexpected end of source reading ELEMENT name`, ErrorCode.UNEXPECTED_END);
  }

  // Walk the name up to (and including) the first whitespace character.
  const nameStart = src.startIndex;
  let nameLength = 0;
  while (src.canRead()) {
    const c = src.readCh();
    const isSpace = c === ' ' || c === '\t' || c === '\n' || c === '\r';
    if (isSpace) break;
    nameLength += 1;
  }
  const elementName = src.readStr(nameLength, nameStart);

  if (!src.canRead()) {
    throw failure(
      `Unexpected end of source after ELEMENT name "${elementName}"`,
      ErrorCode.UNEXPECTED_END
    );
  }
  if (!isName(elementName)) {
    throw failure(`Invalid element name: "${elementName}"`, ErrorCode.INVALID_TAG);
  }

  skipSourceWhitespace(src);
  if (!src.canRead()) {
    throw failure(
      `Unexpected end of source reading ELEMENT content model`,
      ErrorCode.UNEXPECTED_END
    );
  }

  const lead = src.readStr(1);
  if (lead === "(") {
    // Parenthesised model: skip to the first ")".
    src.updateBufferBoundary(1);
    src.readUptoChar(")");
  } else if (lead === "E" || lead === "A") {
    // Possibly one of the keyword models.
    const keyword = lead === "E" ? "EMPTY" : "ANY";
    if (!src.canRead(keyword.length - 1)) {
      throw failure(
        `Unexpected end of source reading ELEMENT content model keyword`,
        ErrorCode.UNEXPECTED_END
      );
    }
    if (src.readStr(keyword.length) !== keyword) {
      src.readUptoChar(">");
      return { elementName, contentModel: "" };
    }
    src.updateBufferBoundary(keyword.length);
  }

  src.readUptoChar(">");
  return { elementName };
}
398
+
399
/**
 * Skip an ATTLIST declaration; "<!ATTLIST" is already consumed.
 * Nothing is interpreted: the cursor simply moves past the next ">".
 *
 * @param {object} parser - parser state; only `parser.source` is used.
 */
function readAttlistExp(parser) {
  const { source } = parser;
  source.readUptoChar(">");
}
406
+
407
/**
 * Parse the remainder of a NOTATION declaration; "<!NOTATION" is already consumed.
 *
 * The declaration is validated and skipped; nothing is returned.
 *
 * @param {object} parser - parser state; only `parser.source` is used.
 * @throws {ParseError} UNEXPECTED_END when input runs out, INVALID_TAG when
 *   the identifier keyword is neither SYSTEM nor PUBLIC, plus whatever
 *   validateEntityName / readIdentifierVal raise.
 */
function readNotationExp(parser) {
  const src = parser.source;
  // Error factory bound to the cursor's position at the moment of the call.
  const failure = (message, code = ErrorCode.UNEXPECTED_END) =>
    new ParseError(message, code, { line: src.line, col: src.cols, index: src.startIndex });

  skipSourceWhitespace(src);
  if (!src.canRead()) {
    throw failure(`Unexpected end of source reading NOTATION name`);
  }

  // Walk the name up to (and including) the first whitespace character.
  const nameStart = src.startIndex;
  let nameLength = 0;
  while (src.canRead()) {
    const c = src.readCh();
    const isSpace = c === ' ' || c === '\t' || c === '\n' || c === '\r';
    if (isSpace) break;
    nameLength += 1;
  }
  const notationName = src.readStr(nameLength, nameStart);

  if (!src.canRead()) {
    throw failure(`Unexpected end of source after NOTATION name "${notationName}"`);
  }

  validateEntityName(notationName);
  skipSourceWhitespace(src);

  // All six characters of "SYSTEM" / "PUBLIC" are needed to classify.
  if (!src.canRead(5)) {
    throw failure(`Unexpected end of source reading NOTATION identifier type`);
  }

  const identifierType = src.readStr(6).toUpperCase();
  if (identifierType !== "SYSTEM" && identifierType !== "PUBLIC") {
    throw failure(
      `Expected SYSTEM or PUBLIC in NOTATION, found "${identifierType}"`,
      ErrorCode.INVALID_TAG
    );
  }

  src.updateBufferBoundary(6);
  skipSourceWhitespace(src);

  if (identifierType === "SYSTEM") {
    readIdentifierVal(src, "systemIdentifier");
  } else {
    readIdentifierVal(src, "publicIdentifier");
    skipSourceWhitespace(src);
    if (!src.canRead()) {
      throw failure(`Unexpected end of source after NOTATION PUBLIC identifier`);
    }
    // A PUBLIC identifier may be followed by an optional system identifier.
    const upcoming = src.readStr(1);
    if (upcoming === '"' || upcoming === "'") {
      readIdentifierVal(src, "systemIdentifier");
    }
  }

  src.readUptoChar(">");
}
476
+
477
/**
 * Read a quoted literal at the cursor: the opening quote, the content and the
 * matching closing quote are all consumed.
 *
 * @param {object} source - parser source positioned on the opening quote.
 * @param {string} type - label for the literal, used only in error messages.
 * @returns {[string]} single-element array holding the unquoted content.
 * @throws {ParseError} UNEXPECTED_END when no character is available,
 *   INVALID_TAG when the next character is not a single or double quote.
 *   Per the original note, the closing-quote scan is also expected to raise
 *   UNEXPECTED_END when the quote is not buffered yet (not verifiable here).
 */
function readIdentifierVal(source, type) {
  const position = () => ({ line: source.line, col: source.cols, index: source.startIndex });

  if (!source.canRead()) {
    throw new ParseError(
      `Unexpected end of source reading ${type} opening quote`,
      ErrorCode.UNEXPECTED_END,
      position()
    );
  }

  const quote = source.readStr(1);
  const isQuote = quote === '"' || quote === "'";
  if (!isQuote) {
    throw new ParseError(
      `Expected quoted string for ${type}, found "${quote}"`,
      ErrorCode.INVALID_TAG,
      position()
    );
  }

  // Step over the opening quote, then take everything up to its partner.
  source.updateBufferBoundary(1);
  return [source.readUptoChar(quote)];
}
501
+
502
+ // ---------------------------------------------------------------------------
503
+ // Helpers
504
+ // ---------------------------------------------------------------------------
505
+
506
/**
 * Advance the cursor past any run of space, tab, newline or carriage-return
 * characters. Stops at the first other character or when nothing is readable.
 *
 * @param {object} source - parser source; uses canRead, readChAt and
 *   updateBufferBoundary.
 */
function skipSourceWhitespace(source) {
  for (;;) {
    if (!source.canRead()) return;
    switch (source.readChAt(0)) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
        // Peeked character is whitespace: step over it.
        source.updateBufferBoundary(1);
        break;
      default:
        return;
    }
  }
}
514
+
515
/**
 * Check a declared name with isName.
 *
 * @param {string} name - candidate name.
 * @returns {string} the same name when it is accepted.
 * @throws {ParseError} ENTITY_INVALID_KEY (with an empty position object)
 *   when isName rejects the name.
 */
function validateEntityName(name) {
  if (!isName(name)) {
    throw new ParseError(
      `Invalid entity name "${name}"`,
      ErrorCode.ENTITY_INVALID_KEY,
      {}
    );
  }
  return name;
}