@nodable/flexible-xml-parser 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,9 +30,13 @@ import { isSpaceCode } from "./util.js"
30
30
  /**
31
31
  * Parse an attribute expression string into an array of match tuples.
32
32
  *
33
- * Each element has the same shape the old getAllMatches() returned so that
34
- * callers are unchanged:
35
- * [fullMatch, name, '=value' | undefined, quote | undefined, value | undefined]
33
+ * Each element is `{ name, value, startIndex }`; `value` is `undefined` for
34
+ * a boolean attribute (no `=`). Earlier versions of this function also built
35
+ * a full-match string and an `'=value'` string per attribute (matching an
36
+ * old regex-based getAllMatches() return shape) — neither was ever read by
37
+ * collectRawAttributes()/flushAttributes() (only `name`, `value`, and
38
+ * `.startIndex` are), so building them was pure wasted string concatenation
39
+ * on every attribute, on every tag. Dropped.
36
40
  *
37
41
  * The implementation is a single O(n) pass over char codes with no regex and
38
42
  * no recursion, making it safe for arbitrarily long attribute strings.
@@ -44,7 +48,7 @@ import { isSpaceCode } from "./util.js"
44
48
  * IN_VALUE — inside a quoted value, accumulating until the closing quote
45
49
  *
46
50
  * @param {string} attrStr
47
- * @returns {Array} array of match tuples (see shape above)
51
+ * @returns {Array<{name: string, value: string|undefined, startIndex: number}>}
48
52
  */
49
53
  function parseAttributes(attrStr) {
50
54
  const results = [];
@@ -66,9 +70,7 @@ function parseAttributes(attrStr) {
66
70
 
67
71
  if (i >= len || attrStr.charCodeAt(i) !== 61) {
68
72
  // Boolean attribute — no '='
69
- const m = [name, name, undefined, undefined, undefined];
70
- m.startIndex = nameStart;
71
- results.push(m);
73
+ results.push({ name, value: undefined, startIndex: nameStart });
72
74
  continue;
73
75
  }
74
76
 
@@ -81,7 +83,6 @@ function parseAttributes(attrStr) {
81
83
  const quote = attrStr.charCodeAt(i);
82
84
  if (quote === 34 || quote === 39) { // " or '
83
85
  i++; // skip opening quote
84
- const valueStart = i;
85
86
  let value = '';
86
87
  let segStart = i;
87
88
  while (i < len && attrStr.charCodeAt(i) !== quote) {
@@ -94,10 +95,7 @@ function parseAttributes(attrStr) {
94
95
  }
95
96
  value += attrStr.substring(segStart, i);
96
97
  i++; // skip closing quote
97
- const quoteChar = String.fromCharCode(quote);
98
- const m = [name + '=' + quoteChar + value + quoteChar, name, '=' + quoteChar + value + quoteChar, quoteChar, value];
99
- m.startIndex = nameStart;
100
- results.push(m);
98
+ results.push({ name, value, startIndex: nameStart });
101
99
  }
102
100
  }
103
101
 
@@ -105,7 +103,20 @@ function parseAttributes(attrStr) {
105
103
  }
106
104
 
107
105
  /**
108
- * Pass 1: extract raw (unparsed) attribute values into rawAttributes.
106
+ * Pass 1: extract raw (unparsed) attribute values into rawAttributes, AND
107
+ * build tagExp._parsedAttrs — the processed-name/value list pass 2 will
108
+ * consume directly.
109
+ *
110
+ * Previously, pass 2 (flushAttributes) re-ran parseAttributes() from scratch
111
+ * on the same attrStr, and re-ran parser.processAttrName() (ns-prefix
112
+ * resolution + name validation + sanitizeName + reserved-name check) on
113
+ * every attribute a second time — full re-tokenization plus full re-validation
114
+ * of work already done here. processAttrName() is a pure function of
115
+ * (rawName, options) — nothing between pass 1 and pass 2 (matcher.push,
116
+ * stop/skip resolution) can change its result — so it's safe to compute once
117
+ * and cache. The matcher still gets the *raw* (pre-resolveNsPrefix/sanitize)
118
+ * name as its rawAttributes key, unchanged, since PEM's attribute-condition
119
+ * matching (`div[class=code]`) matches against attribute names as written.
109
120
  *
110
121
  * @param {string} attrStr - raw attribute expression substring
111
122
  * @param {object} parser - Xml2JsParser instance (for processAttrName)
@@ -116,56 +127,60 @@ export function collectRawAttributes(attrStr, parser, tagExp) {
116
127
 
117
128
  const matches = parseAttributes(attrStr);
118
129
  const len = matches.length;
130
+ tagExp._rawAttrMatchCount = len; // total parsed attrs, incl. dropped (xmlns:) ones — for maxAttributesPerTag parity with old behavior
131
+ const parsedAttrs = [];
119
132
  let count = 0;
120
133
  for (let i = 0; i < len; i++) {
121
- const attrName = parser.processAttrName(matches[i][1]);
134
+ const m = matches[i];
135
+ const attrName = parser.processAttrName(m.name);
122
136
  if (attrName === false) continue;
123
137
  count++;
124
- const rawVal = matches[i][4];
125
- tagExp.rawAttributes[matches[i][1]] = rawVal !== undefined ? rawVal : true;
138
+ const rawVal = m.value;
139
+ const attrVal = rawVal !== undefined ? rawVal : true;
140
+ tagExp.rawAttributes[m.name] = attrVal;
141
+ parsedAttrs.push({ name: attrName, value: attrVal, index: m.startIndex });
126
142
  }
127
143
  tagExp.rawAttributesLen = count;
144
+ tagExp._parsedAttrs = parsedAttrs;
128
145
  }
129
146
 
130
147
  /**
131
- * Pass 2: run value parsers and push each attribute to the output builder.
148
+ * Pass 2: push each attribute (already parsed + name-processed by pass 1,
149
+ * see tagExp._parsedAttrs) to the output builder. No re-parsing, no
150
+ * re-running processAttrName — this is now a plain loop over cached data.
132
151
  *
133
- * @param {string} attrStr - raw attribute expression substring
152
+ * @param {Array<{name: string, value: *, index: number}>} parsedAttrs - tagExp._parsedAttrs from collectRawAttributes
134
153
  * @param {object} parser - Xml2JsParser instance
135
- * @param {number} [attrsExpStart] - absolute document offset where `attrStr`
136
- * begins (tagExp._attrsExpStart from buildTagExpObj). When provided, each
154
+ * @param {number} [attrsExpStart] - absolute document offset where the
155
+ * attribute expression began (tagExp._attrsExpStart). When provided, each
137
156
  * attribute's absolute document index is computed and passed to
138
157
  * addAttribute() as a 4th argument: { index }. Line/col are intentionally
139
158
  * NOT computed here — doing so would require re-scanning attrStr for
140
159
  * newlines on every call, for a field most builders won't use; callers
141
160
  * that need it can derive line/col from `index` plus the document text.
161
+ * @param {number} rawAttrMatchCount - tagExp._rawAttrMatchCount, used for the
162
+ * maxAttributesPerTag limit check (counts all parsed attrs, including any
163
+ * dropped by processAttrName, matching the limit's pre-existing semantics).
142
164
  */
143
- export function flushAttributes(attrStr, parser, attrsExpStart) {
144
- if (!attrStr || attrStr.length === 0) return;
145
- const matches = parseAttributes(attrStr);
146
- const len = matches.length;
165
+ export function flushAttributes(parsedAttrs, parser, attrsExpStart, rawAttrMatchCount) {
166
+ if (!parsedAttrs || parsedAttrs.length === 0) return;
147
167
 
148
168
  const maxAttrs = parser.options.limits?.maxAttributesPerTag;
149
- if (maxAttrs !== undefined && maxAttrs !== null && len > maxAttrs) {
169
+ if (maxAttrs !== undefined && maxAttrs !== null && rawAttrMatchCount > maxAttrs) {
150
170
  const tagName = parser.currentTagDetail?.name ?? '(unknown)';
151
171
  throw new ParseError(
152
- `Tag '${tagName}' has ${len} attributes, exceeding limit of ${maxAttrs}`,
172
+ `Tag '${tagName}' has ${rawAttrMatchCount} attributes, exceeding limit of ${maxAttrs}`,
153
173
  ErrorCode.LIMIT_MAX_ATTRIBUTES,
154
174
  { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
155
175
  );
156
176
  }
157
177
 
178
+ const len = parsedAttrs.length;
158
179
  for (let i = 0; i < len; i++) {
159
- const attrName = parser.processAttrName(matches[i][1]);
160
- if (attrName === false) continue;
161
-
162
- const rawVal = matches[i][4];
163
- const attrVal = rawVal !== undefined ? rawVal : true;
164
-
180
+ const a = parsedAttrs[i];
165
181
  const attrMeta = attrsExpStart !== undefined
166
- ? { index: attrsExpStart + matches[i].startIndex }
182
+ ? { index: attrsExpStart + a.index }
167
183
  : undefined;
168
-
169
- parser.outputBuilder.addAttribute(attrName, attrVal, parser.readonlyMatcher, attrMeta);
184
+ parser.outputBuilder.addAttribute(a.name, a.value, parser.readonlyMatcher, attrMeta);
170
185
  }
171
186
  }
@@ -1,5 +1,5 @@
1
1
  import { ParseError, ErrorCode } from './ParseError.js';
2
- import { name as isName, qName as isQName } from 'xml-naming';
2
+
3
3
 
4
4
  export function readDocType(parser) {
5
5
  parser.source.markTokenStart(1);
@@ -267,7 +267,7 @@ function readEntityExp(parser) {
267
267
  { line: source.line, col: source.cols, index: source.startIndex });
268
268
  }
269
269
 
270
- validateEntityName(entityName, parser.xmlVersion);
270
+ validateEntityName(entityName, parser);
271
271
  skipSourceWhitespace(source);
272
272
 
273
273
  if (!source.canRead()) {
@@ -346,7 +346,7 @@ function readElementExp(parser) {
346
346
  { line: source.line, col: source.cols, index: source.startIndex });
347
347
  }
348
348
 
349
- if (!isName(elementName, parser.xmlVersion)) {
349
+ if (!parser.getNameValidator('name')(elementName)) {
350
350
  throw new ParseError(`Invalid element name: "${elementName}"`,
351
351
  ErrorCode.INVALID_TAG,
352
352
  { line: source.line, col: source.cols, index: source.startIndex });
@@ -434,7 +434,7 @@ function readNotationExp(parser) {
434
434
  { line: source.line, col: source.cols, index: source.startIndex });
435
435
  }
436
436
 
437
- validateEntityName(notationName, parser.xmlVersion);
437
+ validateEntityName(notationName, parser);
438
438
  skipSourceWhitespace(source);
439
439
 
440
440
  // Need all 6 chars of "SYSTEM" / "PUBLIC" before we can classify
@@ -512,8 +512,8 @@ function skipSourceWhitespace(source) {
512
512
  }
513
513
  }
514
514
 
515
- function validateEntityName(name, xmlVersion) {
516
- if (isName(name, xmlVersion)) return name;
515
+ function validateEntityName(name, parser) {
516
+ if (parser.getNameValidator('name')(name)) return name;
517
517
  throw new ParseError(
518
518
  `Invalid entity name "${name}"`,
519
519
  ErrorCode.ENTITY_INVALID_KEY,
@@ -114,6 +114,31 @@ export default class BufferSource {
114
114
  return this.buffer.slice(from, from + n).toString();
115
115
  }
116
116
 
117
+ /**
118
+ * See StringSource.scanTagExpEnd() for full rationale. Byte-indexed —
119
+ * quote/`>` are single-byte ASCII, safe for multi-byte UTF-8 content too
120
+ * (a `>` byte never appears as a UTF-8 continuation byte). Buffer isn't a
121
+ * rope, so no equivalent of FeedableSource's charCodeAt/flatten concern.
122
+ */
123
+ scanTagExpEnd() {
124
+ const buf = this.buffer;
125
+ const len = buf.length;
126
+ const start = this.startIndex;
127
+ let inSingle = false;
128
+ let inDouble = false;
129
+ for (let i = start; i < len; i++) {
130
+ const c = buf[i];
131
+ if (c === 39) { // '
132
+ if (!inDouble) inSingle = !inSingle;
133
+ } else if (c === 34) { // "
134
+ if (!inSingle) inDouble = !inDouble;
135
+ } else if (c === 62 && !inSingle && !inDouble) { // >
136
+ return i - start;
137
+ }
138
+ }
139
+ return -1;
140
+ }
141
+
117
142
  /**
118
143
  * Scan buffer[this.startIndex, end) for byte code 10 ('\n') and advance
119
144
  * line/cols to match, mirroring readCh()'s per-byte logic. Does NOT touch
@@ -224,20 +249,9 @@ export default class BufferSource {
224
249
  }
225
250
 
226
251
  readFromBuffer(n, shouldUpdate) {
227
- let ch;
228
- if (n === 1) {
229
- ch = this.buffer[this.startIndex];
230
- if (ch === 10) { // '\n'
231
- this.line++;
232
- this.cols = 1;
233
- } else {
234
- this.cols++;
235
- }
236
- ch = String.fromCharCode(ch);
237
- } else {
238
- this.cols += n;
239
- ch = this.buffer.slice(this.startIndex, this.startIndex + n).toString();
240
- }
252
+ const ch = n === 1
253
+ ? String.fromCharCode(this.buffer[this.startIndex])
254
+ : this.buffer.slice(this.startIndex, this.startIndex + n).toString();
241
255
  if (shouldUpdate) this.updateBufferBoundary(n);
242
256
  return ch;
243
257
  }
@@ -1,4 +1,5 @@
1
1
  import { ParseError, ErrorCode } from '../ParseError.js';
2
+ import { StringDecoder } from 'node:string_decoder';
2
3
 
3
4
  /**
4
5
  * FeedableSource — input source for the feed()/end() API.
@@ -78,6 +79,19 @@ export default class FeedableSource {
78
79
  * the next feed() double-counts every '\n' it consumed before failing.
79
80
  */
80
81
  this._marks = [null, null];
82
+
83
+ /**
84
+ * Lazily-created, persistent across the whole feed() session. Buffer
85
+ * chunks must go through this rather than Buffer#toString() per chunk —
86
+ * toString() decodes each chunk in isolation, so a multi-byte UTF-8
87
+ * character whose bytes straddle a chunk boundary gets corrupted (each
88
+ * half independently replaced with U+FFFD). StringDecoder holds back an
89
+ * incomplete trailing sequence internally and prepends it to the next
90
+ * write(), so a split character decodes correctly once the rest of its
91
+ * bytes arrive. Only created if Buffer input is ever fed — string-only
92
+ * callers never pay for it.
93
+ */
94
+ this._decoder = null;
81
95
  }
82
96
 
83
97
  /**
@@ -89,9 +103,28 @@ export default class FeedableSource {
89
103
  * the limit.
90
104
  *
91
105
  * @param {string|Buffer} data
106
+ * @returns {number} number of characters appended to the buffer (after
107
+ * decoding) — callers that track fed-byte totals (e.g. XMLParser.feed's
108
+ * batch threshold) should use this rather than the raw input length,
109
+ * since a Buffer chunk ending mid-character may decode to fewer chars
110
+ * than its byte length until the next chunk completes the sequence.
92
111
  */
93
112
  feed(data) {
94
- const newData = typeof data === 'string' ? data : data.toString();
113
+ let newData;
114
+ if (typeof data === 'string') {
115
+ newData = data;
116
+ } else if (Buffer.isBuffer(data)) {
117
+ // Stateful decode: bytes of a multi-byte char split across two feed()
118
+ // calls are buffered internally by StringDecoder and correctly
119
+ // stitched together, instead of each chunk being decoded in isolation.
120
+ if (!this._decoder) this._decoder = new StringDecoder('utf8');
121
+ newData = this._decoder.write(data);
122
+ } else if (data?.toString) {
123
+ newData = data.toString();
124
+ } else {
125
+ throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
126
+ }
127
+
95
128
  const liveBytes = this.buffer.length - this.startIndex;
96
129
 
97
130
  if (liveBytes + newData.length > this.maxBufferSize) {
@@ -103,10 +136,20 @@ export default class FeedableSource {
103
136
  }
104
137
 
105
138
  this.buffer += newData;
139
+ return newData.length;
106
140
  }
107
141
 
108
142
  /** Signal that no more data will be fed. */
109
143
  end() {
144
+ if (this._decoder) {
145
+ // Flush any final incomplete byte sequence held by the decoder. For
146
+ // well-formed UTF-8 input this is normally '' (nothing pending); a
147
+ // non-empty result here means the input was genuinely truncated
148
+ // mid-character, and StringDecoder's own U+FFFD substitution is the
149
+ // correct, standard behavior for that case.
150
+ const tail = this._decoder.end();
151
+ if (tail) this.buffer += tail;
152
+ }
110
153
  this.isComplete = true;
111
154
  }
112
155
 
@@ -214,6 +257,43 @@ export default class FeedableSource {
214
257
  return this.buffer.substring(from, from + n);
215
258
  }
216
259
 
260
+ /**
261
+ * Quote-aware scan, from the current read position, for the unquoted '>'
262
+ * that ends a tag expression. Used by readTagExp() — replaces the old
263
+ * per-char canRead(i)/readChAt(i) loop, which profiling showed as the
264
+ * single largest hotspot (~23-26% of parse time).
265
+ *
266
+ * IMPORTANT: bracket char access (`buf[i]`), not `charCodeAt(i)`. This
267
+ * source's buffer is built via repeated `+=` in feed() (a growing V8
268
+ * ConsString/rope). charCodeAt forces a full rope-flatten on access —
269
+ * confirmed via a crash (Runtime_StringCharCodeAt -> String::SlowFlatten)
270
+ * causing real O(n^2) memory growth when this was first written with
271
+ * charCodeAt. Bracket access matches what the pre-existing readChAt()
272
+ * already safely used.
273
+ *
274
+ * @returns {number} relative offset of the unquoted '>', or -1 if the
275
+ * buffer runs out first — caller treats that as UNEXPECTED_END, the
276
+ * normal retryable chunk-boundary signal for this source.
277
+ */
278
+ scanTagExpEnd() {
279
+ const buf = this.buffer;
280
+ const len = buf.length;
281
+ const start = this.startIndex;
282
+ let inSingle = false;
283
+ let inDouble = false;
284
+ for (let i = start; i < len; i++) {
285
+ const c = buf[i];
286
+ if (c === "'") {
287
+ if (!inDouble) inSingle = !inSingle;
288
+ } else if (c === '"') {
289
+ if (!inSingle) inDouble = !inDouble;
290
+ } else if (c === '>' && !inSingle && !inDouble) {
291
+ return i - start;
292
+ }
293
+ }
294
+ return -1;
295
+ }
296
+
217
297
  /**
218
298
  * Read until stop string is found.
219
299
  * @param {string} stopStr
@@ -341,8 +421,14 @@ export default class FeedableSource {
341
421
  const end = this.startIndex + n;
342
422
  this._advanceLineCol(end);
343
423
  this.startIndex = end;
344
- const anyMarkActive = this._marks[0] !== null || this._marks[1] !== null;
345
- if (this.autoFlush && this.startIndex >= this.flushThreshold && !anyMarkActive) {
424
+ // No "any mark active" gate here: flush()'s own min(startIndex, marks...)
425
+ // origin computation already guarantees any in-progress token (at either
426
+ // mark level) survives the trim. A separate boolean gate on top of that
427
+ // was redundant, and since _marks[0] is set on every parseXml() loop
428
+ // iteration and never nulled outside of rewindToMark() (an error path),
429
+ // that gate was effectively permanent — flush() never ran in normal
430
+ // operation. See specs/flushArchitecture_spec.js for the regression test.
431
+ if (this.autoFlush && this.startIndex >= this.flushThreshold) {
346
432
  this.flush();
347
433
  }
348
434
  }
@@ -353,7 +439,10 @@ export default class FeedableSource {
353
439
  *
354
440
  * The flush origin is the minimum of all active mark positions, so that any
355
441
  * in-progress token (at either mark level) is preserved in the buffer and
356
- * can be re-read after the flush.
442
+ * can be re-read after the flush. This is the sole safety mechanism for
443
+ * flush() — callers do not need to additionally check "is a mark active"
444
+ * before calling this; an active mark simply caps how much origin can
445
+ * advance, rather than blocking the call outright.
357
446
  *
358
447
  * If no marks are active, the origin is startIndex itself — everything
359
448
  * before the current read position is discarded.
@@ -28,7 +28,11 @@ export default class StreamSource extends FeedableSource {
28
28
  attachStream(readable, onChunk, onEnd, onError) {
29
29
  readable.on('data', chunk => {
30
30
  try {
31
- this.feed(typeof chunk === 'string' ? chunk : chunk.toString());
31
+ // Pass the raw chunk (Buffer or string) straight through; feed()
32
+ // decodes Buffers via a persistent StringDecoder so a multi-byte
33
+ // UTF-8 character split across two chunks decodes correctly instead
34
+ // of each half being independently mangled by a per-chunk toString().
35
+ this.feed(chunk);
32
36
  onChunk(null); // chunk appended successfully — caller runs parseXml()
33
37
  } catch (err) {
34
38
  onChunk(err); // buffer overflow or coercion failure
@@ -127,6 +127,35 @@ export default class StringSource {
127
127
  return this.buffer.substring(from, from + n);
128
128
  }
129
129
 
130
+ /**
131
+ * Quote-aware scan, from the current read position, for the unquoted '>'
132
+ * that ends a tag expression (`<tag attr="...">`). Used by readTagExp().
133
+ * Direct-buffer, bracket-indexed (not charCodeAt — see FeedableSource's
134
+ * copy of this method for why that matters there; kept identical here
135
+ * for consistency even though StringSource's buffer is never re-concatenated).
136
+ *
137
+ * @returns {number} relative offset (from startIndex) of the unquoted '>',
138
+ * or -1 if the buffer is exhausted first (malformed input for StringSource).
139
+ */
140
+ scanTagExpEnd() {
141
+ const buf = this.buffer;
142
+ const len = buf.length;
143
+ const start = this.startIndex;
144
+ let inSingle = false;
145
+ let inDouble = false;
146
+ for (let i = start; i < len; i++) {
147
+ const c = buf[i];
148
+ if (c === "'") {
149
+ if (!inDouble) inSingle = !inSingle;
150
+ } else if (c === '"') {
151
+ if (!inSingle) inDouble = !inDouble;
152
+ } else if (c === '>' && !inSingle && !inDouble) {
153
+ return i - start;
154
+ }
155
+ }
156
+ return -1;
157
+ }
158
+
130
159
  /**
131
160
  * Scan buffer[this.startIndex, end) for '\n' and advance line/cols to match,
132
161
  * mirroring readCh()'s per-char logic. Does NOT touch startIndex — callers
@@ -251,8 +280,12 @@ export default class StringSource {
251
280
  const end = this.startIndex + n;
252
281
  this._advanceLineCol(end);
253
282
  this.startIndex = end;
254
- const anyMarkActive = this._marks[0] >= 0 || this._marks[1] >= 0;
255
- if (this.autoFlush && this.startIndex >= this.flushThreshold && !anyMarkActive) {
283
+ // See FeedableSource.updateBufferBoundary() for why there is no "any mark
284
+ // active" gate here: flush()'s own min-origin computation already
285
+ // protects any in-progress token; a separate gate was redundant and, since
286
+ // marks are effectively always set in normal operation, made flush()
287
+ // permanently unreachable. See specs/flushArchitecture_spec.js.
288
+ if (this.autoFlush && this.startIndex >= this.flushThreshold) {
256
289
  this.flush();
257
290
  }
258
291
  }
package/src/XMLParser.js CHANGED
@@ -183,19 +183,15 @@ export default class XMLParser {
183
183
  this._initFeedSession();
184
184
  }
185
185
 
186
- let str;
187
- if (typeof data === 'string') {
188
- str = data;
189
- } else if (Buffer.isBuffer(data)) {
190
- str = data.toString();
191
- } else if (data?.toString) {
192
- str = data.toString();
193
- } else {
194
- throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
195
- }
196
-
197
- this._feedSource.feed(str);
198
- this._pendingBytes += str.length;
186
+ // Pass raw data straight through — do NOT pre-convert Buffers to string
187
+ // here. FeedableSource.feed() decodes Buffers via a persistent
188
+ // StringDecoder so a multi-byte UTF-8 character split across two feed()
189
+ // calls decodes correctly; converting each chunk with .toString() first
190
+ // (as this used to do) decodes each chunk in isolation and corrupts a
191
+ // split character. feed() itself validates the type and throws
192
+ // DATA_MUST_BE_STRING for anything unsupported.
193
+ const appendedLength = this._feedSource.feed(data);
194
+ this._pendingBytes += appendedLength;
199
195
 
200
196
  if (this._pendingBytes >= this._batchThreshold) {
201
197
  this._runParse();
@@ -8,7 +8,7 @@ import { readDocType } from './DocTypeReader.js';
8
8
  import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
9
9
  import AutoCloseHandler from './AutoCloseHandler.js';
10
10
  import { ParseError, ErrorCode } from './ParseError.js';
11
- import { name as isName, qName as isQName } from 'xml-naming';
11
+ import { createValidator } from 'xml-naming';
12
12
 
13
13
  class TagDetail {
14
14
  /**
@@ -66,6 +66,14 @@ export default class Xml2JsParser {
66
66
  this.tagsStack = [];
67
67
  this._stopNodeProcessor = null;
68
68
  this._exitIfTriggered = false;
69
+ // Lazily-built, memoized xml-naming validators (v0.3.0 createValidator).
70
+ // Lazy because xmlDec.version isn't final until the optional <?xml?>
71
+ // declaration (if any) has been read — which happens after this method
72
+ // runs but before any tag name is ever validated. Reset here (once per
73
+ // document/session, see XMLParser._createParser / feed() call sites) so
74
+ // a reused Xml2JsParser instance never validates against a stale
75
+ // xmlVersion or leaks one document's name cache into the next.
76
+ this._nameValidators = Object.create(null);
69
77
  this.xmlDec = {
70
78
  version: 1.0,
71
79
  lang: null,
@@ -156,13 +164,18 @@ export default class Xml2JsParser {
156
164
  { line: this.source.line, col: this.source.cols, index: this.source.startIndex }
157
165
  );
158
166
 
159
- if (nextChar === '!' || nextChar === '?') {
167
+ // Branches are ordered by expected frequency.
168
+ if (nextChar === '/') {
169
+ this.source.updateBufferBoundary();
170
+ this.readClosingTag(tagStart);
171
+ } else if (nextChar === '!') {
160
172
  this.source.updateBufferBoundary();
161
173
  this.addTextNode();
162
174
  this.readSpecialTag(nextChar);
163
- } else if (nextChar === '/') {
175
+ } else if (nextChar === '?') {
164
176
  this.source.updateBufferBoundary();
165
- this.readClosingTag(tagStart);
177
+ this.addTextNode();
178
+ readPiTag(this);
166
179
  } else {
167
180
  this.readOpeningTag(tagStart);
168
181
  }
@@ -365,7 +378,7 @@ export default class Xml2JsParser {
365
378
  const skipTagConfig = stopNodeConfig ? null : this.isSkipTag();
366
379
 
367
380
  if (!options.skip.attributes && !skipTagConfig) {
368
- flushAttributes(tagExp._attrsExp, this, tagExp._attrsExpStart);
381
+ flushAttributes(tagExp._parsedAttrs, this, tagExp._attrsExpStart, tagExp._rawAttrMatchCount);
369
382
  }
370
383
 
371
384
  // Stop-node and skip-tag checks AFTER attributes are set so attribute conditions work.
@@ -526,8 +539,6 @@ export default class Xml2JsParser {
526
539
  this.outputBuilder.addInputEntities(docTypeEntities);
527
540
  }
528
541
  }
529
- } else if (startCh === "?") {
530
- readPiTag(this);
531
542
  } else {
532
543
  throw new ParseError(`Invalid tag '<${startCh}'`, ErrorCode.INVALID_TAG, { line: this.source.line, col: this.source.cols, index: this.source.startIndex });
533
544
  }
@@ -543,10 +554,34 @@ export default class Xml2JsParser {
543
554
  }
544
555
  }
545
556
 
557
+ /**
558
+ * Returns a memoized xml-naming validator for the given production
559
+ * ('qName' for tag/attribute names, 'name' for DOCTYPE entity/element/
560
+ * notation names), built lazily on first use and cached per parser
561
+ * instance for the rest of the document/session.
562
+ *
563
+ * xmlDec.version is stored as a number (1.0 / 1.1) but xml-naming's
564
+ * xmlVersion option is the string '1.0'/'1.1' — normalized here rather
565
+ * than changing xmlDec's public shape (it's forwarded as-is to
566
+ * outputBuilder.addDeclaration(), so its type is part of the builder
567
+ * contract, not just an internal detail).
568
+ *
569
+ * @param {'name'|'qName'} production
570
+ */
571
+ getNameValidator(production) {
572
+ let validator = this._nameValidators[production];
573
+ if (!validator) {
574
+ const xmlVersion = this.xmlDec.version === 1.1 || this.xmlDec.version === '1.1' ? '1.1' : '1.0';
575
+ validator = createValidator(production, { xmlVersion });
576
+ this._nameValidators[production] = validator;
577
+ }
578
+ return validator;
579
+ }
580
+
546
581
  processAttrName(attrName) {
547
582
  const options = this.options;
548
583
  attrName = resolveNsPrefix(attrName, options.skip.nsPrefix);
549
- if (!isQName(attrName, this.xmlDec.version)) { //TODO: make it optional
584
+ if (!this.getNameValidator('qName')(attrName)) { //TODO: make it optional
550
585
  throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME);
551
586
  }
552
587
  attrName = sanitizeName(attrName, options.onDangerousProperty);