npm - @nodable/flexible-xml-parser - Versions diffs - 1.6.1 → 1.7.0 - Mend

@nodable/flexible-xml-parser 1.6.1 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/CHANGELOG.md +9 -0
package/package.json +2 -2
package/src/AttributeProcessor.js +50 -35
package/src/DocTypeReader.js +6 -6
package/src/InputSource/BufferSource.js +28 -14
package/src/InputSource/FeedableSource.js +93 -4
package/src/InputSource/StreamSource.js +5 -1
package/src/InputSource/StringSource.js +35 -2
package/src/XMLParser.js +9 -13
package/src/Xml2JsParser.js +43 -8
package/src/XmlPartReader.js +15 -27
package/src/XmlSpecialTagsReader.js +12 -1
package/src/CharsSymbol.js +0 -16

package/CHANGELOG.md CHANGED Viewed

@@ -1,4 +1,13 @@
+**1.7.0 (2026-07-03)**
+- perf: upgrade to xml-naming v0.3.0 to support caching
+- perf: parse attributes only once
+- perf: quote-aware scan: add `scanTagExpEnd` to all input sources
+- perf: dispatch tag-reading methods in order of frequency
+- perf: autoflush
+- fix(#5): StreamSource and FeedableSource don't respect multi-byte characters
 **1.6.1 (2026-06-30)**
 - Pass xml declaration attributes to output builder irrespective of parser options.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nodable/flexible-xml-parser",
-  "version": "1.6.1",
+  "version": "1.7.0",
   "description": "Fastest and fully customizable XML parser in pure JS with fully customizable ouput",
   "main": "./lib/fxp.cjs",
   "type": "module",
@@ -51,7 +51,7 @@
     "@nodable/base-output-builder": "^2.0.0",
     "@nodable/compact-builder": "^2.0.0",
     "path-expression-matcher": "^1.6.1",
-    "xml-naming": "^0.1.0"
+    "xml-naming": "^0.3.0"
   },
   "devDependencies": {
     "@babel/core": "^7.29.0",

package/src/AttributeProcessor.js CHANGED Viewed

@@ -30,9 +30,13 @@ import { isSpaceCode } from "./util.js"
 /**
  * Parse an attribute expression string into an array of match tuples.
  *
- * Each element has the same shape the old getAllMatches() returned so that
- * callers are unchanged:
- *   [fullMatch, name, '=value' | undefined, quote | undefined, value | undefined]
+ * Each element is `{ name, value, startIndex }` — `value` is `undefined` for
+ * a boolean attribute (no `=`). Earlier versions of this function also built
+ * a full-match string and an `'=value'` string per attribute (matching an
+ * old regex-based getAllMatches() return shape) — neither was ever read by
+ * collectRawAttributes()/flushAttributes() (only `name`, `value`, and
+ * `.startIndex` are), so building them was pure wasted string concatenation
+ * on every attribute, on every tag. Dropped.
  *
  * The implementation is a single O(n) pass over char codes with no regex and
  * no recursion, making it safe for arbitrarily long attribute strings.
@@ -44,7 +48,7 @@ import { isSpaceCode } from "./util.js"
  *   IN_VALUE   — inside a quoted value, accumulating until the closing quote
  *
  * @param {string} attrStr
- * @returns {Array}  array of match tuples (see shape above)
+ * @returns {Array<{name: string, value: string|undefined, startIndex: number}>}
  */
 function parseAttributes(attrStr) {
   const results = [];
@@ -66,9 +70,7 @@ function parseAttributes(attrStr) {
     if (i >= len || attrStr.charCodeAt(i) !== 61) {
       // Boolean attribute — no '='
-      const m = [name, name, undefined, undefined, undefined];
-      m.startIndex = nameStart;
-      results.push(m);
+      results.push({ name, value: undefined, startIndex: nameStart });
       continue;
     }
@@ -81,7 +83,6 @@ function parseAttributes(attrStr) {
     const quote = attrStr.charCodeAt(i);
     if (quote === 34 || quote === 39) { // " or '
       i++; // skip opening quote
-      const valueStart = i;
       let value = '';
       let segStart = i;
       while (i < len && attrStr.charCodeAt(i) !== quote) {
@@ -94,10 +95,7 @@ function parseAttributes(attrStr) {
       }
       value += attrStr.substring(segStart, i);
       i++; // skip closing quote
-      const quoteChar = String.fromCharCode(quote);
-      const m = [name + '=' + quoteChar + value + quoteChar, name, '=' + quoteChar + value + quoteChar, quoteChar, value];
-      m.startIndex = nameStart;
-      results.push(m);
+      results.push({ name, value, startIndex: nameStart });
     }
   }
@@ -105,7 +103,20 @@ function parseAttributes(attrStr) {
 }
 /**
- * Pass 1: extract raw (unparsed) attribute values into rawAttributes.
+ * Pass 1: extract raw (unparsed) attribute values into rawAttributes, AND
+ * build tagExp._parsedAttrs — the processed-name/value list pass 2 will
+ * consume directly.
+ *
+ * Previously, pass 2 (flushAttributes) re-ran parseAttributes() from scratch
+ * on the same attrStr, and re-ran parser.processAttrName() (ns-prefix
+ * resolution + name validation + sanitizeName + reserved-name check) on
+ * every attribute a second time — full re-tokenization plus full re-validation
+ * of work already done here. processAttrName() is a pure function of
+ * (rawName, options) — nothing between pass 1 and pass 2 (matcher.push,
+ * stop/skip resolution) can change its result — so it's safe to compute once
+ * and cache. The matcher still gets the *raw* (pre-resolveNsPrefix/sanitize)
+ * name as its rawAttributes key, unchanged, since PEM's attribute-condition
+ * matching (`div[class=code]`) matches against attribute names as written.
  *
  * @param {string} attrStr      - raw attribute expression substring
  * @param {object} parser       - Xml2JsParser instance (for processAttrName)
@@ -116,56 +127,60 @@ export function collectRawAttributes(attrStr, parser, tagExp) {
   const matches = parseAttributes(attrStr);
   const len = matches.length;
+  tagExp._rawAttrMatchCount = len; // total parsed attrs, incl. dropped (xmlns:) ones — for maxAttributesPerTag parity with old behavior
+  const parsedAttrs = [];
   let count = 0;
   for (let i = 0; i < len; i++) {
-    const attrName = parser.processAttrName(matches[i][1]);
+    const m = matches[i];
+    const attrName = parser.processAttrName(m.name);
     if (attrName === false) continue;
     count++;
-    const rawVal = matches[i][4];
-    tagExp.rawAttributes[matches[i][1]] = rawVal !== undefined ? rawVal : true;
+    const rawVal = m.value;
+    const attrVal = rawVal !== undefined ? rawVal : true;
+    tagExp.rawAttributes[m.name] = attrVal;
+    parsedAttrs.push({ name: attrName, value: attrVal, index: m.startIndex });
   }
   tagExp.rawAttributesLen = count;
+  tagExp._parsedAttrs = parsedAttrs;
 }
 /**
- * Pass 2: run value parsers and push each attribute to the output builder.
+ * Pass 2: push each attribute (already parsed + name-processed by pass 1,
+ * see tagExp._parsedAttrs) to the output builder. No re-parsing, no
+ * re-running processAttrName — this is now a plain loop over cached data.
  *
- * @param {string} attrStr - raw attribute expression substring
+ * @param {Array<{name: string, value: *, index: number}>} parsedAttrs - tagExp._parsedAttrs from collectRawAttributes
  * @param {object} parser  - Xml2JsParser instance
- * @param {number} [attrsExpStart] - absolute document offset where `attrStr`
- *   begins (tagExp._attrsExpStart from buildTagExpObj). When provided, each
+ * @param {number} [attrsExpStart] - absolute document offset where the
+ *   attribute expression began (tagExp._attrsExpStart). When provided, each
  *   attribute's absolute document index is computed and passed to
  *   addAttribute() as a 4th argument: { index }. Line/col are intentionally
  *   NOT computed here — doing so would require re-scanning attrStr for
  *   newlines on every call, for a field most builders won't use; callers
  *   that need it can derive line/col from `index` plus the document text.
+ * @param {number} rawAttrMatchCount - tagExp._rawAttrMatchCount, used for the
+ *   maxAttributesPerTag limit check (counts all parsed attrs, including any
+ *   dropped by processAttrName, matching the limit's pre-existing semantics).
  */
-export function flushAttributes(attrStr, parser, attrsExpStart) {
-  if (!attrStr || attrStr.length === 0) return;
-  const matches = parseAttributes(attrStr);
-  const len = matches.length;
+export function flushAttributes(parsedAttrs, parser, attrsExpStart, rawAttrMatchCount) {
+  if (!parsedAttrs || parsedAttrs.length === 0) return;
   const maxAttrs = parser.options.limits?.maxAttributesPerTag;
-  if (maxAttrs !== undefined && maxAttrs !== null && len > maxAttrs) {
+  if (maxAttrs !== undefined && maxAttrs !== null && rawAttrMatchCount > maxAttrs) {
     const tagName = parser.currentTagDetail?.name ?? '(unknown)';
     throw new ParseError(
-      `Tag '${tagName}' has ${len} attributes, exceeding limit of ${maxAttrs}`,
+      `Tag '${tagName}' has ${rawAttrMatchCount} attributes, exceeding limit of ${maxAttrs}`,
       ErrorCode.LIMIT_MAX_ATTRIBUTES,
       { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
     );
   }
+  const len = parsedAttrs.length;
   for (let i = 0; i < len; i++) {
-    const attrName = parser.processAttrName(matches[i][1]);
-    if (attrName === false) continue;
-    const rawVal = matches[i][4];
-    const attrVal = rawVal !== undefined ? rawVal : true;
+    const a = parsedAttrs[i];
     const attrMeta = attrsExpStart !== undefined
-      ? { index: attrsExpStart + matches[i].startIndex }
+      ? { index: attrsExpStart + a.index }
       : undefined;
-    parser.outputBuilder.addAttribute(attrName, attrVal, parser.readonlyMatcher, attrMeta);
+    parser.outputBuilder.addAttribute(a.name, a.value, parser.readonlyMatcher, attrMeta);
   }
 }

package/src/DocTypeReader.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { ParseError, ErrorCode } from './ParseError.js';
-import { name as isName, qName as isQName } from 'xml-naming';
 export function readDocType(parser) {
     parser.source.markTokenStart(1);
@@ -267,7 +267,7 @@ function readEntityExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    validateEntityName(entityName, parser.xmlVersion);
+    validateEntityName(entityName, parser);
     skipSourceWhitespace(source);
     if (!source.canRead()) {
@@ -346,7 +346,7 @@ function readElementExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    if (!isName(elementName, parser.xmlVersion)) {
+    if (!parser.getNameValidator('name')(elementName)) {
         throw new ParseError(`Invalid element name: "${elementName}"`,
             ErrorCode.INVALID_TAG,
             { line: source.line, col: source.cols, index: source.startIndex });
@@ -434,7 +434,7 @@ function readNotationExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    validateEntityName(notationName, parser.xmlVersion);
+    validateEntityName(notationName, parser);
     skipSourceWhitespace(source);
     // Need all 6 chars of "SYSTEM" / "PUBLIC" before we can classify
@@ -512,8 +512,8 @@ function skipSourceWhitespace(source) {
     }
 }
-function validateEntityName(name, xmlVersion) {
-    if (isName(name, xmlVersion)) return name;
+function validateEntityName(name, parser) {
+    if (parser.getNameValidator('name')(name)) return name;
     throw new ParseError(
         `Invalid entity name "${name}"`,
         ErrorCode.ENTITY_INVALID_KEY,

package/src/InputSource/BufferSource.js CHANGED Viewed

@@ -114,6 +114,31 @@ export default class BufferSource {
     return this.buffer.slice(from, from + n).toString();
   }
+  /**
+   * See StringSource.scanTagExpEnd() for full rationale. Byte-indexed —
+   * quote/`>` are single-byte ASCII, safe for multi-byte UTF-8 content too
+   * (a `>` byte never appears as a UTF-8 continuation byte). Buffer isn't a
+   * rope, so no equivalent of FeedableSource's charCodeAt/flatten concern.
+   */
+  scanTagExpEnd() {
+    const buf = this.buffer;
+    const len = buf.length;
+    const start = this.startIndex;
+    let inSingle = false;
+    let inDouble = false;
+    for (let i = start; i < len; i++) {
+      const c = buf[i];
+      if (c === 39) { // '
+        if (!inDouble) inSingle = !inSingle;
+      } else if (c === 34) { // "
+        if (!inSingle) inDouble = !inDouble;
+      } else if (c === 62 && !inSingle && !inDouble) { // >
+        return i - start;
+      }
+    }
+    return -1;
+  }
   /**
    * Scan buffer[this.startIndex, end) for byte code 10 ('\n') and advance
    * line/cols to match, mirroring readCh()'s per-byte logic. Does NOT touch
@@ -224,20 +249,9 @@ export default class BufferSource {
   }
   readFromBuffer(n, shouldUpdate) {
-    let ch;
-    if (n === 1) {
-      ch = this.buffer[this.startIndex];
-      if (ch === 10) { // '\n'
-        this.line++;
-        this.cols = 1;
-      } else {
-        this.cols++;
-      }
-      ch = String.fromCharCode(ch);
-    } else {
-      this.cols += n;
-      ch = this.buffer.slice(this.startIndex, this.startIndex + n).toString();
-    }
+    const ch = n === 1
+      ? String.fromCharCode(this.buffer[this.startIndex])
+      : this.buffer.slice(this.startIndex, this.startIndex + n).toString();
     if (shouldUpdate) this.updateBufferBoundary(n);
     return ch;
   }

package/src/InputSource/FeedableSource.js CHANGED Viewed

@@ -1,4 +1,5 @@
 import { ParseError, ErrorCode } from '../ParseError.js';
+import { StringDecoder } from 'node:string_decoder';
 /**
  * FeedableSource — input source for the feed()/end() API.
@@ -78,6 +79,19 @@ export default class FeedableSource {
      * the next feed() double-counts every '\n' it consumed before failing.
      */
     this._marks = [null, null];
+    /**
+     * Lazily-created, persistent across the whole feed() session. Buffer
+     * chunks must go through this rather than Buffer#toString() per chunk —
+     * toString() decodes each chunk in isolation, so a multi-byte UTF-8
+     * character whose bytes straddle a chunk boundary gets corrupted (each
+     * half independently replaced with U+FFFD). StringDecoder holds back an
+     * incomplete trailing sequence internally and prepends it to the next
+     * write(), so a split character decodes correctly once the rest of its
+     * bytes arrive. Only created if Buffer input is ever fed — string-only
+     * callers never pay for it.
+     */
+    this._decoder = null;
   }
   /**
@@ -89,9 +103,28 @@ export default class FeedableSource {
    * the limit.
    *
    * @param {string|Buffer} data
+   * @returns {number} number of characters appended to the buffer (after
+   *   decoding) — callers that track fed-byte totals (e.g. XMLParser.feed's
+   *   batch threshold) should use this rather than the raw input length,
+   *   since a Buffer chunk ending mid-character may decode to fewer chars
+   *   than its byte length until the next chunk completes the sequence.
    */
   feed(data) {
-    const newData = typeof data === 'string' ? data : data.toString();
+    let newData;
+    if (typeof data === 'string') {
+      newData = data;
+    } else if (Buffer.isBuffer(data)) {
+      // Stateful decode: bytes of a multi-byte char split across two feed()
+      // calls are buffered internally by StringDecoder and correctly
+      // stitched together, instead of each chunk being decoded in isolation.
+      if (!this._decoder) this._decoder = new StringDecoder('utf8');
+      newData = this._decoder.write(data);
+    } else if (data?.toString) {
+      newData = data.toString();
+    } else {
+      throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
+    }
     const liveBytes = this.buffer.length - this.startIndex;
     if (liveBytes + newData.length > this.maxBufferSize) {
@@ -103,10 +136,20 @@ export default class FeedableSource {
     }
     this.buffer += newData;
+    return newData.length;
   }
   /** Signal that no more data will be fed. */
   end() {
+    if (this._decoder) {
+      // Flush any final incomplete byte sequence held by the decoder. For
+      // well-formed UTF-8 input this is normally '' (nothing pending); a
+      // non-empty result here means the input was genuinely truncated
+      // mid-character, and StringDecoder's own U+FFFD substitution is the
+      // correct, standard behavior for that case.
+      const tail = this._decoder.end();
+      if (tail) this.buffer += tail;
+    }
     this.isComplete = true;
   }
@@ -214,6 +257,43 @@ export default class FeedableSource {
     return this.buffer.substring(from, from + n);
   }
+  /**
+   * Quote-aware scan, from the current read position, for the unquoted '>'
+   * that ends a tag expression. Used by readTagExp() — replaces the old
+   * per-char canRead(i)/readChAt(i) loop, which profiling showed as the
+   * single largest hotspot (~23-26% of parse time).
+   *
+   * IMPORTANT: bracket char access (`buf[i]`), not `charCodeAt(i)`. This
+   * source's buffer is built via repeated `+=` in feed() (a growing V8
+   * ConsString/rope). charCodeAt forces a full rope-flatten on access —
+   * confirmed via a crash (Runtime_StringCharCodeAt -> String::SlowFlatten)
+   * causing real O(n^2) memory growth when this was first written with
+   * charCodeAt. Bracket access matches what the pre-existing readChAt()
+   * already safely used.
+   *
+   * @returns {number} relative offset of the unquoted '>', or -1 if the
+   *   buffer runs out first — caller treats that as UNEXPECTED_END, the
+   *   normal retryable chunk-boundary signal for this source.
+   */
+  scanTagExpEnd() {
+    const buf = this.buffer;
+    const len = buf.length;
+    const start = this.startIndex;
+    let inSingle = false;
+    let inDouble = false;
+    for (let i = start; i < len; i++) {
+      const c = buf[i];
+      if (c === "'") {
+        if (!inDouble) inSingle = !inSingle;
+      } else if (c === '"') {
+        if (!inSingle) inDouble = !inDouble;
+      } else if (c === '>' && !inSingle && !inDouble) {
+        return i - start;
+      }
+    }
+    return -1;
+  }
   /**
    * Read until stop string is found.
    * @param {string} stopStr
@@ -341,8 +421,14 @@ export default class FeedableSource {
     const end = this.startIndex + n;
     this._advanceLineCol(end);
     this.startIndex = end;
-    const anyMarkActive = this._marks[0] !== null || this._marks[1] !== null;
-    if (this.autoFlush && this.startIndex >= this.flushThreshold && !anyMarkActive) {
+    // No "any mark active" gate here — flush()'s own min(startIndex, marks...)
+    // origin computation already guarantees any in-progress token (at either
+    // mark level) survives the trim. A separate boolean gate on top of that
+    // was redundant, and since _marks[0] is set on every parseXml() loop
+    // iteration and never nulled outside of rewindToMark() (an error path),
+    // that gate was effectively permanent — flush() never ran in normal
+    // operation. See specs/flushArchitecture_spec.js for the regression test.
+    if (this.autoFlush && this.startIndex >= this.flushThreshold) {
       this.flush();
     }
   }
@@ -353,7 +439,10 @@ export default class FeedableSource {
    *
    * The flush origin is the minimum of all active mark positions, so that any
    * in-progress token (at either mark level) is preserved in the buffer and
-   * can be re-read after the flush.
+   * can be re-read after the flush. This is the sole safety mechanism for
+   * flush() — callers do not need to additionally check "is a mark active"
+   * before calling this; an active mark simply caps how much origin can
+   * advance, rather than blocking the call outright.
    *
    * If no marks are active, the origin is startIndex itself — everything
    * before the current read position is discarded.

package/src/InputSource/StreamSource.js CHANGED Viewed

@@ -28,7 +28,11 @@ export default class StreamSource extends FeedableSource {
   attachStream(readable, onChunk, onEnd, onError) {
     readable.on('data', chunk => {
       try {
-        this.feed(typeof chunk === 'string' ? chunk : chunk.toString());
+        // Pass the raw chunk (Buffer or string) straight through — feed()
+        // decodes Buffers via a persistent StringDecoder so a multi-byte
+        // UTF-8 character split across two chunks decodes correctly instead
+        // of each half being independently mangled by a per-chunk toString().
+        this.feed(chunk);
         onChunk(null); // chunk appended successfully — caller runs parseXml()
       } catch (err) {
         onChunk(err); // buffer overflow or coercion failure

package/src/InputSource/StringSource.js CHANGED Viewed

@@ -127,6 +127,35 @@ export default class StringSource {
     return this.buffer.substring(from, from + n);
   }
+  /**
+   * Quote-aware scan, from the current read position, for the unquoted '>'
+   * that ends a tag expression (`<tag attr="...">`). Used by readTagExp().
+   * Direct-buffer, bracket-indexed (not charCodeAt — see FeedableSource's
+   * copy of this method for why that matters there; kept identical here
+   * for consistency even though StringSource's buffer is never re-concatenated).
+   *
+   * @returns {number} relative offset (from startIndex) of the unquoted '>',
+   *   or -1 if the buffer is exhausted first (malformed input for StringSource).
+   */
+  scanTagExpEnd() {
+    const buf = this.buffer;
+    const len = buf.length;
+    const start = this.startIndex;
+    let inSingle = false;
+    let inDouble = false;
+    for (let i = start; i < len; i++) {
+      const c = buf[i];
+      if (c === "'") {
+        if (!inDouble) inSingle = !inSingle;
+      } else if (c === '"') {
+        if (!inSingle) inDouble = !inDouble;
+      } else if (c === '>' && !inSingle && !inDouble) {
+        return i - start;
+      }
+    }
+    return -1;
+  }
   /**
    * Scan buffer[this.startIndex, end) for '\n' and advance line/cols to match,
    * mirroring readCh()'s per-char logic. Does NOT touch startIndex — callers
@@ -251,8 +280,12 @@ export default class StringSource {
     const end = this.startIndex + n;
     this._advanceLineCol(end);
     this.startIndex = end;
-    const anyMarkActive = this._marks[0] >= 0 || this._marks[1] >= 0;
-    if (this.autoFlush && this.startIndex >= this.flushThreshold && !anyMarkActive) {
+    // See FeedableSource.updateBufferBoundary() for why there is no "any mark
+    // active" gate here — flush()'s own min-origin computation already
+    // protects any in-progress token; a separate gate was redundant and, since
+    // marks are effectively always set in normal operation, made flush()
+    // permanently unreachable. See specs/flushArchitecture_spec.js.
+    if (this.autoFlush && this.startIndex >= this.flushThreshold) {
       this.flush();
     }
   }

package/src/XMLParser.js CHANGED Viewed

@@ -183,19 +183,15 @@ export default class XMLParser {
       this._initFeedSession();
     }
-    let str;
-    if (typeof data === 'string') {
-      str = data;
-    } else if (Buffer.isBuffer(data)) {
-      str = data.toString();
-    } else if (data?.toString) {
-      str = data.toString();
-    } else {
-      throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
-    }
-    this._feedSource.feed(str);
-    this._pendingBytes += str.length;
+    // Pass raw data straight through — do NOT pre-convert Buffers to string
+    // here. FeedableSource.feed() decodes Buffers via a persistent
+    // StringDecoder so a multi-byte UTF-8 character split across two feed()
+    // calls decodes correctly; converting each chunk with .toString() first
+    // (as this used to do) decodes each chunk in isolation and corrupts a
+    // split character. feed() itself validates the type and throws
+    // DATA_MUST_BE_STRING for anything unsupported.
+    const appendedLength = this._feedSource.feed(data);
+    this._pendingBytes += appendedLength;
     if (this._pendingBytes >= this._batchThreshold) {
       this._runParse();

package/src/Xml2JsParser.js CHANGED Viewed

@@ -8,7 +8,7 @@ import { readDocType } from './DocTypeReader.js';
 import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
 import AutoCloseHandler from './AutoCloseHandler.js';
 import { ParseError, ErrorCode } from './ParseError.js';
-import { name as isName, qName as isQName } from 'xml-naming';
+import { createValidator } from 'xml-naming';
 class TagDetail {
   /**
@@ -66,6 +66,14 @@ export default class Xml2JsParser {
     this.tagsStack = [];
     this._stopNodeProcessor = null;
     this._exitIfTriggered = false;
+    // Lazily-built, memoized xml-naming validators (v0.3.0 createValidator).
+    // Lazy because xmlDec.version isn't final until the optional <?xml?>
+    // declaration (if any) has been read — which happens after this method
+    // runs but before any tag name is ever validated. Reset here (once per
+    // document/session, see XMLParser._createParser / feed() call sites) so
+    // a reused Xml2JsParser instance never validates against a stale
+    // xmlVersion or leaks one document's name cache into the next.
+    this._nameValidators = Object.create(null);
     this.xmlDec = {
       version: 1.0,
       lang: null,
@@ -156,13 +164,18 @@ export default class Xml2JsParser {
           { line: this.source.line, col: this.source.cols, index: this.source.startIndex }
         );
-        if (nextChar === '!' || nextChar === '?') {
+        //sorted frequency wise
+        if (nextChar === '/') {
+          this.source.updateBufferBoundary();
+          this.readClosingTag(tagStart);
+        } else if (nextChar === '!') {
           this.source.updateBufferBoundary();
           this.addTextNode();
           this.readSpecialTag(nextChar);
-        } else if (nextChar === '/') {
+        } else if (nextChar === '?') {
           this.source.updateBufferBoundary();
-          this.readClosingTag(tagStart);
+          this.addTextNode();
+          readPiTag(this);
         } else {
           this.readOpeningTag(tagStart);
         }
@@ -365,7 +378,7 @@ export default class Xml2JsParser {
     const skipTagConfig = stopNodeConfig ? null : this.isSkipTag();
     if (!options.skip.attributes && !skipTagConfig) {
-      flushAttributes(tagExp._attrsExp, this, tagExp._attrsExpStart);
+      flushAttributes(tagExp._parsedAttrs, this, tagExp._attrsExpStart, tagExp._rawAttrMatchCount);
     }
     // Stop-node and skip-tag checks AFTER attributes are set so attribute conditions work.
@@ -526,8 +539,6 @@ export default class Xml2JsParser {
           this.outputBuilder.addInputEntities(docTypeEntities);
         }
       }
-    } else if (startCh === "?") {
-      readPiTag(this);
     } else {
       throw new ParseError(`Invalid tag '<${startCh}'`, ErrorCode.INVALID_TAG, { line: this.source.line, col: this.source.cols, index: this.source.startIndex });
     }
@@ -543,10 +554,34 @@ export default class Xml2JsParser {
     }
   }
+  /**
+   * Returns a memoized xml-naming validator for the given production
+   * ('qName' for tag/attribute names, 'name' for DOCTYPE entity/element/
+   * notation names), built lazily on first use and cached per parser
+   * instance for the rest of the document/session.
+   *
+   * xmlDec.version is stored as a number (1.0 / 1.1) but xml-naming's
+   * xmlVersion option is the string '1.0'/'1.1' — normalized here rather
+   * than changing xmlDec's public shape (it's forwarded as-is to
+   * outputBuilder.addDeclaration(), so its type is part of the builder
+   * contract, not just an internal detail).
+   *
+   * @param {'name'|'qName'} production
+   */
+  getNameValidator(production) {
+    let validator = this._nameValidators[production];
+    if (!validator) {
+      const xmlVersion = this.xmlDec.version === 1.1 || this.xmlDec.version === '1.1' ? '1.1' : '1.0';
+      validator = createValidator(production, { xmlVersion });
+      this._nameValidators[production] = validator;
+    }
+    return validator;
+  }
   processAttrName(attrName) {
     const options = this.options;
     attrName = resolveNsPrefix(attrName, options.skip.nsPrefix);
-    if (!isQName(attrName, this.xmlDec.version)) { //TODO: make it optional
+    if (!this.getNameValidator('qName')(attrName)) { //TODO: make it optional
       throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME);
     }
     attrName = sanitizeName(attrName, options.onDangerousProperty);

package/src/XmlPartReader.js CHANGED Viewed

@@ -2,7 +2,6 @@
 import { ParseError, ErrorCode } from './ParseError.js';
 import { collectRawAttributes } from './AttributeProcessor.js';
 import { isSpace } from "./util.js"
-import { name as isName, qName as isQName } from 'xml-naming';
 // Re-export flushAttributes so Xml2JsParser and XmlSpecialTagsReader can
 // continue to import it from here without changing their import lines.
 export { flushAttributes } from './AttributeProcessor.js';
@@ -52,35 +51,24 @@ export function readTagExp(parser) {
   // begins — captured before any reads so buildTagExpObj can compute each
   // attribute's absolute document position from its offset within attrsExp.
   const expStart = parser.source.startIndex;
-  let inSingleQuotes = false;
-  let inDoubleQuotes = false;
-  let i;
-  let EOE = false;
-  for (i = 0; parser.source.canRead(i); i++) {
-    const char = parser.source.readChAt(i);
-    if (char === "'" && !inDoubleQuotes) {
-      inSingleQuotes = !inSingleQuotes;
-    } else if (char === '"' && !inSingleQuotes) {
-      inDoubleQuotes = !inDoubleQuotes;
-    } else if (char === '>' && !inSingleQuotes && !inDoubleQuotes) {
-      EOE = true;
-      break;
-    }
-  }
-  if (!EOE) {
-    // Buffer exhausted before '>' — chunk boundary mid-tag. Throw UNEXPECTED_END
-    // so feed()/parseStream() rewinds to the level-0 outer mark and retries.
+  const relEnd = parser.source.scanTagExpEnd();
+  if (relEnd === -1) {
+    // Buffer exhausted before an unquoted '>' was found — chunk boundary
+    // mid-tag. Throw UNEXPECTED_END so feed()/parseStream() rewinds to the
+    // level-0 outer mark and retries. (Note: scanTagExpEnd() only returns a
+    // non-negative index once both quote flags are already balanced-closed —
+    // by construction, not by a separate post-scan check — so there is no
+    // longer a distinct "unclosed quote but '>' was found" case to detect;
+    // the old UNCLOSED_QUOTE branch here was checking the same two flags
+    // immediately after the only code path that requires them both false,
+    // making it permanently unreachable.)
     throw new ParseError("Unexpected closing of source waiting for '>'", ErrorCode.UNEXPECTED_END);
-  } else if (inSingleQuotes || inDoubleQuotes) {
-    // '>' found but a quote was never closed — real syntax error.
-    throw new ParseError("Invalid attribute expression. Quote is not properly closed", ErrorCode.UNCLOSED_QUOTE);
   }
-  const exp = parser.source.readStr(i);
-  parser.source.updateBufferBoundary(i + 1);
+  const exp = parser.source.readStr(relEnd);
+  parser.source.updateBufferBoundary(relEnd + 1);
   return buildTagExpObj(exp, parser, expStart);
 }
@@ -183,7 +171,7 @@ function buildTagExpObj(exp, parser, expStart, forceToReadAttrs = false) {
   tagExp.tagName = tagExp.tagName.trimEnd();
   tagExp._attrsExp = attrsExp;
-  if (!isQName(tagExp.tagName, parser.xmlDec.version)) {
+  if (!parser.getNameValidator('qName')(tagExp.tagName)) {
     throw new ParseError("Invalid tag name", ErrorCode.INVALID_TAG_NAME);
   }

package/src/XmlSpecialTagsReader.js CHANGED Viewed

@@ -50,6 +50,17 @@ export function readPiTag(parser) {
     }
     parser.xmlDec.encoding = tagExp.rawAttributes?.encoding;
     parser.xmlDec.standalone = tagExp.rawAttributes?.standalone;
+    // BUG FIX: getNameValidator('qName') was already called (and memoized)
+    // above, at the moment this PI tag's own name ("xml") got validated — before
+    // xmlDec.version was known, so it was always cached with the '1.0'
+    // default. Every subsequent tag/attribute name in the document —
+    // including the root element — would silently be checked against XML
+    // 1.0 rules even for a document declaring version="1.1". Reset the
+    // cache now that the real version is known; this runs at most once per
+    // document (a <?xml?> declaration can only appear once), so the cost is
+    // negligible.
+    parser._nameValidators = Object.create(null);
   }
   // Flush attributes into the output builder's this.attributes accumulator
@@ -57,7 +68,7 @@ export function readPiTag(parser) {
   // does for regular tags. PI tags are not pushed onto the matcher, so no
   // updateCurrent() call is needed here.
   if (!skipOptions.attributes) {
-    flushAttributes(tagExp._attrsExp, parser, tagExp._attrsExpStart);
+    flushAttributes(tagExp._parsedAttrs, parser, tagExp._attrsExpStart, tagExp._rawAttrMatchCount);
   }
   if (tagExp.tagName === "xml") {

package/src/CharsSymbol.js DELETED Viewed

@@ -1,16 +0,0 @@
-export default {
-  "<" : "<", //tag start
-  ">" : ">", //tag end
-  "/" : "/", //close tag
-  "!" : "!", //comment or docttype
-  "!--" : "!--", //comment
-  "-->" : "-->", //comment end
-  "?" : "?", //pi
-  "?>" : "?>", //pi end
-  "?xml" : "?xml", //pi end
-  "![" : "![", //cdata
-  "]]>" : "]]>", //cdata end
-  "[" : "[",
-  "-" : "-",
-  "D" : "D",
-}