npm - @nodable/flexible-xml-parser - Versions diffs - 1.1.1 → 1.2.0 - Mend

@nodable/flexible-xml-parser 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/CHANGELOG.md +8 -0
package/package.json +4 -3
package/src/AttributeProcessor.js +85 -32
package/src/DocTypeReader.js +6 -6
package/src/OptionsBuilder.js +1 -0
package/src/XMLParser.js +42 -12
package/src/Xml2JsParser.js +23 -5
package/src/XmlPartReader.js +10 -7
package/src/XmlSpecialTagsReader.js +15 -5
package/src/util.js +9 -10

package/CHANGELOG.md CHANGED Viewed

@@ -0,0 +1,8 @@
+**1.2.0 (2026-05-13)**
+- fix: the tag name can be separated from the rest of the tag expression by any kind of whitespace.
+- fix: parser should not fail when tag expression is very long
+- fix: stop node with namespace should work
+- support `feedable.bufferSize` option to improve/speed up feed method.
+- integrate the `xml-naming` library, which also takes the XML version into account

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nodable/flexible-xml-parser",
-  "version": "1.1.1",
+  "version": "1.2.0",
   "description": "Fastest XML parser in pure JS with fully customizable ouput",
   "main": "./lib/fxp.cjs",
   "type": "module",
@@ -45,15 +45,16 @@
   },
   "dependencies": {
     "@nodable/base-output-builder": "^1.0.5",
-    "@nodable/compact-builder": "^1.0.6",
+    "@nodable/compact-builder": "^1.0.8",
     "path-expression-matcher": "^1.5.0",
-    "strnum": "^2.2.2"
+    "xml-naming": "^0.1.0"
   },
   "devDependencies": {
     "@babel/core": "^7.29.0",
     "@babel/plugin-transform-runtime": "^7.29.0",
     "@babel/preset-env": "^7.29.2",
     "@babel/register": "^7.28.6",
+    "@byspec/xml": "^0.1.0",
     "@nodable/entities": "^2.1.0",
     "@types/node": "^20.19.37",
     "babel-loader": "^10.1.1",

package/src/AttributeProcessor.js CHANGED Viewed

@@ -1,5 +1,6 @@
 'use strict';
 import { ParseError, ErrorCode } from './ParseError.js';
+import { isSpaceCode } from "./util.js"
 /**
  * AttributeProcessor — owns all attribute parsing logic.
@@ -21,9 +22,87 @@ import { ParseError, ErrorCode } from './ParseError.js';
  *     complete attribute context when value parsers execute.
  */
-// Module-level regex. Stateless between calls because getAllMatches() always
-// resets lastIndex to 0 before iterating — see getAllMatches() below.
-const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');
+// Module-level regex kept for reference only — no longer called from this
+// module. parseAttributes() below replaces it with an O(n) linear scanner
+// that is immune to catastrophic backtracking and stack overflow.
+// const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');
+/**
+ * Parse an attribute expression string into an array of match tuples.
+ *
+ * Each element has the same shape the old getAllMatches() returned so that
+ * callers are unchanged:
+ *   [fullMatch, name, '=value' | undefined, quote | undefined, value | undefined]
+ *
+ * The implementation is a single O(n) pass over char codes with no regex and
+ * no recursion, making it safe for arbitrarily long attribute strings.
+ *
+ * State machine:
+ *   SEEK_NAME  — skipping whitespace looking for the start of an attr name
+ *   IN_NAME    — accumulating a name token until whitespace or '='
+ *   SEEK_VALUE — saw name + optional whitespace, now expecting '=' or next name
+ *   IN_VALUE   — inside a quoted value, accumulating until the closing quote
+ *
+ * @param {string} attrStr
+ * @returns {Array}  array of match tuples (see shape above)
+ */
+function parseAttributes(attrStr) {
+  const results = []; // one tuple per attribute, in source order
+  const len = attrStr.length;
+  let i = 0; // scan cursor into attrStr; only ever moves forward
+  while (i < len) {
+    // Skip whitespace between attributes (whatever isSpaceCode() accepts)
+    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
+    if (i >= len) break;
+    // Read name: everything up to the next '=' (char code 61) or whitespace
+    const nameStart = i;
+    while (i < len && attrStr.charCodeAt(i) !== 61 && !isSpaceCode(attrStr.charCodeAt(i))) i++;
+    const name = attrStr.substring(nameStart, i); // '' when the scan started directly on '='
+    // Skip whitespace before '='
+    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
+    if (i >= len || attrStr.charCodeAt(i) !== 61) {
+      // Boolean attribute — no '='; slots 2..4 of the tuple stay undefined
+      const m = [name, name, undefined, undefined, undefined];
+      m.startIndex = nameStart;
+      results.push(m);
+      continue;
+    }
+    i++; // skip '='
+    // Skip whitespace after '='
+    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
+    // Read quoted value; charCodeAt() yields NaN past the end, which fails the quote test below
+    const quote = attrStr.charCodeAt(i);
+    if (quote === 34 || quote === 39) { // " or '
+      i++; // skip opening quote
+      const valueStart = i; // NOTE(review): never read after this line — looks like a leftover
+      let value = '';
+      let segStart = i; // start of the pending slice not yet appended to value
+      while (i < len && attrStr.charCodeAt(i) !== quote) {
+        const c = attrStr.charCodeAt(i);
+        if (c === 10 || c === 13) { // \n or \r → space per XML §3.3.3; NOTE(review): each char is replaced on its own, so "\r\n" yields two spaces
+          value += attrStr.substring(segStart, i) + ' ';
+          segStart = i + 1;
+        }
+        i++;
+      }
+      value += attrStr.substring(segStart, i);
+      i++; // skip closing quote; NOTE(review): also reached when no closing quote exists and the scan ran to end of input
+      const quoteChar = String.fromCharCode(quote);
+      const m = [name + '=' + quoteChar + value + quoteChar, name, '=' + quoteChar + value + quoteChar, quoteChar, value]; // m[0] is rebuilt from the parts, not sliced from attrStr
+      m.startIndex = nameStart;
+      results.push(m);
+    } // no quote after '=': nothing is pushed for this name and the outer loop resumes scanning at i
+  }
+  return results;
+}
 /**
  * Pass 1: extract raw (unparsed) attribute values into rawAttributes.
@@ -33,9 +112,9 @@ const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm
  * @param {object} tagExp - tagExp object to populate rawAttributes (Object.create(null))
  */
 export function collectRawAttributes(attrStr, parser, tagExp) {
   if (!attrStr || attrStr.length === 0) return;
-  const matches = getAllMatches(attrStr, attrsRegx);
+  const matches = parseAttributes(attrStr);
   const len = matches.length;
   let count = 0;
   for (let i = 0; i < len; i++) {
@@ -56,7 +135,7 @@ export function collectRawAttributes(attrStr, parser, tagExp) {
  */
 export function flushAttributes(attrStr, parser) {
   if (!attrStr || attrStr.length === 0) return;
-  const matches = getAllMatches(attrStr, attrsRegx);
+  const matches = parseAttributes(attrStr);
   const len = matches.length;
   const maxAttrs = parser.options.limits?.maxAttributesPerTag;
@@ -78,30 +157,4 @@ export function flushAttributes(attrStr, parser) {
     parser.outputBuilder.addAttribute(attrName, attrVal, parser.readonlyMatcher);
   }
-}
-/**
- * Run the regex against the string and return all capture groups.
- * lastIndex is always reset to 0 before iterating so the module-level
- * stateful regex is safe to share across calls.
- *
- * @param {string} string
- * @param {RegExp} regex
- * @returns {Array}
- */
-function getAllMatches(string, regex) {
-  regex.lastIndex = 0;
-  const matches = [];
-  let match = regex.exec(string);
-  while (match) {
-    const allmatches = [];
-    allmatches.startIndex = regex.lastIndex - match[0].length;
-    const len = match.length;
-    for (let index = 0; index < len; index++) {
-      allmatches.push(match[index]);
-    }
-    matches.push(allmatches);
-    match = regex.exec(string);
-  }
-  return matches;
 }

package/src/DocTypeReader.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import { isName } from './util.js';
 import { ParseError, ErrorCode } from './ParseError.js';
+import { name as isName, qName as isQName } from 'xml-naming';
 export function readDocType(parser) {
     parser.source.markTokenStart(1);
@@ -267,7 +267,7 @@ function readEntityExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    validateEntityName(entityName);
+    validateEntityName(entityName, parser.xmlVersion);
     skipSourceWhitespace(source);
     if (!source.canRead()) {
@@ -346,7 +346,7 @@ function readElementExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    if (!isName(elementName)) {
+    if (!isName(elementName, parser.xmlVersion)) {
         throw new ParseError(`Invalid element name: "${elementName}"`,
             ErrorCode.INVALID_TAG,
             { line: source.line, col: source.cols, index: source.startIndex });
@@ -434,7 +434,7 @@ function readNotationExp(parser) {
             { line: source.line, col: source.cols, index: source.startIndex });
     }
-    validateEntityName(notationName);
+    validateEntityName(notationName, parser.xmlVersion);
     skipSourceWhitespace(source);
     // Need all 6 chars of "SYSTEM" / "PUBLIC" before we can classify
@@ -512,8 +512,8 @@ function skipSourceWhitespace(source) {
     }
 }
-function validateEntityName(name) {
-    if (isName(name)) return name;
+function validateEntityName(name, xmlVersion) {
+    if (isName(name, xmlVersion)) return name;
     throw new ParseError(
         `Invalid entity name "${name}"`,
         ErrorCode.ENTITY_INVALID_KEY,

package/src/OptionsBuilder.js CHANGED Viewed

@@ -132,6 +132,7 @@ export const defaultOptions = {
     maxBufferSize: 10 * 1024 * 1024,
     autoFlush: true,
     flushThreshold: 1024,
+    bufferSize: 256
   },
   // --- exitIf ---

package/src/XMLParser.js CHANGED Viewed

@@ -13,6 +13,10 @@ export default class XMLParser {
     this._feedParser = null;
     this._feedSource = null;
     this._isFeeding = false;
+    // ── Batching state ──────────────────────────────────
+    this._pendingBytes = 0;
+    this._batchThreshold = this.options.feedable?.bufferSize;
   }
   // ─── One-shot parse methods ───────────────────────────────────────────────
@@ -126,6 +130,37 @@ export default class XMLParser {
   // ─── Incremental feed()/end() API ────────────────────────────────────────
+  _runParse() {
+    if (!this._feedParser) return;
+    const beforePos = this._feedSource.startIndex; // source position before this parse attempt (presumably the consumed offset — confirm against the feed source)
+    try {
+      this._feedParser.parseXml();
+    } catch (err) {
+      if (err.code === ErrorCode.UNEXPECTED_END) {
+        this._feedSource.rewindToMark();
+      } else {
+        throw err; // NOTE(review): the inline try/catch removed from feed() called this._cleanupFeedSession() before rethrowing; this path does not
+      }
+    }
+    const afterPos = this._feedSource.startIndex;
+    const didAdvance = afterPos > beforePos;
+    if (didAdvance) {
+      // Real progress made — clear the pending counter (the threshold itself is left as it is)
+      this._pendingBytes = 0;
+    } else {
+      // Parser is stuck mid-token — grow the threshold to avoid
+      // hammering parseXml() until significantly more data arrives
+      this._batchThreshold = Math.min(
+        this._batchThreshold * 2, // NOTE(review): NaN if _batchThreshold is undefined (the constructor reads options.feedable?.bufferSize) — confirm a default is always present
+        this.options.feedable.maxBufferSize
+      );
+    }
+  }
   /**
    * Feed an XML data chunk for incremental parsing.
    *
@@ -160,20 +195,12 @@ export default class XMLParser {
     }
     this._feedSource.feed(str);
+    this._pendingBytes += str.length;
-    try {
-      this._feedParser.parseXml();
-    } catch (err) {
-      if (err.code === ErrorCode.UNEXPECTED_END) {
-        // Chunk boundary fell mid-token. Rewind to the token start so the
-        // incomplete bytes are re-parsed when the next chunk arrives.
-        this._feedSource.rewindToMark();
-      } else {
-        // Real parse error — clean up and propagate.
-        this._cleanupFeedSession();
-        throw err;
-      }
+    if (this._pendingBytes >= this._batchThreshold) {
+      this._runParse();
     }
+    // Otherwise, delay parsing until next feed() or end()
     return this;
   }
@@ -201,6 +228,9 @@ export default class XMLParser {
       throw new ParseError('No data fed. Call feed() before end().', ErrorCode.NOT_STREAMING);
     }
+    // Force a final parse (any pending bytes are now processed)
+    this._runParse();
     try {
       // Mark the source as complete so readers know there is no more data.
       this._feedSource.end();

package/src/Xml2JsParser.js CHANGED Viewed

@@ -5,9 +5,10 @@ import { StopNodeProcessor } from './StopNodeProcessor.js';
 import { readComment, readCdata, readPiTag } from './XmlSpecialTagsReader.js';
 import { Expression, ExpressionSet, Matcher } from 'path-expression-matcher';
 import { readDocType } from './DocTypeReader.js';
-import { isName, DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
+import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
 import AutoCloseHandler from './AutoCloseHandler.js';
 import { ParseError, ErrorCode } from './ParseError.js';
+import { name as isName, qName as isQName } from 'xml-naming';
 class TagDetail {
   /**
@@ -60,6 +61,7 @@ export default class Xml2JsParser {
     this.tagsStack = [];
     this._stopNodeProcessor = null;
     this._exitIfTriggered = false;
+    this.xmlVersion = '1.0';
     if (!this.matcher) {
       this.matcher = new Matcher();
@@ -283,6 +285,18 @@ export default class Xml2JsParser {
       this.source.startIndex,
     );
+    // Extract namespace prefix and local name from raw tag name (e.g. "ns:tag" → "ns", "tag").
+    // Always done from the raw name (tagExp.tagName), before processTagName strips the prefix,
+    // so these values are stable regardless of skip.nsPrefix.
+    const colonIdx = tagExp.tagName.indexOf(':');
+    const tagNamespace = colonIdx !== -1 ? tagExp.tagName.slice(0, colonIdx) : undefined;
+    // Local name for the matcher: prefix-free always (e.g. "code" from "ns:code").
+    // The matcher library tracks namespace separately via the 3rd push() argument —
+    // passing the full "ns:code" as the tag name would break ns::code expression matching.
+    const matcherTagName = tagNamespace !== undefined
+      ? tagExp.tagName.slice(colonIdx + 1)
+      : processedTagName;
     // ── Limit: maxNestedTags ─────────────────────────────────────────────────
     const maxNested = options.limits?.maxNestedTags;
     if (maxNested !== undefined && maxNested !== null) {
@@ -304,7 +318,7 @@ export default class Xml2JsParser {
       raeAttrLen = tagExp.rawAttributesLen;
     }
-    this.matcher.push(processedTagName, {});
+    this.matcher.push(matcherTagName, {}, tagNamespace);
     if (raeAttrLen > 0) {
       this.matcher.updateCurrent(rawAttributes);
     }
@@ -334,7 +348,10 @@ export default class Xml2JsParser {
       this.matcher.pop();
     } else if (stopNodeConfig) {
       // Create a fresh processor with the matching nested + skipEnclosures config.
-      this._stopNodeProcessor = new StopNodeProcessor(processedTagName, {
+      // Raw tag name (tagExp.tagName) is used — the processor scans the source
+      // character-by-character and must match the prefix-as-written (e.g. "ns:code"),
+      // independent of what skip.nsPrefix does to the processed output name.
+      this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, {
         nested: stopNodeConfig.nested,
         skipEnclosures: stopNodeConfig.skipEnclosures,
       });
@@ -351,7 +368,8 @@ export default class Xml2JsParser {
     } else if (skipTagConfig) {
       // Skip tag: collect raw content (to advance the source past the closing tag)
       // but call no output builder methods — the tag is silently dropped.
-      this._stopNodeProcessor = new StopNodeProcessor(processedTagName, {
+      // Raw tag name used for the same reason as the stop-node branch above.
+      this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, {
         nested: skipTagConfig.nested,
         skipEnclosures: skipTagConfig.skipEnclosures,
       });
@@ -460,7 +478,7 @@ export default class Xml2JsParser {
   processAttrName(attrName) {
     const options = this.options;
     attrName = resolveNsPrefix(attrName, options.skip.nsPrefix);
-    if (!isName(attrName)) { //TODO: make it optional
+    if (!isQName(attrName, this.xmlVersion)) { //TODO: make it optional
       throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME);
     }
     attrName = sanitizeName(attrName, options.onDangerousProperty);

package/src/XmlPartReader.js CHANGED Viewed

@@ -1,7 +1,8 @@
 'use strict';
 import { ParseError, ErrorCode } from './ParseError.js';
 import { collectRawAttributes } from './AttributeProcessor.js';
-import { isName } from "./util.js"
+import { isSpace } from "./util.js"
+import { name as isName, qName as isQName } from 'xml-naming';
 // Re-export flushAttributes so Xml2JsParser and XmlSpecialTagsReader can
 // continue to import it from here without changing their import lines.
 export { flushAttributes } from './AttributeProcessor.js';
@@ -157,19 +158,20 @@ function buildTagExpObj(exp, parser) {
   let attrsExp = "";
   let i = 0;
-  for (; i < expLen; i++) {
-    if (exp[i] === " ") {
+  for (; i < exp.length; i++) {
+    const c = exp[i];
+    if (isSpace(c)) {
       tagExp.tagName = exp.substring(0, i);
       attrsExp = exp.substring(i + 1);
       break;
     }
   }
   //only tag
-  if (tagExp.tagName.length === 0 && i === expLen) tagExp.tagName = exp;
+  if (tagExp.tagName.length === 0 && i === exp.length) tagExp.tagName = exp;
   tagExp.tagName = tagExp.tagName.trimEnd();
   tagExp._attrsExp = attrsExp;
-  if (!isName(tagExp.tagName)) {
+  if (!isQName(tagExp.tagName, parser.xmlVersion)) {
     throw new ParseError("Invalid tag name", ErrorCode.INVALID_TAG_NAME);
   }
@@ -178,6 +180,7 @@ function buildTagExpObj(exp, parser) {
   if (!parser.options.skip.attributes && attrsExp.length > 0) {
     collectRawAttributes(attrsExp, parser, tagExp);
   }
+  // console.log(tagExp)
   return tagExp;
-}
+}

package/src/XmlSpecialTagsReader.js CHANGED Viewed

@@ -36,11 +36,21 @@ export function readPiTag(parser) {
   parser.source.markTokenStart(1);
   //<? already consumed
   let tagExp = readPiExp(parser, "?>");
-  if (!tagExp) throw new ParseError(
-    "Invalid Pi Tag expression.",
-    ErrorCode.INVALID_TAG,
-    { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
-  );
+  if (!tagExp) {
+    throw new ParseError(
+      "Invalid Pi Tag expression.",
+      ErrorCode.INVALID_TAG,
+      { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
+    )
+  } else if (tagExp.tagName === "xml") {
+    // Read version from the declaration and store it on the parser for validators.
+    const version = tagExp.rawAttributes?.version;
+    if (version === '1.1') {
+      parser.xmlVersion = 1.1;
+    } else {
+      parser.xmlVersion = 1.0; // default
+    }
+  }
   // Flush attributes into the output builder's this.attributes accumulator
   // so addDeclaration() / addInstruction() pick them up, mirroring what readOpeningTag

package/src/util.js CHANGED Viewed

@@ -1,10 +1,3 @@
-'use strict';
-const nameStartChar = ':A-Za-z_\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD';
-const nameChar = nameStartChar + '\\-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040';
-export const nameRegexp = '[' + nameStartChar + '][' + nameChar + ']*';
-const regexName = new RegExp('^' + nameRegexp + '$');
 export function getAllMatches(string, regex) {
   const matches = [];
   let match = regex.exec(string);
@@ -21,9 +14,15 @@ export function getAllMatches(string, regex) {
   return matches;
 }
-export const isName = function (string) {
-  const match = regexName.exec(string);
-  return !(match === null || typeof match === 'undefined');
+export function isSpace(char) { // true when char is exactly one of: space, tab, LF, CR, form feed
+  return char === " " || char === "\t" || char === "\n" || char === "\r" || char === "\f"; // NOTE(review): XML's S production lists only #x20, #x9, #xD, #xA — form feed is accepted here in addition
+}
+export function isSpaceCode(code) { // char-code counterpart of isSpace(): takes a numeric code unit such as charCodeAt() returns
+  return code === 32 || code === 9 || code === 10 || code === 13 || code === 12; // space \t \n \r \f; NOTE(review): 12 (form feed) is not in XML's S production
 }
 export function isExist(v) {