@nodable/flexible-xml-parser 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -0,0 +1,8 @@
1
+
2
+ **1.2.0 (2026-05-13)**
3
+ - fix: the tag name can be separated from the rest of the tag expression by any kind of whitespace.
4
+ - fix: parser should not fail when the tag expression is very long
5
+ - fix: stop node with namespace should work
6
+ - support `feedable.bufferSize` option to improve/speed up feed method.
7
+ - integrate the `xml-naming` library, which also takes the XML version into account
8
+
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nodable/flexible-xml-parser",
3
- "version": "1.1.1",
3
+ "version": "1.2.0",
4
4
  "description": "Fastest XML parser in pure JS with fully customizable output",
5
5
  "main": "./lib/fxp.cjs",
6
6
  "type": "module",
@@ -45,15 +45,16 @@
45
45
  },
46
46
  "dependencies": {
47
47
  "@nodable/base-output-builder": "^1.0.5",
48
- "@nodable/compact-builder": "^1.0.6",
48
+ "@nodable/compact-builder": "^1.0.8",
49
49
  "path-expression-matcher": "^1.5.0",
50
- "strnum": "^2.2.2"
50
+ "xml-naming": "^0.1.0"
51
51
  },
52
52
  "devDependencies": {
53
53
  "@babel/core": "^7.29.0",
54
54
  "@babel/plugin-transform-runtime": "^7.29.0",
55
55
  "@babel/preset-env": "^7.29.2",
56
56
  "@babel/register": "^7.28.6",
57
+ "@byspec/xml": "^0.1.0",
57
58
  "@nodable/entities": "^2.1.0",
58
59
  "@types/node": "^20.19.37",
59
60
  "babel-loader": "^10.1.1",
@@ -1,5 +1,6 @@
1
1
  'use strict';
2
2
  import { ParseError, ErrorCode } from './ParseError.js';
3
+ import { isSpaceCode } from "./util.js"
3
4
 
4
5
  /**
5
6
  * AttributeProcessor — owns all attribute parsing logic.
@@ -21,9 +22,87 @@ import { ParseError, ErrorCode } from './ParseError.js';
21
22
  * complete attribute context when value parsers execute.
22
23
  */
23
24
 
24
- // Module-level regex. Stateless between calls because getAllMatches() always
25
- // resets lastIndex to 0 before iterating see getAllMatches() below.
26
- const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');
25
+ // Module-level regex kept for reference only; it is no longer called from this
26
+ // module. parseAttributes() below replaces it with an O(n) linear scanner
27
+ // that is immune to catastrophic backtracking and stack overflow.
28
+ // const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');
29
+
30
+ /**
31
+ * Parse an attribute expression string into an array of match tuples.
32
+ *
33
+ * Each element has the same shape the old getAllMatches() returned so that
34
+ * callers are unchanged:
35
+ * [fullMatch, name, '=value' | undefined, quote | undefined, value | undefined]
36
+ *
37
+ * The implementation is a single O(n) pass over char codes with no regex and
38
+ * no recursion, making it safe for arbitrarily long attribute strings.
39
+ *
40
+ * State machine:
41
+ * SEEK_NAME — skipping whitespace looking for the start of an attr name
42
+ * IN_NAME — accumulating a name token until whitespace or '='
43
+ * SEEK_VALUE — saw name + optional whitespace, now expecting '=' or next name
44
+ * IN_VALUE — inside a quoted value, accumulating until the closing quote
45
+ *
46
+ * @param {string} attrStr
47
+ * @returns {Array} array of match tuples (see shape above)
48
+ */
49
+ function parseAttributes(attrStr) {
50
+ const results = [];
51
+ const len = attrStr.length;
52
+ let i = 0;
53
+
54
+ while (i < len) {
55
+ // Skip whitespace between attributes
56
+ while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
57
+ if (i >= len) break;
58
+
59
+ // Read name
60
+ const nameStart = i;
61
+ while (i < len && attrStr.charCodeAt(i) !== 61 && !isSpaceCode(attrStr.charCodeAt(i))) i++;
62
+ const name = attrStr.substring(nameStart, i);
63
+
64
+ // Skip whitespace before '='
65
+ while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
66
+
67
+ if (i >= len || attrStr.charCodeAt(i) !== 61) {
68
+ // Boolean attribute — no '='
69
+ const m = [name, name, undefined, undefined, undefined];
70
+ m.startIndex = nameStart;
71
+ results.push(m);
72
+ continue;
73
+ }
74
+
75
+ i++; // skip '='
76
+
77
+ // Skip whitespace after '='
78
+ while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
79
+
80
+ // Read quoted value
81
+ const quote = attrStr.charCodeAt(i);
82
+ if (quote === 34 || quote === 39) { // " or '
83
+ i++; // skip opening quote
84
+ const valueStart = i;
85
+ let value = '';
86
+ let segStart = i;
87
+ while (i < len && attrStr.charCodeAt(i) !== quote) {
88
+ const c = attrStr.charCodeAt(i);
89
+ if (c === 10 || c === 13) { // \n or \r → space per XML §3.3.3
90
+ value += attrStr.substring(segStart, i) + ' ';
91
+ segStart = i + 1;
92
+ }
93
+ i++;
94
+ }
95
+ value += attrStr.substring(segStart, i);
96
+ i++; // skip closing quote
97
+ const quoteChar = String.fromCharCode(quote);
98
+ const m = [name + '=' + quoteChar + value + quoteChar, name, '=' + quoteChar + value + quoteChar, quoteChar, value];
99
+ m.startIndex = nameStart;
100
+ results.push(m);
101
+ }
102
+ }
103
+
104
+ return results;
105
+ }
27
106
 
28
107
  /**
29
108
  * Pass 1: extract raw (unparsed) attribute values into rawAttributes.
@@ -33,9 +112,9 @@ const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm
33
112
  * @param {object} tagExp - tagExp object to populate rawAttributes (Object.create(null))
34
113
  */
35
114
  export function collectRawAttributes(attrStr, parser, tagExp) {
36
-
37
115
  if (!attrStr || attrStr.length === 0) return;
38
- const matches = getAllMatches(attrStr, attrsRegx);
116
+
117
+ const matches = parseAttributes(attrStr);
39
118
  const len = matches.length;
40
119
  let count = 0;
41
120
  for (let i = 0; i < len; i++) {
@@ -56,7 +135,7 @@ export function collectRawAttributes(attrStr, parser, tagExp) {
56
135
  */
57
136
  export function flushAttributes(attrStr, parser) {
58
137
  if (!attrStr || attrStr.length === 0) return;
59
- const matches = getAllMatches(attrStr, attrsRegx);
138
+ const matches = parseAttributes(attrStr);
60
139
  const len = matches.length;
61
140
 
62
141
  const maxAttrs = parser.options.limits?.maxAttributesPerTag;
@@ -78,30 +157,4 @@ export function flushAttributes(attrStr, parser) {
78
157
 
79
158
  parser.outputBuilder.addAttribute(attrName, attrVal, parser.readonlyMatcher);
80
159
  }
81
- }
82
-
83
- /**
84
- * Run the regex against the string and return all capture groups.
85
- * lastIndex is always reset to 0 before iterating so the module-level
86
- * stateful regex is safe to share across calls.
87
- *
88
- * @param {string} string
89
- * @param {RegExp} regex
90
- * @returns {Array}
91
- */
92
- function getAllMatches(string, regex) {
93
- regex.lastIndex = 0;
94
- const matches = [];
95
- let match = regex.exec(string);
96
- while (match) {
97
- const allmatches = [];
98
- allmatches.startIndex = regex.lastIndex - match[0].length;
99
- const len = match.length;
100
- for (let index = 0; index < len; index++) {
101
- allmatches.push(match[index]);
102
- }
103
- matches.push(allmatches);
104
- match = regex.exec(string);
105
- }
106
- return matches;
107
160
  }
@@ -1,5 +1,5 @@
1
- import { isName } from './util.js';
2
1
  import { ParseError, ErrorCode } from './ParseError.js';
2
+ import { name as isName, qName as isQName } from 'xml-naming';
3
3
 
4
4
  export function readDocType(parser) {
5
5
  parser.source.markTokenStart(1);
@@ -267,7 +267,7 @@ function readEntityExp(parser) {
267
267
  { line: source.line, col: source.cols, index: source.startIndex });
268
268
  }
269
269
 
270
- validateEntityName(entityName);
270
+ validateEntityName(entityName, parser.xmlVersion);
271
271
  skipSourceWhitespace(source);
272
272
 
273
273
  if (!source.canRead()) {
@@ -346,7 +346,7 @@ function readElementExp(parser) {
346
346
  { line: source.line, col: source.cols, index: source.startIndex });
347
347
  }
348
348
 
349
- if (!isName(elementName)) {
349
+ if (!isName(elementName, parser.xmlVersion)) {
350
350
  throw new ParseError(`Invalid element name: "${elementName}"`,
351
351
  ErrorCode.INVALID_TAG,
352
352
  { line: source.line, col: source.cols, index: source.startIndex });
@@ -434,7 +434,7 @@ function readNotationExp(parser) {
434
434
  { line: source.line, col: source.cols, index: source.startIndex });
435
435
  }
436
436
 
437
- validateEntityName(notationName);
437
+ validateEntityName(notationName, parser.xmlVersion);
438
438
  skipSourceWhitespace(source);
439
439
 
440
440
  // Need all 6 chars of "SYSTEM" / "PUBLIC" before we can classify
@@ -512,8 +512,8 @@ function skipSourceWhitespace(source) {
512
512
  }
513
513
  }
514
514
 
515
- function validateEntityName(name) {
516
- if (isName(name)) return name;
515
+ function validateEntityName(name, xmlVersion) {
516
+ if (isName(name, xmlVersion)) return name;
517
517
  throw new ParseError(
518
518
  `Invalid entity name "${name}"`,
519
519
  ErrorCode.ENTITY_INVALID_KEY,
@@ -132,6 +132,7 @@ export const defaultOptions = {
132
132
  maxBufferSize: 10 * 1024 * 1024,
133
133
  autoFlush: true,
134
134
  flushThreshold: 1024,
135
+ bufferSize: 256
135
136
  },
136
137
 
137
138
  // --- exitIf ---
package/src/XMLParser.js CHANGED
@@ -13,6 +13,10 @@ export default class XMLParser {
13
13
  this._feedParser = null;
14
14
  this._feedSource = null;
15
15
  this._isFeeding = false;
16
+
17
+ // ── Batching state ──────────────────────────────────
18
+ this._pendingBytes = 0;
19
+ this._batchThreshold = this.options.feedable?.bufferSize;
16
20
  }
17
21
 
18
22
  // ─── One-shot parse methods ───────────────────────────────────────────────
@@ -126,6 +130,37 @@ export default class XMLParser {
126
130
 
127
131
  // ─── Incremental feed()/end() API ────────────────────────────────────────
128
132
 
133
+ _runParse() {
134
+ if (!this._feedParser) return;
135
+
136
+ const beforePos = this._feedSource.startIndex; // bytes consumed so far
137
+
138
+ try {
139
+ this._feedParser.parseXml();
140
+ } catch (err) {
141
+ if (err.code === ErrorCode.UNEXPECTED_END) {
142
+ this._feedSource.rewindToMark();
143
+ } else {
144
+ throw err;
145
+ }
146
+ }
147
+
148
+ const afterPos = this._feedSource.startIndex;
149
+ const didAdvance = afterPos > beforePos;
150
+
151
+ if (didAdvance) {
152
+ // Real progress made — reset threshold normally
153
+ this._pendingBytes = 0;
154
+ } else {
155
+ // Parser is stuck mid-token — grow the threshold to avoid
156
+ // hammering parseXml() until significantly more data arrives
157
+ this._batchThreshold = Math.min(
158
+ this._batchThreshold * 2,
159
+ this.options.feedable.maxBufferSize
160
+ );
161
+ }
162
+ }
163
+
129
164
  /**
130
165
  * Feed an XML data chunk for incremental parsing.
131
166
  *
@@ -160,20 +195,12 @@ export default class XMLParser {
160
195
  }
161
196
 
162
197
  this._feedSource.feed(str);
198
+ this._pendingBytes += str.length;
163
199
 
164
- try {
165
- this._feedParser.parseXml();
166
- } catch (err) {
167
- if (err.code === ErrorCode.UNEXPECTED_END) {
168
- // Chunk boundary fell mid-token. Rewind to the token start so the
169
- // incomplete bytes are re-parsed when the next chunk arrives.
170
- this._feedSource.rewindToMark();
171
- } else {
172
- // Real parse error — clean up and propagate.
173
- this._cleanupFeedSession();
174
- throw err;
175
- }
200
+ if (this._pendingBytes >= this._batchThreshold) {
201
+ this._runParse();
176
202
  }
203
+ // Otherwise, delay parsing until next feed() or end()
177
204
 
178
205
  return this;
179
206
  }
@@ -201,6 +228,9 @@ export default class XMLParser {
201
228
  throw new ParseError('No data fed. Call feed() before end().', ErrorCode.NOT_STREAMING);
202
229
  }
203
230
 
231
+ // Force a final parse (any pending bytes are now processed)
232
+ this._runParse();
233
+
204
234
  try {
205
235
  // Mark the source as complete so readers know there is no more data.
206
236
  this._feedSource.end();
@@ -5,9 +5,10 @@ import { StopNodeProcessor } from './StopNodeProcessor.js';
5
5
  import { readComment, readCdata, readPiTag } from './XmlSpecialTagsReader.js';
6
6
  import { Expression, ExpressionSet, Matcher } from 'path-expression-matcher';
7
7
  import { readDocType } from './DocTypeReader.js';
8
- import { isName, DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
8
+ import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
9
9
  import AutoCloseHandler from './AutoCloseHandler.js';
10
10
  import { ParseError, ErrorCode } from './ParseError.js';
11
+ import { name as isName, qName as isQName } from 'xml-naming';
11
12
 
12
13
  class TagDetail {
13
14
  /**
@@ -60,6 +61,7 @@ export default class Xml2JsParser {
60
61
  this.tagsStack = [];
61
62
  this._stopNodeProcessor = null;
62
63
  this._exitIfTriggered = false;
64
+ this.xmlVersion = '1.0';
63
65
 
64
66
  if (!this.matcher) {
65
67
  this.matcher = new Matcher();
@@ -283,6 +285,18 @@ export default class Xml2JsParser {
283
285
  this.source.startIndex,
284
286
  );
285
287
 
288
+ // Extract namespace prefix and local name from raw tag name (e.g. "ns:tag" → "ns", "tag").
289
+ // Always done from the raw name (tagExp.tagName), before processTagName strips the prefix,
290
+ // so these values are stable regardless of skip.nsPrefix.
291
+ const colonIdx = tagExp.tagName.indexOf(':');
292
+ const tagNamespace = colonIdx !== -1 ? tagExp.tagName.slice(0, colonIdx) : undefined;
293
+ // Local name for the matcher: prefix-free always (e.g. "code" from "ns:code").
294
+ // The matcher library tracks namespace separately via the 3rd push() argument —
295
+ // passing the full "ns:code" as the tag name would break ns::code expression matching.
296
+ const matcherTagName = tagNamespace !== undefined
297
+ ? tagExp.tagName.slice(colonIdx + 1)
298
+ : processedTagName;
299
+
286
300
  // ── Limit: maxNestedTags ─────────────────────────────────────────────────
287
301
  const maxNested = options.limits?.maxNestedTags;
288
302
  if (maxNested !== undefined && maxNested !== null) {
@@ -304,7 +318,7 @@ export default class Xml2JsParser {
304
318
  raeAttrLen = tagExp.rawAttributesLen;
305
319
  }
306
320
 
307
- this.matcher.push(processedTagName, {});
321
+ this.matcher.push(matcherTagName, {}, tagNamespace);
308
322
  if (raeAttrLen > 0) {
309
323
  this.matcher.updateCurrent(rawAttributes);
310
324
  }
@@ -334,7 +348,10 @@ export default class Xml2JsParser {
334
348
  this.matcher.pop();
335
349
  } else if (stopNodeConfig) {
336
350
  // Create a fresh processor with the matching nested + skipEnclosures config.
337
- this._stopNodeProcessor = new StopNodeProcessor(processedTagName, {
351
+ // Raw tag name (tagExp.tagName) is used — the processor scans the source
352
+ // character-by-character and must match the prefix-as-written (e.g. "ns:code"),
353
+ // independent of what skip.nsPrefix does to the processed output name.
354
+ this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, {
338
355
  nested: stopNodeConfig.nested,
339
356
  skipEnclosures: stopNodeConfig.skipEnclosures,
340
357
  });
@@ -351,7 +368,8 @@ export default class Xml2JsParser {
351
368
  } else if (skipTagConfig) {
352
369
  // Skip tag: collect raw content (to advance the source past the closing tag)
353
370
  // but call no output builder methods — the tag is silently dropped.
354
- this._stopNodeProcessor = new StopNodeProcessor(processedTagName, {
371
+ // Raw tag name used for the same reason as the stop-node branch above.
372
+ this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, {
355
373
  nested: skipTagConfig.nested,
356
374
  skipEnclosures: skipTagConfig.skipEnclosures,
357
375
  });
@@ -460,7 +478,7 @@ export default class Xml2JsParser {
460
478
  processAttrName(attrName) {
461
479
  const options = this.options;
462
480
  attrName = resolveNsPrefix(attrName, options.skip.nsPrefix);
463
- if (!isName(attrName)) { //TODO: make it optional
481
+ if (!isQName(attrName, this.xmlVersion)) { //TODO: make it optional
464
482
  throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME);
465
483
  }
466
484
  attrName = sanitizeName(attrName, options.onDangerousProperty);
@@ -1,7 +1,8 @@
1
1
  'use strict';
2
2
  import { ParseError, ErrorCode } from './ParseError.js';
3
3
  import { collectRawAttributes } from './AttributeProcessor.js';
4
- import { isName } from "./util.js"
4
+ import { isSpace } from "./util.js"
5
+ import { name as isName, qName as isQName } from 'xml-naming';
5
6
  // Re-export flushAttributes so Xml2JsParser and XmlSpecialTagsReader can
6
7
  // continue to import it from here without changing their import lines.
7
8
  export { flushAttributes } from './AttributeProcessor.js';
@@ -157,19 +158,20 @@ function buildTagExpObj(exp, parser) {
157
158
  let attrsExp = "";
158
159
  let i = 0;
159
160
 
160
- for (; i < expLen; i++) {
161
- if (exp[i] === " ") {
161
+ for (; i < exp.length; i++) {
162
+ const c = exp[i];
163
+ if (isSpace(c)) {
162
164
  tagExp.tagName = exp.substring(0, i);
163
165
  attrsExp = exp.substring(i + 1);
164
166
  break;
165
167
  }
166
168
  }
167
169
  //only tag
168
- if (tagExp.tagName.length === 0 && i === expLen) tagExp.tagName = exp;
170
+ if (tagExp.tagName.length === 0 && i === exp.length) tagExp.tagName = exp;
169
171
  tagExp.tagName = tagExp.tagName.trimEnd();
170
172
  tagExp._attrsExp = attrsExp;
171
173
 
172
- if (!isName(tagExp.tagName)) {
174
+ if (!isQName(tagExp.tagName, parser.xmlVersion)) {
173
175
  throw new ParseError("Invalid tag name", ErrorCode.INVALID_TAG_NAME);
174
176
  }
175
177
 
@@ -178,6 +180,7 @@ function buildTagExpObj(exp, parser) {
178
180
  if (!parser.options.skip.attributes && attrsExp.length > 0) {
179
181
  collectRawAttributes(attrsExp, parser, tagExp);
180
182
  }
181
-
183
+ // console.log(tagExp)
182
184
  return tagExp;
183
- }
185
+ }
186
+
@@ -36,11 +36,21 @@ export function readPiTag(parser) {
36
36
  parser.source.markTokenStart(1);
37
37
  //<? already consumed
38
38
  let tagExp = readPiExp(parser, "?>");
39
- if (!tagExp) throw new ParseError(
40
- "Invalid Pi Tag expression.",
41
- ErrorCode.INVALID_TAG,
42
- { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
43
- );
39
+ if (!tagExp) {
40
+ throw new ParseError(
41
+ "Invalid Pi Tag expression.",
42
+ ErrorCode.INVALID_TAG,
43
+ { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
44
+ )
45
+ } else if (tagExp.tagName === "xml") {
46
+ // Read version from the declaration and store it on the parser for validators.
47
+ const version = tagExp.rawAttributes?.version;
48
+ if (version === '1.1') {
49
+ parser.xmlVersion = 1.1;
50
+ } else {
51
+ parser.xmlVersion = 1.0; // default
52
+ }
53
+ }
44
54
 
45
55
  // Flush attributes into the output builder's this.attributes accumulator
46
56
  // so addDeclaration() / addInstruction() pick them up, mirroring what readOpeningTag
package/src/util.js CHANGED
@@ -1,10 +1,3 @@
1
- 'use strict';
2
-
3
- const nameStartChar = ':A-Za-z_\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD';
4
- const nameChar = nameStartChar + '\\-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040';
5
- export const nameRegexp = '[' + nameStartChar + '][' + nameChar + ']*';
6
- const regexName = new RegExp('^' + nameRegexp + '$');
7
-
8
1
  export function getAllMatches(string, regex) {
9
2
  const matches = [];
10
3
  let match = regex.exec(string);
@@ -21,9 +14,15 @@ export function getAllMatches(string, regex) {
21
14
  return matches;
22
15
  }
23
16
 
24
- export const isName = function (string) {
25
- const match = regexName.exec(string);
26
- return !(match === null || typeof match === 'undefined');
17
+
18
+
19
+ export function isSpace(char) {
20
+ return char === " " || char === "\t" || char === "\n" || char === "\r" || char === "\f";
21
+ }
22
+
23
+
24
+ export function isSpaceCode(code) {
25
+ return code === 32 || code === 9 || code === 10 || code === 13 || code === 12; // space \t \n \r \f
27
26
  }
28
27
 
29
28
  export function isExist(v) {