@nodable/flexible-xml-parser 1.6.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/package.json +2 -2
- package/src/AttributeProcessor.js +50 -35
- package/src/DocTypeReader.js +6 -6
- package/src/InputSource/BufferSource.js +28 -14
- package/src/InputSource/FeedableSource.js +93 -4
- package/src/InputSource/StreamSource.js +5 -1
- package/src/InputSource/StringSource.js +35 -2
- package/src/XMLParser.js +9 -13
- package/src/Xml2JsParser.js +43 -8
- package/src/XmlPartReader.js +15 -27
- package/src/XmlSpecialTagsReader.js +12 -1
- package/src/CharsSymbol.js +0 -16
package/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
1
|
|
|
2
|
+
**1.7.0 (2026-07-03)**
|
|
3
|
+
- perf: upgrade to xml-naming v0.3.0 to support caching
|
|
4
|
+
- perf: parse attributes only once
|
|
5
|
+
- perf: quote aware scan: `scanTagExpEnd` to all input sources
|
|
6
|
+
- perf: call tag reading methods frequency wise
|
|
7
|
+
- perf: autoflush
|
|
8
|
+
- fix(#5): StreamSource and FeedableSource don't respect multi-byte characters
|
|
9
|
+
|
|
10
|
+
|
|
2
11
|
**1.6.1 (2026-06-30)**
|
|
3
12
|
- Pass xml declaration attributes to output builder irrespection of parser options.
|
|
4
13
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@nodable/flexible-xml-parser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Fastest and fully customizable XML parser in pure JS with fully customizable ouput",
|
|
5
5
|
"main": "./lib/fxp.cjs",
|
|
6
6
|
"type": "module",
|
|
@@ -51,7 +51,7 @@
|
|
|
51
51
|
"@nodable/base-output-builder": "^2.0.0",
|
|
52
52
|
"@nodable/compact-builder": "^2.0.0",
|
|
53
53
|
"path-expression-matcher": "^1.6.1",
|
|
54
|
-
"xml-naming": "^0.
|
|
54
|
+
"xml-naming": "^0.3.0"
|
|
55
55
|
},
|
|
56
56
|
"devDependencies": {
|
|
57
57
|
"@babel/core": "^7.29.0",
|
|
@@ -30,9 +30,13 @@ import { isSpaceCode } from "./util.js"
|
|
|
30
30
|
/**
|
|
31
31
|
* Parse an attribute expression string into an array of match tuples.
|
|
32
32
|
*
|
|
33
|
-
* Each element
|
|
34
|
-
*
|
|
35
|
-
*
|
|
33
|
+
* Each element is `{ name, value, startIndex }` — `value` is `undefined` for
|
|
34
|
+
* a boolean attribute (no `=`). Earlier versions of this function also built
|
|
35
|
+
* a full-match string and an `'=value'` string per attribute (matching an
|
|
36
|
+
* old regex-based getAllMatches() return shape) — neither was ever read by
|
|
37
|
+
* collectRawAttributes()/flushAttributes() (only `name`, `value`, and
|
|
38
|
+
* `.startIndex` are), so building them was pure wasted string concatenation
|
|
39
|
+
* on every attribute, on every tag. Dropped.
|
|
36
40
|
*
|
|
37
41
|
* The implementation is a single O(n) pass over char codes with no regex and
|
|
38
42
|
* no recursion, making it safe for arbitrarily long attribute strings.
|
|
@@ -44,7 +48,7 @@ import { isSpaceCode } from "./util.js"
|
|
|
44
48
|
* IN_VALUE — inside a quoted value, accumulating until the closing quote
|
|
45
49
|
*
|
|
46
50
|
* @param {string} attrStr
|
|
47
|
-
* @returns {Array
|
|
51
|
+
* @returns {Array<{name: string, value: string|undefined, startIndex: number}>}
|
|
48
52
|
*/
|
|
49
53
|
function parseAttributes(attrStr) {
|
|
50
54
|
const results = [];
|
|
@@ -66,9 +70,7 @@ function parseAttributes(attrStr) {
|
|
|
66
70
|
|
|
67
71
|
if (i >= len || attrStr.charCodeAt(i) !== 61) {
|
|
68
72
|
// Boolean attribute — no '='
|
|
69
|
-
|
|
70
|
-
m.startIndex = nameStart;
|
|
71
|
-
results.push(m);
|
|
73
|
+
results.push({ name, value: undefined, startIndex: nameStart });
|
|
72
74
|
continue;
|
|
73
75
|
}
|
|
74
76
|
|
|
@@ -81,7 +83,6 @@ function parseAttributes(attrStr) {
|
|
|
81
83
|
const quote = attrStr.charCodeAt(i);
|
|
82
84
|
if (quote === 34 || quote === 39) { // " or '
|
|
83
85
|
i++; // skip opening quote
|
|
84
|
-
const valueStart = i;
|
|
85
86
|
let value = '';
|
|
86
87
|
let segStart = i;
|
|
87
88
|
while (i < len && attrStr.charCodeAt(i) !== quote) {
|
|
@@ -94,10 +95,7 @@ function parseAttributes(attrStr) {
|
|
|
94
95
|
}
|
|
95
96
|
value += attrStr.substring(segStart, i);
|
|
96
97
|
i++; // skip closing quote
|
|
97
|
-
|
|
98
|
-
const m = [name + '=' + quoteChar + value + quoteChar, name, '=' + quoteChar + value + quoteChar, quoteChar, value];
|
|
99
|
-
m.startIndex = nameStart;
|
|
100
|
-
results.push(m);
|
|
98
|
+
results.push({ name, value, startIndex: nameStart });
|
|
101
99
|
}
|
|
102
100
|
}
|
|
103
101
|
|
|
@@ -105,7 +103,20 @@ function parseAttributes(attrStr) {
|
|
|
105
103
|
}
|
|
106
104
|
|
|
107
105
|
/**
|
|
108
|
-
* Pass 1: extract raw (unparsed) attribute values into rawAttributes
|
|
106
|
+
* Pass 1: extract raw (unparsed) attribute values into rawAttributes, AND
|
|
107
|
+
* build tagExp._parsedAttrs — the processed-name/value list pass 2 will
|
|
108
|
+
* consume directly.
|
|
109
|
+
*
|
|
110
|
+
* Previously, pass 2 (flushAttributes) re-ran parseAttributes() from scratch
|
|
111
|
+
* on the same attrStr, and re-ran parser.processAttrName() (ns-prefix
|
|
112
|
+
* resolution + name validation + sanitizeName + reserved-name check) on
|
|
113
|
+
* every attribute a second time — full re-tokenization plus full re-validation
|
|
114
|
+
* of work already done here. processAttrName() is a pure function of
|
|
115
|
+
* (rawName, options) — nothing between pass 1 and pass 2 (matcher.push,
|
|
116
|
+
* stop/skip resolution) can change its result — so it's safe to compute once
|
|
117
|
+
* and cache. The matcher still gets the *raw* (pre-resolveNsPrefix/sanitize)
|
|
118
|
+
* name as its rawAttributes key, unchanged, since PEM's attribute-condition
|
|
119
|
+
* matching (`div[class=code]`) matches against attribute names as written.
|
|
109
120
|
*
|
|
110
121
|
* @param {string} attrStr - raw attribute expression substring
|
|
111
122
|
* @param {object} parser - Xml2JsParser instance (for processAttrName)
|
|
@@ -116,56 +127,60 @@ export function collectRawAttributes(attrStr, parser, tagExp) {
|
|
|
116
127
|
|
|
117
128
|
const matches = parseAttributes(attrStr);
|
|
118
129
|
const len = matches.length;
|
|
130
|
+
tagExp._rawAttrMatchCount = len; // total parsed attrs, incl. dropped (xmlns:) ones — for maxAttributesPerTag parity with old behavior
|
|
131
|
+
const parsedAttrs = [];
|
|
119
132
|
let count = 0;
|
|
120
133
|
for (let i = 0; i < len; i++) {
|
|
121
|
-
const
|
|
134
|
+
const m = matches[i];
|
|
135
|
+
const attrName = parser.processAttrName(m.name);
|
|
122
136
|
if (attrName === false) continue;
|
|
123
137
|
count++;
|
|
124
|
-
const rawVal =
|
|
125
|
-
|
|
138
|
+
const rawVal = m.value;
|
|
139
|
+
const attrVal = rawVal !== undefined ? rawVal : true;
|
|
140
|
+
tagExp.rawAttributes[m.name] = attrVal;
|
|
141
|
+
parsedAttrs.push({ name: attrName, value: attrVal, index: m.startIndex });
|
|
126
142
|
}
|
|
127
143
|
tagExp.rawAttributesLen = count;
|
|
144
|
+
tagExp._parsedAttrs = parsedAttrs;
|
|
128
145
|
}
|
|
129
146
|
|
|
130
147
|
/**
|
|
131
|
-
* Pass 2:
|
|
148
|
+
* Pass 2: push each attribute (already parsed + name-processed by pass 1,
|
|
149
|
+
* see tagExp._parsedAttrs) to the output builder. No re-parsing, no
|
|
150
|
+
* re-running processAttrName — this is now a plain loop over cached data.
|
|
132
151
|
*
|
|
133
|
-
* @param {string}
|
|
152
|
+
* @param {Array<{name: string, value: *, index: number}>} parsedAttrs - tagExp._parsedAttrs from collectRawAttributes
|
|
134
153
|
* @param {object} parser - Xml2JsParser instance
|
|
135
|
-
* @param {number} [attrsExpStart] - absolute document offset where
|
|
136
|
-
*
|
|
154
|
+
* @param {number} [attrsExpStart] - absolute document offset where the
|
|
155
|
+
* attribute expression began (tagExp._attrsExpStart). When provided, each
|
|
137
156
|
* attribute's absolute document index is computed and passed to
|
|
138
157
|
* addAttribute() as a 4th argument: { index }. Line/col are intentionally
|
|
139
158
|
* NOT computed here — doing so would require re-scanning attrStr for
|
|
140
159
|
* newlines on every call, for a field most builders won't use; callers
|
|
141
160
|
* that need it can derive line/col from `index` plus the document text.
|
|
161
|
+
* @param {number} rawAttrMatchCount - tagExp._rawAttrMatchCount, used for the
|
|
162
|
+
* maxAttributesPerTag limit check (counts all parsed attrs, including any
|
|
163
|
+
* dropped by processAttrName, matching the limit's pre-existing semantics).
|
|
142
164
|
*/
|
|
143
|
-
export function flushAttributes(
|
|
144
|
-
if (!
|
|
145
|
-
const matches = parseAttributes(attrStr);
|
|
146
|
-
const len = matches.length;
|
|
165
|
+
export function flushAttributes(parsedAttrs, parser, attrsExpStart, rawAttrMatchCount) {
|
|
166
|
+
if (!parsedAttrs || parsedAttrs.length === 0) return;
|
|
147
167
|
|
|
148
168
|
const maxAttrs = parser.options.limits?.maxAttributesPerTag;
|
|
149
|
-
if (maxAttrs !== undefined && maxAttrs !== null &&
|
|
169
|
+
if (maxAttrs !== undefined && maxAttrs !== null && rawAttrMatchCount > maxAttrs) {
|
|
150
170
|
const tagName = parser.currentTagDetail?.name ?? '(unknown)';
|
|
151
171
|
throw new ParseError(
|
|
152
|
-
`Tag '${tagName}' has ${
|
|
172
|
+
`Tag '${tagName}' has ${rawAttrMatchCount} attributes, exceeding limit of ${maxAttrs}`,
|
|
153
173
|
ErrorCode.LIMIT_MAX_ATTRIBUTES,
|
|
154
174
|
{ line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex }
|
|
155
175
|
);
|
|
156
176
|
}
|
|
157
177
|
|
|
178
|
+
const len = parsedAttrs.length;
|
|
158
179
|
for (let i = 0; i < len; i++) {
|
|
159
|
-
const
|
|
160
|
-
if (attrName === false) continue;
|
|
161
|
-
|
|
162
|
-
const rawVal = matches[i][4];
|
|
163
|
-
const attrVal = rawVal !== undefined ? rawVal : true;
|
|
164
|
-
|
|
180
|
+
const a = parsedAttrs[i];
|
|
165
181
|
const attrMeta = attrsExpStart !== undefined
|
|
166
|
-
? { index: attrsExpStart +
|
|
182
|
+
? { index: attrsExpStart + a.index }
|
|
167
183
|
: undefined;
|
|
168
|
-
|
|
169
|
-
parser.outputBuilder.addAttribute(attrName, attrVal, parser.readonlyMatcher, attrMeta);
|
|
184
|
+
parser.outputBuilder.addAttribute(a.name, a.value, parser.readonlyMatcher, attrMeta);
|
|
170
185
|
}
|
|
171
186
|
}
|
package/src/DocTypeReader.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { ParseError, ErrorCode } from './ParseError.js';
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
|
|
4
4
|
export function readDocType(parser) {
|
|
5
5
|
parser.source.markTokenStart(1);
|
|
@@ -267,7 +267,7 @@ function readEntityExp(parser) {
|
|
|
267
267
|
{ line: source.line, col: source.cols, index: source.startIndex });
|
|
268
268
|
}
|
|
269
269
|
|
|
270
|
-
validateEntityName(entityName, parser
|
|
270
|
+
validateEntityName(entityName, parser);
|
|
271
271
|
skipSourceWhitespace(source);
|
|
272
272
|
|
|
273
273
|
if (!source.canRead()) {
|
|
@@ -346,7 +346,7 @@ function readElementExp(parser) {
|
|
|
346
346
|
{ line: source.line, col: source.cols, index: source.startIndex });
|
|
347
347
|
}
|
|
348
348
|
|
|
349
|
-
if (!
|
|
349
|
+
if (!parser.getNameValidator('name')(elementName)) {
|
|
350
350
|
throw new ParseError(`Invalid element name: "${elementName}"`,
|
|
351
351
|
ErrorCode.INVALID_TAG,
|
|
352
352
|
{ line: source.line, col: source.cols, index: source.startIndex });
|
|
@@ -434,7 +434,7 @@ function readNotationExp(parser) {
|
|
|
434
434
|
{ line: source.line, col: source.cols, index: source.startIndex });
|
|
435
435
|
}
|
|
436
436
|
|
|
437
|
-
validateEntityName(notationName, parser
|
|
437
|
+
validateEntityName(notationName, parser);
|
|
438
438
|
skipSourceWhitespace(source);
|
|
439
439
|
|
|
440
440
|
// Need all 6 chars of "SYSTEM" / "PUBLIC" before we can classify
|
|
@@ -512,8 +512,8 @@ function skipSourceWhitespace(source) {
|
|
|
512
512
|
}
|
|
513
513
|
}
|
|
514
514
|
|
|
515
|
-
function validateEntityName(name,
|
|
516
|
-
if (
|
|
515
|
+
function validateEntityName(name, parser) {
|
|
516
|
+
if (parser.getNameValidator('name')(name)) return name;
|
|
517
517
|
throw new ParseError(
|
|
518
518
|
`Invalid entity name "${name}"`,
|
|
519
519
|
ErrorCode.ENTITY_INVALID_KEY,
|
|
@@ -114,6 +114,31 @@ export default class BufferSource {
|
|
|
114
114
|
return this.buffer.slice(from, from + n).toString();
|
|
115
115
|
}
|
|
116
116
|
|
|
117
|
+
/**
|
|
118
|
+
* See StringSource.scanTagExpEnd() for full rationale. Byte-indexed —
|
|
119
|
+
* quote/`>` are single-byte ASCII, safe for multi-byte UTF-8 content too
|
|
120
|
+
* (a `>` byte never appears as a UTF-8 continuation byte). Buffer isn't a
|
|
121
|
+
* rope, so no equivalent of FeedableSource's charCodeAt/flatten concern.
|
|
122
|
+
*/
|
|
123
|
+
scanTagExpEnd() {
|
|
124
|
+
const buf = this.buffer;
|
|
125
|
+
const len = buf.length;
|
|
126
|
+
const start = this.startIndex;
|
|
127
|
+
let inSingle = false;
|
|
128
|
+
let inDouble = false;
|
|
129
|
+
for (let i = start; i < len; i++) {
|
|
130
|
+
const c = buf[i];
|
|
131
|
+
if (c === 39) { // '
|
|
132
|
+
if (!inDouble) inSingle = !inSingle;
|
|
133
|
+
} else if (c === 34) { // "
|
|
134
|
+
if (!inSingle) inDouble = !inDouble;
|
|
135
|
+
} else if (c === 62 && !inSingle && !inDouble) { // >
|
|
136
|
+
return i - start;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
return -1;
|
|
140
|
+
}
|
|
141
|
+
|
|
117
142
|
/**
|
|
118
143
|
* Scan buffer[this.startIndex, end) for byte code 10 ('\n') and advance
|
|
119
144
|
* line/cols to match, mirroring readCh()'s per-byte logic. Does NOT touch
|
|
@@ -224,20 +249,9 @@ export default class BufferSource {
|
|
|
224
249
|
}
|
|
225
250
|
|
|
226
251
|
readFromBuffer(n, shouldUpdate) {
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
if (ch === 10) { // '\n'
|
|
231
|
-
this.line++;
|
|
232
|
-
this.cols = 1;
|
|
233
|
-
} else {
|
|
234
|
-
this.cols++;
|
|
235
|
-
}
|
|
236
|
-
ch = String.fromCharCode(ch);
|
|
237
|
-
} else {
|
|
238
|
-
this.cols += n;
|
|
239
|
-
ch = this.buffer.slice(this.startIndex, this.startIndex + n).toString();
|
|
240
|
-
}
|
|
252
|
+
const ch = n === 1
|
|
253
|
+
? String.fromCharCode(this.buffer[this.startIndex])
|
|
254
|
+
: this.buffer.slice(this.startIndex, this.startIndex + n).toString();
|
|
241
255
|
if (shouldUpdate) this.updateBufferBoundary(n);
|
|
242
256
|
return ch;
|
|
243
257
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { ParseError, ErrorCode } from '../ParseError.js';
|
|
2
|
+
import { StringDecoder } from 'node:string_decoder';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* FeedableSource — input source for the feed()/end() API.
|
|
@@ -78,6 +79,19 @@ export default class FeedableSource {
|
|
|
78
79
|
* the next feed() double-counts every '\n' it consumed before failing.
|
|
79
80
|
*/
|
|
80
81
|
this._marks = [null, null];
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Lazily-created, persistent across the whole feed() session. Buffer
|
|
85
|
+
* chunks must go through this rather than Buffer#toString() per chunk —
|
|
86
|
+
* toString() decodes each chunk in isolation, so a multi-byte UTF-8
|
|
87
|
+
* character whose bytes straddle a chunk boundary gets corrupted (each
|
|
88
|
+
* half independently replaced with U+FFFD). StringDecoder holds back an
|
|
89
|
+
* incomplete trailing sequence internally and prepends it to the next
|
|
90
|
+
* write(), so a split character decodes correctly once the rest of its
|
|
91
|
+
* bytes arrive. Only created if Buffer input is ever fed — string-only
|
|
92
|
+
* callers never pay for it.
|
|
93
|
+
*/
|
|
94
|
+
this._decoder = null;
|
|
81
95
|
}
|
|
82
96
|
|
|
83
97
|
/**
|
|
@@ -89,9 +103,28 @@ export default class FeedableSource {
|
|
|
89
103
|
* the limit.
|
|
90
104
|
*
|
|
91
105
|
* @param {string|Buffer} data
|
|
106
|
+
* @returns {number} number of characters appended to the buffer (after
|
|
107
|
+
* decoding) — callers that track fed-byte totals (e.g. XMLParser.feed's
|
|
108
|
+
* batch threshold) should use this rather than the raw input length,
|
|
109
|
+
* since a Buffer chunk ending mid-character may decode to fewer chars
|
|
110
|
+
* than its byte length until the next chunk completes the sequence.
|
|
92
111
|
*/
|
|
93
112
|
feed(data) {
|
|
94
|
-
|
|
113
|
+
let newData;
|
|
114
|
+
if (typeof data === 'string') {
|
|
115
|
+
newData = data;
|
|
116
|
+
} else if (Buffer.isBuffer(data)) {
|
|
117
|
+
// Stateful decode: bytes of a multi-byte char split across two feed()
|
|
118
|
+
// calls are buffered internally by StringDecoder and correctly
|
|
119
|
+
// stitched together, instead of each chunk being decoded in isolation.
|
|
120
|
+
if (!this._decoder) this._decoder = new StringDecoder('utf8');
|
|
121
|
+
newData = this._decoder.write(data);
|
|
122
|
+
} else if (data?.toString) {
|
|
123
|
+
newData = data.toString();
|
|
124
|
+
} else {
|
|
125
|
+
throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
|
|
126
|
+
}
|
|
127
|
+
|
|
95
128
|
const liveBytes = this.buffer.length - this.startIndex;
|
|
96
129
|
|
|
97
130
|
if (liveBytes + newData.length > this.maxBufferSize) {
|
|
@@ -103,10 +136,20 @@ export default class FeedableSource {
|
|
|
103
136
|
}
|
|
104
137
|
|
|
105
138
|
this.buffer += newData;
|
|
139
|
+
return newData.length;
|
|
106
140
|
}
|
|
107
141
|
|
|
108
142
|
/** Signal that no more data will be fed. */
|
|
109
143
|
end() {
|
|
144
|
+
if (this._decoder) {
|
|
145
|
+
// Flush any final incomplete byte sequence held by the decoder. For
|
|
146
|
+
// well-formed UTF-8 input this is normally '' (nothing pending); a
|
|
147
|
+
// non-empty result here means the input was genuinely truncated
|
|
148
|
+
// mid-character, and StringDecoder's own U+FFFD substitution is the
|
|
149
|
+
// correct, standard behavior for that case.
|
|
150
|
+
const tail = this._decoder.end();
|
|
151
|
+
if (tail) this.buffer += tail;
|
|
152
|
+
}
|
|
110
153
|
this.isComplete = true;
|
|
111
154
|
}
|
|
112
155
|
|
|
@@ -214,6 +257,43 @@ export default class FeedableSource {
|
|
|
214
257
|
return this.buffer.substring(from, from + n);
|
|
215
258
|
}
|
|
216
259
|
|
|
260
|
+
/**
|
|
261
|
+
* Quote-aware scan, from the current read position, for the unquoted '>'
|
|
262
|
+
* that ends a tag expression. Used by readTagExp() — replaces the old
|
|
263
|
+
* per-char canRead(i)/readChAt(i) loop, which profiling showed as the
|
|
264
|
+
* single largest hotspot (~23-26% of parse time).
|
|
265
|
+
*
|
|
266
|
+
* IMPORTANT: bracket char access (`buf[i]`), not `charCodeAt(i)`. This
|
|
267
|
+
* source's buffer is built via repeated `+=` in feed() (a growing V8
|
|
268
|
+
* ConsString/rope). charCodeAt forces a full rope-flatten on access —
|
|
269
|
+
* confirmed via a crash (Runtime_StringCharCodeAt -> String::SlowFlatten)
|
|
270
|
+
* causing real O(n^2) memory growth when this was first written with
|
|
271
|
+
* charCodeAt. Bracket access matches what the pre-existing readChAt()
|
|
272
|
+
* already safely used.
|
|
273
|
+
*
|
|
274
|
+
* @returns {number} relative offset of the unquoted '>', or -1 if the
|
|
275
|
+
* buffer runs out first — caller treats that as UNEXPECTED_END, the
|
|
276
|
+
* normal retryable chunk-boundary signal for this source.
|
|
277
|
+
*/
|
|
278
|
+
scanTagExpEnd() {
|
|
279
|
+
const buf = this.buffer;
|
|
280
|
+
const len = buf.length;
|
|
281
|
+
const start = this.startIndex;
|
|
282
|
+
let inSingle = false;
|
|
283
|
+
let inDouble = false;
|
|
284
|
+
for (let i = start; i < len; i++) {
|
|
285
|
+
const c = buf[i];
|
|
286
|
+
if (c === "'") {
|
|
287
|
+
if (!inDouble) inSingle = !inSingle;
|
|
288
|
+
} else if (c === '"') {
|
|
289
|
+
if (!inSingle) inDouble = !inDouble;
|
|
290
|
+
} else if (c === '>' && !inSingle && !inDouble) {
|
|
291
|
+
return i - start;
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
return -1;
|
|
295
|
+
}
|
|
296
|
+
|
|
217
297
|
/**
|
|
218
298
|
* Read until stop string is found.
|
|
219
299
|
* @param {string} stopStr
|
|
@@ -341,8 +421,14 @@ export default class FeedableSource {
|
|
|
341
421
|
const end = this.startIndex + n;
|
|
342
422
|
this._advanceLineCol(end);
|
|
343
423
|
this.startIndex = end;
|
|
344
|
-
|
|
345
|
-
|
|
424
|
+
// No "any mark active" gate here — flush()'s own min(startIndex, marks...)
|
|
425
|
+
// origin computation already guarantees any in-progress token (at either
|
|
426
|
+
// mark level) survives the trim. A separate boolean gate on top of that
|
|
427
|
+
// was redundant, and since _marks[0] is set on every parseXml() loop
|
|
428
|
+
// iteration and never nulled outside of rewindToMark() (an error path),
|
|
429
|
+
// that gate was effectively permanent — flush() never ran in normal
|
|
430
|
+
// operation. See specs/flushArchitecture_spec.js for the regression test.
|
|
431
|
+
if (this.autoFlush && this.startIndex >= this.flushThreshold) {
|
|
346
432
|
this.flush();
|
|
347
433
|
}
|
|
348
434
|
}
|
|
@@ -353,7 +439,10 @@ export default class FeedableSource {
|
|
|
353
439
|
*
|
|
354
440
|
* The flush origin is the minimum of all active mark positions, so that any
|
|
355
441
|
* in-progress token (at either mark level) is preserved in the buffer and
|
|
356
|
-
* can be re-read after the flush.
|
|
442
|
+
* can be re-read after the flush. This is the sole safety mechanism for
|
|
443
|
+
* flush() — callers do not need to additionally check "is a mark active"
|
|
444
|
+
* before calling this; an active mark simply caps how much origin can
|
|
445
|
+
* advance, rather than blocking the call outright.
|
|
357
446
|
*
|
|
358
447
|
* If no marks are active, the origin is startIndex itself — everything
|
|
359
448
|
* before the current read position is discarded.
|
|
@@ -28,7 +28,11 @@ export default class StreamSource extends FeedableSource {
|
|
|
28
28
|
attachStream(readable, onChunk, onEnd, onError) {
|
|
29
29
|
readable.on('data', chunk => {
|
|
30
30
|
try {
|
|
31
|
-
|
|
31
|
+
// Pass the raw chunk (Buffer or string) straight through — feed()
|
|
32
|
+
// decodes Buffers via a persistent StringDecoder so a multi-byte
|
|
33
|
+
// UTF-8 character split across two chunks decodes correctly instead
|
|
34
|
+
// of each half being independently mangled by a per-chunk toString().
|
|
35
|
+
this.feed(chunk);
|
|
32
36
|
onChunk(null); // chunk appended successfully — caller runs parseXml()
|
|
33
37
|
} catch (err) {
|
|
34
38
|
onChunk(err); // buffer overflow or coercion failure
|
|
@@ -127,6 +127,35 @@ export default class StringSource {
|
|
|
127
127
|
return this.buffer.substring(from, from + n);
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
+
/**
|
|
131
|
+
* Quote-aware scan, from the current read position, for the unquoted '>'
|
|
132
|
+
* that ends a tag expression (`<tag attr="...">`). Used by readTagExp().
|
|
133
|
+
* Direct-buffer, bracket-indexed (not charCodeAt — see FeedableSource's
|
|
134
|
+
* copy of this method for why that matters there; kept identical here
|
|
135
|
+
* for consistency even though StringSource's buffer is never re-concatenated).
|
|
136
|
+
*
|
|
137
|
+
* @returns {number} relative offset (from startIndex) of the unquoted '>',
|
|
138
|
+
* or -1 if the buffer is exhausted first (malformed input for StringSource).
|
|
139
|
+
*/
|
|
140
|
+
scanTagExpEnd() {
|
|
141
|
+
const buf = this.buffer;
|
|
142
|
+
const len = buf.length;
|
|
143
|
+
const start = this.startIndex;
|
|
144
|
+
let inSingle = false;
|
|
145
|
+
let inDouble = false;
|
|
146
|
+
for (let i = start; i < len; i++) {
|
|
147
|
+
const c = buf[i];
|
|
148
|
+
if (c === "'") {
|
|
149
|
+
if (!inDouble) inSingle = !inSingle;
|
|
150
|
+
} else if (c === '"') {
|
|
151
|
+
if (!inSingle) inDouble = !inDouble;
|
|
152
|
+
} else if (c === '>' && !inSingle && !inDouble) {
|
|
153
|
+
return i - start;
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
return -1;
|
|
157
|
+
}
|
|
158
|
+
|
|
130
159
|
/**
|
|
131
160
|
* Scan buffer[this.startIndex, end) for '\n' and advance line/cols to match,
|
|
132
161
|
* mirroring readCh()'s per-char logic. Does NOT touch startIndex — callers
|
|
@@ -251,8 +280,12 @@ export default class StringSource {
|
|
|
251
280
|
const end = this.startIndex + n;
|
|
252
281
|
this._advanceLineCol(end);
|
|
253
282
|
this.startIndex = end;
|
|
254
|
-
|
|
255
|
-
|
|
283
|
+
// See FeedableSource.updateBufferBoundary() for why there is no "any mark
|
|
284
|
+
// active" gate here — flush()'s own min-origin computation already
|
|
285
|
+
// protects any in-progress token; a separate gate was redundant and, since
|
|
286
|
+
// marks are effectively always set in normal operation, made flush()
|
|
287
|
+
// permanently unreachable. See specs/flushArchitecture_spec.js.
|
|
288
|
+
if (this.autoFlush && this.startIndex >= this.flushThreshold) {
|
|
256
289
|
this.flush();
|
|
257
290
|
}
|
|
258
291
|
}
|
package/src/XMLParser.js
CHANGED
|
@@ -183,19 +183,15 @@ export default class XMLParser {
|
|
|
183
183
|
this._initFeedSession();
|
|
184
184
|
}
|
|
185
185
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
this._feedSource.feed(str);
|
|
198
|
-
this._pendingBytes += str.length;
|
|
186
|
+
// Pass raw data straight through — do NOT pre-convert Buffers to string
|
|
187
|
+
// here. FeedableSource.feed() decodes Buffers via a persistent
|
|
188
|
+
// StringDecoder so a multi-byte UTF-8 character split across two feed()
|
|
189
|
+
// calls decodes correctly; converting each chunk with .toString() first
|
|
190
|
+
// (as this used to do) decodes each chunk in isolation and corrupts a
|
|
191
|
+
// split character. feed() itself validates the type and throws
|
|
192
|
+
// DATA_MUST_BE_STRING for anything unsupported.
|
|
193
|
+
const appendedLength = this._feedSource.feed(data);
|
|
194
|
+
this._pendingBytes += appendedLength;
|
|
199
195
|
|
|
200
196
|
if (this._pendingBytes >= this._batchThreshold) {
|
|
201
197
|
this._runParse();
|
package/src/Xml2JsParser.js
CHANGED
|
@@ -8,7 +8,7 @@ import { readDocType } from './DocTypeReader.js';
|
|
|
8
8
|
import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
|
|
9
9
|
import AutoCloseHandler from './AutoCloseHandler.js';
|
|
10
10
|
import { ParseError, ErrorCode } from './ParseError.js';
|
|
11
|
-
import {
|
|
11
|
+
import { createValidator } from 'xml-naming';
|
|
12
12
|
|
|
13
13
|
class TagDetail {
|
|
14
14
|
/**
|
|
@@ -66,6 +66,14 @@ export default class Xml2JsParser {
|
|
|
66
66
|
this.tagsStack = [];
|
|
67
67
|
this._stopNodeProcessor = null;
|
|
68
68
|
this._exitIfTriggered = false;
|
|
69
|
+
// Lazily-built, memoized xml-naming validators (v0.3.0 createValidator).
|
|
70
|
+
// Lazy because xmlDec.version isn't final until the optional <?xml?>
|
|
71
|
+
// declaration (if any) has been read — which happens after this method
|
|
72
|
+
// runs but before any tag name is ever validated. Reset here (once per
|
|
73
|
+
// document/session, see XMLParser._createParser / feed() call sites) so
|
|
74
|
+
// a reused Xml2JsParser instance never validates against a stale
|
|
75
|
+
// xmlVersion or leaks one document's name cache into the next.
|
|
76
|
+
this._nameValidators = Object.create(null);
|
|
69
77
|
this.xmlDec = {
|
|
70
78
|
version: 1.0,
|
|
71
79
|
lang: null,
|
|
@@ -156,13 +164,18 @@ export default class Xml2JsParser {
|
|
|
156
164
|
{ line: this.source.line, col: this.source.cols, index: this.source.startIndex }
|
|
157
165
|
);
|
|
158
166
|
|
|
159
|
-
|
|
167
|
+
//sorted frequency wise
|
|
168
|
+
if (nextChar === '/') {
|
|
169
|
+
this.source.updateBufferBoundary();
|
|
170
|
+
this.readClosingTag(tagStart);
|
|
171
|
+
} else if (nextChar === '!') {
|
|
160
172
|
this.source.updateBufferBoundary();
|
|
161
173
|
this.addTextNode();
|
|
162
174
|
this.readSpecialTag(nextChar);
|
|
163
|
-
} else if (nextChar === '
|
|
175
|
+
} else if (nextChar === '?') {
|
|
164
176
|
this.source.updateBufferBoundary();
|
|
165
|
-
this.
|
|
177
|
+
this.addTextNode();
|
|
178
|
+
readPiTag(this);
|
|
166
179
|
} else {
|
|
167
180
|
this.readOpeningTag(tagStart);
|
|
168
181
|
}
|
|
@@ -365,7 +378,7 @@ export default class Xml2JsParser {
|
|
|
365
378
|
const skipTagConfig = stopNodeConfig ? null : this.isSkipTag();
|
|
366
379
|
|
|
367
380
|
if (!options.skip.attributes && !skipTagConfig) {
|
|
368
|
-
flushAttributes(tagExp.
|
|
381
|
+
flushAttributes(tagExp._parsedAttrs, this, tagExp._attrsExpStart, tagExp._rawAttrMatchCount);
|
|
369
382
|
}
|
|
370
383
|
|
|
371
384
|
// Stop-node and skip-tag checks AFTER attributes are set so attribute conditions work.
|
|
@@ -526,8 +539,6 @@ export default class Xml2JsParser {
|
|
|
526
539
|
this.outputBuilder.addInputEntities(docTypeEntities);
|
|
527
540
|
}
|
|
528
541
|
}
|
|
529
|
-
} else if (startCh === "?") {
|
|
530
|
-
readPiTag(this);
|
|
531
542
|
} else {
|
|
532
543
|
throw new ParseError(`Invalid tag '<${startCh}'`, ErrorCode.INVALID_TAG, { line: this.source.line, col: this.source.cols, index: this.source.startIndex });
|
|
533
544
|
}
|
|
@@ -543,10 +554,34 @@ export default class Xml2JsParser {
|
|
|
543
554
|
}
|
|
544
555
|
}
|
|
545
556
|
|
|
557
|
+
/**
|
|
558
|
+
* Returns a memoized xml-naming validator for the given production
|
|
559
|
+
* ('qName' for tag/attribute names, 'name' for DOCTYPE entity/element/
|
|
560
|
+
* notation names), built lazily on first use and cached per parser
|
|
561
|
+
* instance for the rest of the document/session.
|
|
562
|
+
*
|
|
563
|
+
* xmlDec.version is stored as a number (1.0 / 1.1) but xml-naming's
|
|
564
|
+
* xmlVersion option is the string '1.0'/'1.1' — normalized here rather
|
|
565
|
+
* than changing xmlDec's public shape (it's forwarded as-is to
|
|
566
|
+
* outputBuilder.addDeclaration(), so its type is part of the builder
|
|
567
|
+
* contract, not just an internal detail).
|
|
568
|
+
*
|
|
569
|
+
* @param {'name'|'qName'} production
|
|
570
|
+
*/
|
|
571
|
+
getNameValidator(production) {
|
|
572
|
+
let validator = this._nameValidators[production];
|
|
573
|
+
if (!validator) {
|
|
574
|
+
const xmlVersion = this.xmlDec.version === 1.1 || this.xmlDec.version === '1.1' ? '1.1' : '1.0';
|
|
575
|
+
validator = createValidator(production, { xmlVersion });
|
|
576
|
+
this._nameValidators[production] = validator;
|
|
577
|
+
}
|
|
578
|
+
return validator;
|
|
579
|
+
}
|
|
580
|
+
|
|
546
581
|
processAttrName(attrName) {
|
|
547
582
|
const options = this.options;
|
|
548
583
|
attrName = resolveNsPrefix(attrName, options.skip.nsPrefix);
|
|
549
|
-
if (!
|
|
584
|
+
if (!this.getNameValidator('qName')(attrName)) { //TODO: make it optional
|
|
550
585
|
throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME);
|
|
551
586
|
}
|
|
552
587
|
attrName = sanitizeName(attrName, options.onDangerousProperty);
|
package/src/XmlPartReader.js
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
import { ParseError, ErrorCode } from './ParseError.js';
|
|
3
3
|
import { collectRawAttributes } from './AttributeProcessor.js';
|
|
4
4
|
import { isSpace } from "./util.js"
|
|
5
|
-
import { name as isName, qName as isQName } from 'xml-naming';
|
|
6
5
|
// Re-export flushAttributes so Xml2JsParser and XmlSpecialTagsReader can
|
|
7
6
|
// continue to import it from here without changing their import lines.
|
|
8
7
|
export { flushAttributes } from './AttributeProcessor.js';
|
|
@@ -52,35 +51,24 @@ export function readTagExp(parser) {
|
|
|
52
51
|
// begins — captured before any reads so buildTagExpObj can compute each
|
|
53
52
|
// attribute's absolute document position from its offset within attrsExp.
|
|
54
53
|
const expStart = parser.source.startIndex;
|
|
55
|
-
let inSingleQuotes = false;
|
|
56
|
-
let inDoubleQuotes = false;
|
|
57
|
-
let i;
|
|
58
|
-
let EOE = false;
|
|
59
54
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
if (!EOE) {
|
|
74
|
-
// Buffer exhausted before '>' — chunk boundary mid-tag. Throw UNEXPECTED_END
|
|
75
|
-
// so feed()/parseStream() rewinds to the level-0 outer mark and retries.
|
|
55
|
+
const relEnd = parser.source.scanTagExpEnd();
|
|
56
|
+
|
|
57
|
+
if (relEnd === -1) {
|
|
58
|
+
// Buffer exhausted before an unquoted '>' was found — chunk boundary
|
|
59
|
+
// mid-tag. Throw UNEXPECTED_END so feed()/parseStream() rewinds to the
|
|
60
|
+
// level-0 outer mark and retries. (Note: scanTagExpEnd() only returns a
|
|
61
|
+
// non-negative index once both quote flags are clear (every opened quote closed) —
|
|
62
|
+
// by construction, not by a separate post-scan check — so there is no
|
|
63
|
+
// longer a distinct "unclosed quote but '>' was found" case to detect;
|
|
64
|
+
// the old UNCLOSED_QUOTE branch here was checking the same two flags
|
|
65
|
+
// immediately after the only code path that requires them both false,
|
|
66
|
+
// making it permanently unreachable.)
|
|
76
67
|
throw new ParseError("Unexpected closing of source waiting for '>'", ErrorCode.UNEXPECTED_END);
|
|
77
|
-
} else if (inSingleQuotes || inDoubleQuotes) {
|
|
78
|
-
// '>' found but a quote was never closed — real syntax error.
|
|
79
|
-
throw new ParseError("Invalid attribute expression. Quote is not properly closed", ErrorCode.UNCLOSED_QUOTE);
|
|
80
68
|
}
|
|
81
69
|
|
|
82
|
-
const exp = parser.source.readStr(
|
|
83
|
-
parser.source.updateBufferBoundary(
|
|
70
|
+
const exp = parser.source.readStr(relEnd);
|
|
71
|
+
parser.source.updateBufferBoundary(relEnd + 1);
|
|
84
72
|
return buildTagExpObj(exp, parser, expStart);
|
|
85
73
|
}
|
|
86
74
|
|
|
@@ -183,7 +171,7 @@ function buildTagExpObj(exp, parser, expStart, forceToReadAttrs = false) {
|
|
|
183
171
|
tagExp.tagName = tagExp.tagName.trimEnd();
|
|
184
172
|
tagExp._attrsExp = attrsExp;
|
|
185
173
|
|
|
186
|
-
if (!
|
|
174
|
+
if (!parser.getNameValidator('qName')(tagExp.tagName)) {
|
|
187
175
|
throw new ParseError("Invalid tag name", ErrorCode.INVALID_TAG_NAME);
|
|
188
176
|
}
|
|
189
177
|
|
|
@@ -50,6 +50,17 @@ export function readPiTag(parser) {
|
|
|
50
50
|
}
|
|
51
51
|
parser.xmlDec.encoding = tagExp.rawAttributes?.encoding;
|
|
52
52
|
parser.xmlDec.standalone = tagExp.rawAttributes?.standalone;
|
|
53
|
+
|
|
54
|
+
// BUG FIX: getNameValidator('qName') was already called (and memoized)
|
|
55
|
+
// above, at the moment this PI tag's own name ("xml") got validated — before
|
|
56
|
+
// xmlDec.version was known, so it was always cached with the '1.0'
|
|
57
|
+
// default. Every subsequent tag/attribute name in the document —
|
|
58
|
+
// including the root element — would silently be checked against XML
|
|
59
|
+
// 1.0 rules even for a document declaring version="1.1". Reset the
|
|
60
|
+
// cache now that the real version is known; this runs at most once per
|
|
61
|
+
// document (a <?xml?> declaration can only appear once), so the cost is
|
|
62
|
+
// negligible.
|
|
63
|
+
parser._nameValidators = Object.create(null);
|
|
53
64
|
}
|
|
54
65
|
|
|
55
66
|
// Flush attributes into the output builder's this.attributes accumulator
|
|
@@ -57,7 +68,7 @@ export function readPiTag(parser) {
|
|
|
57
68
|
// does for regular tags. PI tags are not pushed onto the matcher, so no
|
|
58
69
|
// updateCurrent() call is needed here.
|
|
59
70
|
if (!skipOptions.attributes) {
|
|
60
|
-
flushAttributes(tagExp.
|
|
71
|
+
flushAttributes(tagExp._parsedAttrs, parser, tagExp._attrsExpStart, tagExp._rawAttrMatchCount);
|
|
61
72
|
}
|
|
62
73
|
|
|
63
74
|
if (tagExp.tagName === "xml") {
|
package/src/CharsSymbol.js
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
export default {
|
|
2
|
-
"<" : "<", //tag start
|
|
3
|
-
">" : ">", //tag end
|
|
4
|
-
"/" : "/", //close tag
|
|
5
|
-
"!" : "!", //comment or docttype
|
|
6
|
-
"!--" : "!--", //comment
|
|
7
|
-
"-->" : "-->", //comment end
|
|
8
|
-
"?" : "?", //pi
|
|
9
|
-
"?>" : "?>", //pi end
|
|
10
|
-
"?xml" : "?xml", //pi end
|
|
11
|
-
"![" : "![", //cdata
|
|
12
|
-
"]]>" : "]]>", //cdata end
|
|
13
|
-
"[" : "[",
|
|
14
|
-
"-" : "-",
|
|
15
|
-
"D" : "D",
|
|
16
|
-
}
|