@nodable/flexible-xml-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,228 @@
1
+ import { ParseError, ErrorCode } from '../ParseError.js';
2
+
3
// Byte values (character codes) of the whitespace characters that
// readUptoCloseTag() tolerates between a closing tag name and its '>'.
const Constants = {
  space: 0x20, // ' '
  tab: 0x09, // '\t'
};
7
+
8
/**
 * BufferSource — input source backed by a Node.js Buffer (byte array).
 *
 * ### Memory reclamation
 *
 * The full document is available from the start, so there is no chunk-boundary
 * risk and rewindToMark() is a safe no-op. However, the parsed prefix of the
 * Buffer is held in memory until the parse finishes. flush() reclaims it by
 * copying the unread remainder into a fresh Buffer and rebasing startIndex.
 *
 * The same mark/flush protocol used by FeedableSource is implemented here so
 * all reader functions work without source-type conditionals:
 *
 *   markTokenStart()  — save current read position at the start of a token
 *   rewindToMark()    — no-op for BufferSource (full doc always present)
 *   clearMark()       — drop the checkpoint so auto-flush may run again
 *   flush()           — drop the already-parsed prefix to free memory
 *
 * Auto-flush fires inside updateBufferBoundary() whenever the processed
 * portion exceeds flushThreshold and no token checkpoint is active.
 *
 * ### Bytes vs. characters
 *
 * All positions (startIndex, offsets, counts) are BYTE positions. The search
 * helpers (readUpto, readUptoChar, readUptoCloseTag) therefore compare the
 * UTF-8 encoding of the stop string and advance by its byte length.
 * readCh()/readChAt() convert a single byte with String.fromCharCode(), so a
 * multi-byte UTF-8 character is returned one byte at a time.
 */
export default class BufferSource {
  /**
   * @param {Buffer} bytesArr — the full XML document as a Node.js Buffer
   * @param {object}  [options]
   * @param {boolean} [options.autoFlush=true]      — enable automatic flushing
   * @param {number}  [options.flushThreshold=1024] — flush after this many processed bytes
   */
  constructor(bytesArr, options = {}) {
    this.line = 1;
    this.cols = 0;
    this.buffer = bytesArr;
    this.startIndex = 0;

    this.autoFlush = options.autoFlush !== false;
    this.flushThreshold = options.flushThreshold ?? 1024;

    // Token-start checkpoint for mark/rewind (mirrors FeedableSource API).
    // -1 means "no checkpoint active".
    this._tokenStart = -1;
  }

  // ─── Token-start checkpoint ───────────────────────────────────────────────

  /**
   * Save the current read position as the start of a new logical token.
   *
   * While a checkpoint is set, updateBufferBoundary() will not auto-flush and
   * an explicit flush() trims only up to the checkpoint.
   */
  markTokenStart() {
    this._tokenStart = this.startIndex;
  }

  /**
   * Intentionally does nothing: startIndex and the checkpoint are left as-is.
   *
   * It exists so caller code can call rewindToMark() unconditionally without
   * branching on source type.
   */
  rewindToMark() {
    // No-op: the complete document is in memory; no rewind is ever needed.
  }

  /**
   * Drop the token checkpoint.
   *
   * Mirrors FeedableSource.clearMark(). Without it the checkpoint set by
   * markTokenStart() would stay active forever and permanently block the
   * auto-flush in updateBufferBoundary().
   */
  clearMark() {
    this._tokenStart = -1;
  }

  /**
   * Discard the already-processed prefix of the buffer to free memory.
   *
   * The remainder is copied into a fresh Buffer (Buffer.from on a subarray
   * view) so the original allocation can be garbage-collected once no other
   * references remain. If a token checkpoint is active, the trim origin is
   * the checkpoint so the in-progress token is preserved; startIndex and the
   * checkpoint are rebased accordingly.
   *
   * NOTE(review): every flush copies the whole unread remainder, so very
   * large documents with a small flushThreshold pay a repeated copy cost.
   */
  flush() {
    const hasCheckpoint = this._tokenStart >= 0;
    const origin = hasCheckpoint ? this._tokenStart : this.startIndex;
    if (origin <= 0) return;

    this.buffer = Buffer.from(this.buffer.subarray(origin));
    if (hasCheckpoint) {
      this.startIndex -= origin;
      this._tokenStart = 0;
    } else {
      this.startIndex = 0;
    }
  }

  // ─── Core read interface ──────────────────────────────────────────────────

  /**
   * Read the byte at the cursor as a one-code-unit string and advance by one.
   * @returns {string}
   */
  readCh() {
    return String.fromCharCode(this.buffer[this.startIndex++]);
  }

  /**
   * Peek at the byte `index` positions after the cursor without advancing.
   * @param {number} index — byte offset relative to startIndex
   * @returns {string}
   */
  readChAt(index) {
    return String.fromCharCode(this.buffer[this.startIndex + index]);
  }

  /**
   * Decode `n` bytes as a string without moving the cursor.
   * @param {number} n — number of bytes
   * @param {number} [from=startIndex] — absolute byte position to start at
   * @returns {string}
   */
  readStr(n, from = this.startIndex) {
    // subarray() is the non-deprecated equivalent of Buffer#slice().
    return this.buffer.subarray(from, from + n).toString();
  }

  /**
   * Read until `stopStr` is found, consume it, and return the text before it.
   *
   * The stop string is matched by its encoded bytes, and the cursor advances
   * by that byte length, so non-ASCII stop strings are handled correctly.
   *
   * @param {string} stopStr
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when the stop string is not found
   */
  readUpto(stopStr) {
    const stopBytes = Buffer.from(stopStr);
    const hit = this.buffer.indexOf(stopBytes, this.startIndex);

    // indexOf() reports buffer.length for an empty needle at/after the end;
    // treat that like "not found", matching a scan that never starts.
    if (hit === -1 || hit >= this.buffer.length) {
      throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
    }

    const result = this.buffer.subarray(this.startIndex, hit).toString();
    this.startIndex = hit + stopBytes.length;
    return result;
  }

  /**
   * Single-character variant of readUpto. Reads until `stopChar` is found,
   * consumes it, and returns the text before it.
   *
   * @param {string} stopChar Exactly one character.
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when the character is not found
   */
  readUptoChar(stopChar) {
    const stopCode = stopChar.charCodeAt(0);

    // A non-ASCII character occupies several bytes in UTF-8 and can never
    // equal a single byte value; search for its encoded form instead.
    if (stopCode > 0x7f) return this.readUpto(stopChar);

    const buf = this.buffer;
    const len = buf.length;
    for (let i = this.startIndex; i < len; i++) {
      if (buf[i] === stopCode) {
        const result = buf.subarray(this.startIndex, i).toString();
        this.startIndex = i + 1;
        return result;
      }
    }
    throw new ParseError(`Unexpected end of source reading '${stopChar}'`, ErrorCode.UNEXPECTED_END);
  }

  /**
   * Read until a closing tag is found: `stopStr`, optional spaces/tabs, '>'.
   * The whole closing tag is consumed; the text before it is returned.
   *
   * A candidate that is not followed by '>' (e.g. `</scriptX>` or
   * `</b </b>`) is rejected and the search resumes one byte after the
   * candidate's start, so a real closing tag that begins inside or right
   * after the rejected candidate is still found.
   *
   * @param {string} stopStr e.g. `"</tagname"`
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when the closing tag is not found
   */
  readUptoCloseTag(stopStr) { // stopStr: "</tagname"
    const buf = this.buffer;
    const len = buf.length;
    const stopBytes = Buffer.from(stopStr);
    const GT = 62; // '>'

    let searchFrom = this.startIndex;
    while (searchFrom < len) {
      const tagStart = buf.indexOf(stopBytes, searchFrom);
      if (tagStart === -1) break;

      // Skip whitespace between the tag name and '>'.
      let i = tagStart + stopBytes.length;
      while (i < len && (buf[i] === Constants.space || buf[i] === Constants.tab)) i++;

      // Ran off the end while looking for '>' — no later candidate can close.
      if (i >= len) break;

      if (buf[i] === GT) {
        const result = buf.subarray(this.startIndex, tagStart).toString();
        this.startIndex = i + 1;
        return result;
      }

      searchFrom = tagStart + 1; // false match — keep looking
    }

    throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
  }

  /**
   * Read `n` bytes at the cursor, updating the line/column counters.
   *
   * For n === 1 a newline byte increments `line` and resets `cols` to 0 (the
   * same convention as the constructor's initial state); any other byte
   * increments `cols`. For n > 1 only `cols` is advanced by n.
   *
   * @param {number} n — number of bytes to read
   * @param {boolean} [shouldUpdate] — when truthy, advance the cursor via
   *   updateBufferBoundary(n); otherwise the cursor is left untouched
   * @returns {string}
   */
  readFromBuffer(n, shouldUpdate) {
    let ch;
    if (n === 1) {
      const byte = this.buffer[this.startIndex];
      if (byte === 10) { // '\n'
        this.line++;
        this.cols = 0;
      } else {
        this.cols++;
      }
      ch = String.fromCharCode(byte);
    } else {
      this.cols += n;
      ch = this.buffer.subarray(this.startIndex, this.startIndex + n).toString();
    }
    if (shouldUpdate) this.updateBufferBoundary(n);
    return ch;
  }

  /**
   * Advance the read cursor by n bytes.
   *
   * Triggers an automatic flush of already-processed data when autoFlush is
   * enabled, the processed portion has grown past flushThreshold, and no
   * token checkpoint is currently active.
   *
   * @param {number} [n=1]
   */
  updateBufferBoundary(n = 1) {
    this.startIndex += n;
    const checkpointActive = this._tokenStart >= 0;
    if (this.autoFlush && !checkpointActive && this.startIndex >= this.flushThreshold) {
      this.flush();
    }
  }

  /**
   * Returns true when a byte is available at offset `n` from the cursor.
   *
   * The offset is relative to startIndex, matching readChAt() and
   * FeedableSource.canRead().
   *
   * @param {number} [n=0]
   * @returns {boolean}
   */
  canRead(n = 0) {
    return this.startIndex + n < this.buffer.length;
  }
}
@@ -0,0 +1,340 @@
1
+ import { ParseError, ErrorCode } from '../ParseError.js';
2
+
3
/**
 * FeedableSource — input source for the feed()/end() API.
 *
 * Accepts incremental string/Buffer chunks via feed(), accumulates them in a
 * single string buffer, and exposes the same read interface as StringSource so
 * Xml2JsParser can use it without modification.
 *
 * ### Incremental parsing
 *
 * The parser calls parseXml() after every feed() call, consuming as much of
 * the buffer as possible. When a chunk boundary falls mid-token (e.g. a CDATA
 * section split across two feeds), every reader function marks its start
 * position with markTokenStart() before it begins. If the reader throws
 * UNEXPECTED_END, the caller (XMLParser.feed) catches it and calls
 * rewindToMark() to restore startIndex to the beginning of the incomplete
 * token. The incomplete bytes stay in the buffer and are re-parsed on the
 * next feed() once the rest of the token has arrived.
 *
 * ### Two-level mark stack
 *
 * There are two mark levels:
 *
 *   Level 0 — outer mark, set by parseXml()'s main loop BEFORE it reads the
 *             '<' character that begins a tag dispatch. This is the position
 *             that rewindToMark() always restores to, so the full tag (including
 *             its '<![', '</', etc. prefix) is replayed correctly on the next
 *             feed().
 *
 *   Level 1 — inner mark, set by individual reader functions (readCdata,
 *             readClosingTagName, readTagExp, …) at the point where *they*
 *             begin. This does NOT affect rewindToMark(); it is used only by
 *             flush() to determine the safe trim boundary while a reader is
 *             in progress.
 *
 * Using two levels instead of a single slot prevents inner markTokenStart()
 * calls from overwriting the outer mark that feed() needs to rewind to.
 *
 * ### Binary chunks
 *
 * Buffer/typed-array chunks are decoded as UTF-8 with a streaming decoder, so
 * a multi-byte character split across two feed() calls is reassembled instead
 * of being turned into replacement characters.
 *
 * ### Memory
 *
 * Parsed data is reclaimed from the buffer automatically (autoFlush) once the
 * processed portion exceeds flushThreshold characters and no mark is active.
 *
 * maxBufferSize is checked against the live (unprocessed) portion of the
 * buffer plus the incoming chunk, not the raw buffer.length, so post-flush
 * sizing stays accurate.
 */
export default class FeedableSource {
  /**
   * @param {object}  [options]
   * @param {number}  [options.maxBufferSize] — limit for live + incoming
   *   characters; any falsy value selects the 10 MB default
   * @param {boolean} [options.autoFlush=true] — enable automatic flushing
   * @param {number}  [options.flushThreshold=1024] — flush after this many
   *   processed characters (an explicit 0 is honoured)
   */
  constructor(options = {}) {
    this.line = 1;
    this.cols = 0;
    this.buffer = '';
    this.startIndex = 0;
    this.isComplete = false;

    this.maxBufferSize = options.maxBufferSize || 10 * 1024 * 1024; // 10 MB
    this.autoFlush = options.autoFlush !== false; // true by default
    // `??` (not `||`) so an explicit 0 is kept, matching BufferSource.
    this.flushThreshold = options.flushThreshold ?? 1024; // 1 KB

    /**
     * Two-level mark stack.
     *
     *   _marks[0] — outer mark; rewindToMark() restores startIndex here.
     *   _marks[1] — inner mark; used only by flush() as a trim boundary.
     *
     * -1 means "not set" for that level.
     */
    this._marks = [-1, -1];

    // Streaming UTF-8 decoder, created on the first binary chunk.
    this._decoder = null;
  }

  /**
   * Append a data chunk to the buffer.
   *
   * maxBufferSize is checked against the live unprocessed portion
   * (buffer.length - startIndex) plus the incoming data length. Data that has
   * already been parsed and is waiting to be flushed does not count against
   * the limit.
   *
   * @param {string|Buffer} data
   * @throws {ParseError} INVALID_INPUT when the size limit would be exceeded
   */
  feed(data) {
    const newData = this._toText(data);
    const liveBytes = this.buffer.length - this.startIndex;
    const required = liveBytes + newData.length;

    if (required > this.maxBufferSize) {
      throw new ParseError(
        `Buffer size limit exceeded (${required} > ${this.maxBufferSize}). ` +
        `Increase feedable.maxBufferSize or reduce chunk size.`,
        ErrorCode.INVALID_INPUT
      );
    }

    this.buffer += newData;
  }

  /**
   * Signal that no more data will be fed.
   *
   * Any bytes of an unfinished multi-byte character still held by the
   * streaming decoder are flushed into the buffer first (they decode to a
   * replacement character, as a truncated sequence would with toString()).
   */
  end() {
    this.buffer += this._drainDecoder();
    this.isComplete = true;
  }

  /**
   * Convert a chunk to text.
   *
   * Strings pass through; typed arrays (including Buffer) go through the
   * streaming UTF-8 decoder; anything else falls back to toString().
   *
   * @param {string|Buffer|*} data
   * @returns {string}
   */
  _toText(data) {
    if (typeof data === 'string') return this._drainDecoder() + data;

    if (ArrayBuffer.isView(data)) {
      if (this._decoder === null) {
        // ignoreBOM keeps a leading BOM in the output, as Buffer#toString() does.
        this._decoder = new TextDecoder('utf-8', { ignoreBOM: true });
      }
      return this._decoder.decode(data, { stream: true });
    }

    return this._drainDecoder() + data.toString();
  }

  /**
   * Flush bytes the streaming decoder is still holding back.
   * @returns {string} the flushed text, or '' when nothing is pending
   */
  _drainDecoder() {
    return this._decoder === null ? '' : this._decoder.decode();
  }

  /**
   * Returns true when there is at least one character available at or after
   * the given offset (relative to startIndex).
   * @param {number} [n=0]
   * @returns {boolean}
   */
  canRead(n = 0) {
    return this.startIndex + n < this.buffer.length;
  }

  // ─── Two-level mark API ───────────────────────────────────────────────────

  /**
   * Save the current read position into the mark stack.
   *
   *   level 0 (default) — outer mark, the position rewindToMark() restores.
   *   level 1           — inner mark, consulted only by flush().
   *
   * The two levels are independent: writing one never touches the other.
   *
   * @param {0|1} [level=0]
   */
  markTokenStart(level = 0) {
    this._marks[level] = this.startIndex;
  }

  /**
   * Restore startIndex to the OUTER mark (level 0), if set, and clear both
   * marks.
   */
  rewindToMark() {
    const outer = this._marks[0];
    if (outer >= 0) this.startIndex = outer;
    this.clearMark();
  }

  /**
   * Clear both mark slots so stale positions do not block auto-flush.
   */
  clearMark() {
    this._marks[0] = -1;
    this._marks[1] = -1;
  }

  /**
   * Read next character and advance position, tracking line/column.
   * @returns {string}
   */
  readCh() {
    const ch = this.buffer[this.startIndex++];

    if (ch === '\n') {
      this.line++;
      this.cols = 0;
    } else {
      this.cols++;
    }

    return ch;
  }

  /**
   * Read character at offset without advancing.
   * @param {number} index - Offset from current position
   * @returns {string}
   */
  readChAt(index) {
    return this.buffer[this.startIndex + index];
  }

  /**
   * Read n characters as string without advancing.
   * @param {number} n - Number of characters to read
   * @param {number} [from] - Absolute start position (default: current position)
   * @returns {string}
   */
  readStr(n, from = this.startIndex) {
    return this.buffer.substring(from, from + n);
  }

  /**
   * Read until stop string is found.
   * @param {string} stopStr
   * @returns {string} content before the stop string (stop string is consumed)
   * @throws {ParseError} UNEXPECTED_END when stop string is not found
   */
  readUpto(stopStr) {
    const hit = this.buffer.indexOf(stopStr, this.startIndex);

    // indexOf() clamps to buffer.length for an empty needle at/after the end;
    // treat that like "not found", matching a scan that never starts.
    if (hit === -1 || hit >= this.buffer.length) {
      throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
    }

    const result = this.buffer.substring(this.startIndex, hit);
    this.startIndex = hit + stopStr.length;
    return result;
  }

  /**
   * Single-character variant of readUpto. Reads until `stopChar` is found,
   * consumes it, and returns the text before it.
   *
   * @param {string} stopChar  Exactly one character.
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when the character is not found
   */
  readUptoChar(stopChar) {
    const hit = this.buffer.indexOf(stopChar, this.startIndex);
    if (hit === -1) {
      throw new ParseError(`Unexpected end of source reading '${stopChar}'`, ErrorCode.UNEXPECTED_END);
    }
    const result = this.buffer.substring(this.startIndex, hit);
    this.startIndex = hit + 1;
    return result;
  }

  /**
   * Read until a closing tag is found (used for stop nodes): `stopStr`,
   * optional spaces/tabs, then '>'. The whole closing tag is consumed.
   *
   * A candidate that is not followed by '>' (e.g. `</scriptX>` or
   * `</b </b>`) is rejected and the search resumes one character after the
   * candidate's start, so a real closing tag that begins inside or right
   * after the rejected candidate is still found.
   *
   * @param {string} stopStr  e.g. `"</tagname"`
   * @returns {string} raw content between the current position and the closing tag
   * @throws {ParseError} UNEXPECTED_END when the closing tag is not found
   */
  readUptoCloseTag(stopStr) {
    const text = this.buffer;
    const len = text.length;

    let searchFrom = this.startIndex;
    while (searchFrom < len) {
      const tagStart = text.indexOf(stopStr, searchFrom);
      if (tagStart === -1) break;

      // Skip whitespace between the tag name and '>'.
      let i = tagStart + stopStr.length;
      while (i < len && (text[i] === ' ' || text[i] === '\t')) i++;

      // Ran off the end while looking for '>' — no later candidate can close.
      if (i >= len) break;

      if (text[i] === '>') {
        const result = text.substring(this.startIndex, tagStart);
        this.startIndex = i + 1;
        return result;
      }

      searchFrom = tagStart + 1; // false match — keep looking
    }

    throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
  }

  /**
   * Advance the read cursor by n characters.
   *
   * Triggers an automatic flush of already-processed data when autoFlush is
   * enabled, the processed portion has grown past flushThreshold, and no
   * mark is currently active. Any active mark (either level) blocks the
   * flush to prevent the saved position from becoming invalid.
   *
   * @param {number} [n=1]
   */
  updateBufferBoundary(n = 1) {
    this.startIndex += n;
    const anyMarkActive = this._marks.some((mark) => mark >= 0);
    if (this.autoFlush && !anyMarkActive && this.startIndex >= this.flushThreshold) {
      this.flush();
    }
  }

  /**
   * Discard already-processed data from the front of the buffer to free memory.
   *
   * The trim origin is the smallest of startIndex and all active marks, so
   * any in-progress token (at either mark level) stays in the buffer.
   * startIndex and the active marks are rebased by the amount trimmed; with
   * no marks active, startIndex ends up at 0.
   */
  flush() {
    // Determine the earliest position that must be kept.
    let origin = this.startIndex;
    for (const mark of this._marks) {
      if (mark >= 0 && mark < origin) origin = mark;
    }
    if (origin <= 0) return;

    this.buffer = this.buffer.substring(origin);

    // Rebase every saved position by the amount trimmed.
    this._marks = this._marks.map((mark) => (mark >= 0 ? mark - origin : mark));
    this.startIndex -= origin;
  }
}
@@ -0,0 +1,49 @@
1
+ import FeedableSource from './FeedableSource.js';
2
+
3
/**
 * StreamSource — input source that reads from a Node.js Readable stream.
 *
 * Extends FeedableSource so it shares the same buffer management and read
 * interface. attachStream() wires Node.js stream events. On each 'data'
 * event the chunk is appended to the buffer and onChunk is called so the
 * caller can run parseXml() incrementally. Parsing is therefore driven
 * chunk-by-chunk rather than once over the full accumulated document.
 */
export default class StreamSource extends FeedableSource {
  /**
   * Wire a Readable stream to this source.
   *
   * Binary chunks are decoded as UTF-8 with a streaming decoder, so a
   * multi-byte character split across two 'data' events is reassembled
   * rather than corrupted by decoding each chunk on its own.
   *
   * @param {NodeJS.ReadableStream} readable
   * @param {function(Error|null):void} onChunk
   *   Called with null after each successful feed(), or with the error thrown
   *   while decoding/feeding the chunk (or by onChunk(null) itself). It is
   *   also called once from the 'end' handler if the decoder was still
   *   holding bytes of an unfinished character.
   * @param {function():void} onEnd
   *   Called when the stream ends, after end() has been invoked on this source.
   * @param {function(Error):void} onError
   *   Called with any stream-level error ('error' event from the readable)
   *   and with any error thrown while handling 'end'.
   */
  attachStream(readable, onChunk, onEnd, onError) {
    // ignoreBOM keeps a leading BOM in the output, as Buffer#toString() does.
    const decoder = new TextDecoder('utf-8', { ignoreBOM: true });

    const toText = (chunk) => {
      if (typeof chunk === 'string') return chunk;
      // Buffer is a Uint8Array; stream mode holds back incomplete sequences.
      if (ArrayBuffer.isView(chunk)) return decoder.decode(chunk, { stream: true });
      return chunk.toString();
    };

    readable.on('data', (chunk) => {
      try {
        this.feed(toText(chunk));
        onChunk(null); // chunk appended successfully — caller runs parseXml()
      } catch (err) {
        onChunk(err); // buffer overflow or coercion failure
      }
    });

    readable.on('error', onError);

    readable.on('end', () => {
      try {
        // Flush bytes of a character that never completed (truncated input).
        const tail = decoder.decode();
        if (tail.length > 0) {
          this.feed(tail);
          onChunk(null);
        }
        this.end();
        onEnd();
      } catch (err) {
        onError(err);
      }
    });
  }
}