@nodable/flexible-xml-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ import { buildOptions } from './OptionsBuilder.js';
2
+ import { ParseError, ErrorCode } from './ParseError.js';
3
+ import Xml2JsParser from './Xml2JsParser.js';
4
+ import FeedableSource from './InputSource/FeedableSource.js';
5
+ import StreamSource from './InputSource/StreamSource.js';
6
+
7
/**
 * Public entry point of the XML parser.
 *
 * Offers three ways of supplying input:
 *   - one-shot:     parse() / parseBytesArr()
 *   - stream:       parseStream()
 *   - incremental:  feed() ... end()
 *
 * A fresh Xml2JsParser is created for every parse call / feed session; the
 * options object built in the constructor is shared between them.
 */
export default class XMLParser {

  /**
   * @param {object} [options] - User options; passed through buildOptions().
   */
  constructor(options) {
    this.options = buildOptions(options);

    // feed()/end() session state
    this._feedParser = null;
    this._feedSource = null;
    // Streaming UTF-8 decoder for byte chunks passed to feed(); created per
    // session so partial multi-byte sequences never leak between documents.
    this._feedDecoder = null;
    this._isFeeding = false;
  }

  // ─── One-shot parse methods ───────────────────────────────────────────────

  /**
   * Parse an XML string or Buffer and return a JS object.
   *
   * Any ArrayBufferView (Buffer, Uint8Array, DataView, ...) is decoded from
   * its underlying bytes as UTF-8. Other non-string values are converted with
   * their own toString().
   *
   * @param {string|Buffer|ArrayBufferView} xmlData
   * @returns {any} whatever the underlying parser's parse() returns
   * @throws {ParseError} INVALID_INPUT when xmlData is null/undefined or has
   *   no toString() method
   */
  parse(xmlData) {
    if (ArrayBuffer.isView(xmlData)) {
      // Decode the raw bytes. Calling toString() directly on a non-Buffer
      // typed array would yield "60,97,..." rather than the document text.
      xmlData = this._viewBytes(xmlData).toString();
    } else if (typeof xmlData !== 'string') {
      if (xmlData && typeof xmlData.toString === 'function') {
        xmlData = xmlData.toString();
      } else {
        throw new ParseError('XML data must be a string or Buffer.', ErrorCode.INVALID_INPUT);
      }
    }

    const parser = this._createParser();
    const result = parser.parse(xmlData);
    this.wasExited = parser.wasExited();
    this._lastParseErrors = parser.autoCloseHandler?.getErrors() ?? [];
    return result;
  }

  /**
   * Parse a Uint8Array / byte array and return a JS object.
   *
   * The bytes are copied into a new Buffer before being handed to the
   * underlying parser, so the caller's array is not shared with it.
   *
   * @param {Uint8Array|ArrayBufferView} xmlData
   * @returns {any} whatever the underlying parser's parseBytesArr() returns
   * @throws {ParseError} INVALID_INPUT when xmlData is not an ArrayBufferView
   */
  parseBytesArr(xmlData) {
    if (!ArrayBuffer.isView(xmlData)) {
      throw new ParseError('XML data must be a Uint8Array or ArrayBufferView.', ErrorCode.INVALID_INPUT);
    }

    // Copy the view's underlying bytes. Buffer.from(view) alone would copy
    // element *values*, which is wrong for multi-byte typed arrays and
    // produces an empty buffer for a DataView.
    const bytes = Buffer.from(this._viewBytes(xmlData));

    const parser = this._createParser();
    const result = parser.parseBytesArr(bytes);
    this.wasExited = parser.wasExited();
    this._lastParseErrors = parser.autoCloseHandler?.getErrors() ?? [];
    return result;
  }

  // ─── Stream input ─────────────────────────────────────────────────────────

  /**
   * Parse an XML Node.js Readable stream and return a Promise that resolves
   * with the parsed JS object.
   *
   * The stream is attached to a StreamSource; parseXml() is run after every
   * chunk notification, and the document is finalised when the source reports
   * the end of the stream.
   *
   * Note: an invalid `readable` argument is reported with a synchronous throw,
   * not a rejected promise.
   *
   * @param {NodeJS.ReadableStream} readable
   * @returns {Promise<any>}
   * @throws {ParseError} INVALID_STREAM when `readable` is not a Readable
   */
  parseStream(readable) {
    if (!isReadableStream(readable)) {
      throw new ParseError('parseStream() requires a Node.js Readable stream.', ErrorCode.INVALID_STREAM);
    }

    const source = new StreamSource(this.options.feedable);
    const streamParser = this._createParser();
    streamParser.source = source;
    streamParser.initializeParser();

    return new Promise((resolve, reject) => {
      let settled = false;

      const fail = (err) => {
        if (settled) return;
        settled = true;
        // Reject first so the promise always settles, even if tearing the
        // stream down throws or the stream has no destroy() method.
        reject(err);
        // Stop further data/end events and free the handle.
        readable.destroy?.();
      };

      source.attachStream(
        readable,
        // onChunk — run the parser incrementally after each chunk arrives.
        // Mirrors what feed() does: advance as far as possible, rewind on
        // UNEXPECTED_END (chunk boundary mid-token), report real errors.
        (err) => {
          if (settled) return; // already failed/resolved: ignore late chunks
          if (err) { fail(err); return; }
          try {
            streamParser.parseXml();
          } catch (parseErr) {
            if (parseErr.code === ErrorCode.UNEXPECTED_END) {
              source.rewindToMark();
            } else {
              fail(parseErr);
            }
          }
        },
        // onEnd — stream finished cleanly; finalise the document.
        () => {
          if (settled) return;
          try {
            streamParser.finalizeXml();
            this._lastParseErrors = streamParser.autoCloseHandler?.getErrors() ?? [];
            // Keep wasExited in sync, as parse()/parseBytesArr()/end() do.
            this.wasExited = streamParser.wasExited();
            settled = true;
            resolve(streamParser.outputBuilder.getOutput());
          } catch (err) { fail(err); }
        },
        // onError — stream-level error (e.g. file not found, network drop)
        fail,
      );
    });
  }

  // ─── Incremental feed()/end() API ────────────────────────────────────────

  /**
   * Feed an XML data chunk for incremental parsing.
   *
   * The first call opens a feed session. After appending the chunk,
   * parseXml() is run immediately so the parser advances as far as possible.
   * If it throws UNEXPECTED_END (chunk boundary fell mid-token) the source is
   * rewound to its mark so the incomplete token is re-parsed once more data
   * has arrived. Any other error ends the session and is re-thrown.
   *
   * Byte chunks (Buffer or any ArrayBufferView) are decoded as UTF-8 with a
   * streaming decoder, so a multi-byte character split across two chunks is
   * reassembled instead of being turned into replacement characters. If a
   * byte chunk ends inside a character and yields no complete text, nothing
   * is passed to the parser until the next chunk (or end()).
   *
   * Returns `this` for chaining.
   *
   * @param {string|Buffer|ArrayBufferView} data
   * @returns {XMLParser}
   * @throws {ParseError} DATA_MUST_BE_STRING when data cannot be converted
   */
  feed(data) {
    if (!this._isFeeding) {
      this._initFeedSession();
    }

    let str;
    if (typeof data === 'string') {
      // Flush bytes still pending from a previous byte chunk first so text
      // ordering is preserved when string and byte chunks are mixed.
      str = this._feedDecoder.decode() + data;
    } else if (ArrayBuffer.isView(data)) {
      str = this._feedDecoder.decode(data, { stream: true });
      if (str === '' && data.byteLength > 0) {
        // The whole chunk is an incomplete character held by the decoder.
        return this;
      }
    } else if (data?.toString) {
      str = this._feedDecoder.decode() + data.toString();
    } else {
      throw new ParseError('feed() data must be a string or Buffer.', ErrorCode.DATA_MUST_BE_STRING);
    }

    this._feedSource.feed(str);

    try {
      this._feedParser.parseXml();
    } catch (err) {
      if (err.code === ErrorCode.UNEXPECTED_END) {
        // Chunk boundary fell mid-token. Rewind to the token start so the
        // incomplete input is re-parsed when the next chunk arrives.
        this._feedSource.rewindToMark();
      } else {
        // Real parse error — clean up and propagate.
        this._cleanupFeedSession();
        throw err;
      }
    }

    return this;
  }

  /**
   * Signal end of input, finalise the document and return the parsed result.
   * The feed session is always closed afterwards, whether or not this throws.
   *
   * Steps:
   *   1. Flush the byte decoder and mark the source as ended.
   *   2. Run parseXml() one last time to replay input rewound by the last
   *      feed() call.
   *   3. If that throws UNEXPECTED_END and an autoCloseHandler exists, hand
   *      the error to handlePartialTag(); without a handler the error is
   *      re-thrown. When parseXml() succeeds, finalizeXml() is run instead.
   *
   * @returns {any} output of the parser's outputBuilder
   * @throws {ParseError} NOT_STREAMING when no feed session is open
   */
  end() {
    if (!this._isFeeding) {
      throw new ParseError('No data fed. Call feed() before end().', ErrorCode.NOT_STREAMING);
    }

    try {
      // Emit whatever the streaming decoder still holds (a truncated trailing
      // byte sequence, if any) before closing the source.
      const tail = this._feedDecoder.decode();
      if (tail !== '') {
        this._feedSource.feed(tail);
      }

      // Mark the source as complete so readers know there is no more data.
      this._feedSource.end();

      let partialTagError = null;
      const autoClose = this._feedParser.autoCloseHandler;
      if (autoClose) autoClose.reset();

      try {
        this._feedParser.parseXml();
      } catch (err) {
        // Only a truncated document with an autoClose handler is recoverable;
        // everything else is a hard error.
        if (err.code === ErrorCode.UNEXPECTED_END && autoClose) {
          partialTagError = err;
        } else {
          throw err;
        }
      }

      if (partialTagError) {
        autoClose.handlePartialTag(partialTagError, this._feedParser._parserState());
      } else {
        this._feedParser.finalizeXml();
      }

      this._lastParseErrors = autoClose?.getErrors() ?? [];
      this.wasExited = this._feedParser.wasExited();
      return this._feedParser.outputBuilder.getOutput();
    } finally {
      this._cleanupFeedSession();
    }
  }

  // ─── Error reporting ──────────────────────────────────────────────────────

  /**
   * Return the errors reported by the autoClose handler of the last parse
   * call that completed, or an empty array when there are none.
   *
   * @returns {Array}
   */
  getParseErrors() {
    return this._lastParseErrors ?? [];
  }

  // ─── Private helpers ──────────────────────────────────────────────────────

  /**
   * Create a parser instance bound to this object's options.
   * @private
   */
  _createParser() {
    return new Xml2JsParser(this.options);
  }

  /**
   * Wrap the bytes covered by an ArrayBufferView in a Buffer without copying.
   * @private
   * @param {ArrayBufferView} view
   * @returns {Buffer} Buffer sharing memory with `view`
   */
  _viewBytes(view) {
    return Buffer.from(view.buffer, view.byteOffset, view.byteLength);
  }

  /**
   * Open a feed()/end() session: new source, new parser, new byte decoder.
   * @private
   */
  _initFeedSession() {
    this._feedSource = new FeedableSource(this.options.feedable);
    this._feedParser = this._createParser();
    this._feedParser.source = this._feedSource;
    this._feedParser.initializeParser();
    // ignoreBOM keeps a leading U+FEFF in the text, matching Buffer#toString().
    this._feedDecoder = new TextDecoder('utf-8', { ignoreBOM: true });
    this._isFeeding = true;
  }

  /**
   * Drop all feed()/end() session state.
   * @private
   */
  _cleanupFeedSession() {
    this._feedParser = null;
    this._feedSource = null;
    this._feedDecoder = null;
    this._isFeeding = false;
  }
}
282
+
283
+ // ─── Helpers ─────────────────────────────────────────────────────────────────
284
+
285
/**
 * Duck-type check for a Node.js Readable stream.
 *
 * A value qualifies when it is a non-null object exposing `read` and `on` as
 * functions and a boolean `readableEnded` property.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isReadableStream(value) {
  if (value === null || typeof value !== 'object') {
    return false;
  }

  const hasStreamMethods = ['read', 'on'].every(
    (method) => typeof value[method] === 'function',
  );

  return hasStreamMethods && typeof value.readableEnded === 'boolean';
}