@nodable/flexible-xml-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -0
- package/LICENSE +21 -0
- package/README.md +284 -0
- package/lib/fxp.d.cts +652 -0
- package/package.json +80 -0
- package/src/AttributeProcessor.js +107 -0
- package/src/AutoCloseHandler.js +257 -0
- package/src/CharsSymbol.js +16 -0
- package/src/DocTypeReader.js +522 -0
- package/src/InputSource/BufferSource.js +228 -0
- package/src/InputSource/FeedableSource.js +340 -0
- package/src/InputSource/StreamSource.js +49 -0
- package/src/InputSource/StringSource.js +225 -0
- package/src/OptionsBuilder.js +400 -0
- package/src/ParseError.js +91 -0
- package/src/StopNodeProcessor.js +573 -0
- package/src/XMLParser.js +293 -0
- package/src/Xml2JsParser.js +573 -0
- package/src/XmlPartReader.js +183 -0
- package/src/XmlSpecialTagsReader.js +82 -0
- package/src/fxp.d.ts +619 -0
- package/src/fxp.js +8 -0
- package/src/util.js +58 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
import { ParseError, ErrorCode } from '../ParseError.js';
|
|
2
|
+
|
|
3
|
+
/**
 * StringSource — input source backed by an in-memory string.
 *
 * ### Memory reclamation
 *
 * The whole document is handed to the constructor, so there is no chunk
 * boundary to recover from and rewindToMark() is a no-op. The consumed prefix
 * of the string would otherwise stay referenced until the parse finishes;
 * flush() drops that prefix by slicing the buffer and rebasing startIndex
 * (and any active marks) onto the shortened buffer.
 *
 * The mark/flush protocol is intended to mirror FeedableSource so reader
 * functions can drive either source without source-type conditionals:
 *
 *   markTokenStart()  — record the current read position for a mark level
 *   rewindToMark()    — no-op for StringSource
 *   clearMark()       — reset both mark levels
 *   flush()           — drop the already-consumed prefix to free memory
 *
 * Auto-flush fires inside updateBufferBoundary() once the cursor has reached
 * flushThreshold and no mark is active.
 */
export default class StringSource {
  /**
   * @param {string} str — the full XML document string
   * @param {object} [options]
   * @param {boolean} [options.autoFlush=true] — any value other than `false` enables automatic flushing
   * @param {number} [options.flushThreshold=1024] — cursor position at which an auto-flush is attempted
   */
  constructor(str, options = {}) {
    // Position counters; initialised here but not advanced by any method of this class.
    this.line = 1;
    this.cols = 0;
    this.buffer = str;
    // Read cursor: data before this index has been consumed and may be freed.
    this.startIndex = 0;

    this.autoFlush = options.autoFlush !== false;
    this.flushThreshold = options.flushThreshold ?? 1024;

    // Two mark slots: _marks[0] = outer mark, _marks[1] = inner mark.
    // -1 means "not set" for that level.
    this._marks = [-1, -1];
  }

  // ─── Token-start checkpoint ───────────────────────────────────────────────

  /**
   * Record the current cursor position in the given mark slot.
   *
   * While any slot is set, updateBufferBoundary() will not auto-flush, and an
   * explicit flush() keeps everything from the earliest mark onwards.
   *
   * @param {0|1} [level=0] — 0 = outer mark, 1 = inner mark
   */
  markTokenStart(level = 0) {
    this._marks[level] = this.startIndex;
  }

  /**
   * No-op. The complete document is already in memory, so there is nothing to
   * rewind to; the method exists so callers can invoke it on any source.
   */
  rewindToMark() {
    // Intentionally empty.
  }

  /** Clear both mark slots. */
  clearMark() {
    this._marks[0] = -1;
    this._marks[1] = -1;
  }

  /**
   * Discard the already-processed prefix of the buffer.
   *
   * The cut point is the smallest of startIndex and every active mark, so the
   * text of an in-progress token (at either mark level) stays in the buffer.
   * startIndex and active marks are shifted by the amount removed.
   */
  flush() {
    let origin = this.startIndex;
    for (const m of this._marks) {
      if (m >= 0 && m < origin) origin = m;
    }
    if (origin <= 0) return;

    this.buffer = this.buffer.substring(origin);
    for (let i = 0; i < this._marks.length; i++) {
      if (this._marks[i] >= 0) this._marks[i] -= origin;
    }
    this.startIndex -= origin;
  }

  // ─── Core read interface ──────────────────────────────────────────────────

  /**
   * Return the character at the cursor and advance the cursor by one.
   * Returns undefined when the cursor is past the end of the buffer.
   */
  readCh() {
    return this.buffer[this.startIndex++];
  }

  /**
   * Peek at the character `index` positions after the cursor without moving it.
   *
   * @param {number} index — offset relative to the cursor
   */
  readChAt(index) {
    return this.buffer[this.startIndex + index];
  }

  /**
   * Return up to `n` characters without moving the cursor.
   *
   * @param {number} n — number of characters to read
   * @param {number} [from] — absolute buffer index to read from; defaults to the cursor
   * @returns {string}
   */
  readStr(n, from) {
    if (typeof from === 'undefined') from = this.startIndex;
    return this.buffer.substring(from, from + n);
  }

  /**
   * Read up to the next occurrence of `stopStr`, consume `stopStr`, and return
   * the text that preceded it.
   *
   * @param {string} stopStr
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when `stopStr` does not occur at or after the cursor
   */
  readUpto(stopStr) {
    const cursor = this.startIndex;
    // The explicit bounds check keeps an exhausted source an error even for a
    // degenerate empty stopStr, which indexOf would otherwise "find".
    const hit = cursor < this.buffer.length ? this.buffer.indexOf(stopStr, cursor) : -1;
    if (hit === -1) {
      throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
    }
    const result = this.buffer.substring(cursor, hit);
    this.startIndex = hit + stopStr.length;
    return result;
  }

  /**
   * Single-character variant of readUpto. Reads until `stopChar` is found,
   * consumes it, and returns the text before it.
   *
   * @param {string} stopChar Exactly one character.
   * @returns {string}
   * @throws {ParseError} UNEXPECTED_END when `stopChar` is not found
   */
  readUptoChar(stopChar) {
    const i = this.buffer.indexOf(stopChar, this.startIndex);
    if (i === -1) {
      throw new ParseError(`Unexpected end of source reading '${stopChar}'`, ErrorCode.UNEXPECTED_END);
    }
    const result = this.buffer.substring(this.startIndex, i);
    this.startIndex = i + 1;
    return result;
  }

  /**
   * Read up to a closing tag and consume it.
   *
   * A closing tag is `stopStr` (e.g. "</tagname"), followed by optional XML
   * whitespace (space, tab, CR, LF), followed by '>'. An occurrence of
   * `stopStr` followed by anything else (e.g. "</scriptX>") is a false match
   * and scanning resumes right after the position where it began, so a real
   * closing tag that starts at the character which broke the false match is
   * still found.
   *
   * @param {string} stopStr — "</" + tag name
   * @returns {string} the text between the cursor and the closing tag
   * @throws {ParseError} UNEXPECTED_END when no closing tag is found
   */
  readUptoCloseTag(stopStr) {
    const buf = this.buffer;
    const len = buf.length;
    let searchFrom = this.startIndex;

    while (searchFrom < len) {
      const tagStart = buf.indexOf(stopStr, searchFrom);
      if (tagStart === -1) break;

      // Skip whitespace allowed between the tag name and '>'.
      let i = tagStart + stopStr.length;
      while (i < len) {
        const c = buf[i];
        if (c !== ' ' && c !== '\t' && c !== '\n' && c !== '\r') break;
        i++;
      }

      if (i < len && buf[i] === '>') {
        const result = buf.substring(this.startIndex, tagStart);
        this.startIndex = i + 1;
        return result;
      }

      // False match: continue searching just past where it started.
      searchFrom = tagStart + 1;
    }

    throw new ParseError(`Unexpected end of source reading '${stopStr}'`, ErrorCode.UNEXPECTED_END);
  }

  /**
   * Read `n` characters at the cursor; a single character is returned via
   * index access, longer reads via substring.
   *
   * @param {number} n — number of characters to read
   * @param {boolean} [updateIndex] — when truthy, advance the cursor by `n`
   *   through updateBufferBoundary() (which may auto-flush)
   * @returns {string|undefined}
   */
  readFromBuffer(n, updateIndex) {
    const ch = n === 1
      ? this.buffer[this.startIndex]
      : this.buffer.substring(this.startIndex, this.startIndex + n);
    if (updateIndex) this.updateBufferBoundary(n);
    return ch;
  }

  /**
   * Advance the read cursor by n characters.
   *
   * Triggers flush() when autoFlush is enabled, the cursor has reached
   * flushThreshold, and neither mark slot is set (flushing is skipped while a
   * mark is live).
   *
   * @param {number} [n=1]
   */
  updateBufferBoundary(n = 1) {
    this.startIndex += n;
    const anyMarkActive = this._marks[0] >= 0 || this._marks[1] >= 0;
    if (this.autoFlush && this.startIndex >= this.flushThreshold && !anyMarkActive) {
      this.flush();
    }
  }

  /**
   * Report whether the buffer extends past a position.
   *
   * @param {number} [n] — position compared directly against the buffer
   *   length (not offset by the cursor); defaults to the cursor itself
   * @returns {boolean} true when `n` is less than the buffer length
   */
  canRead(n) {
    n = (n !== undefined) ? n : this.startIndex;
    return this.buffer.length - n > 0;
  }
}
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
import { CompactBuilderFactory } from '@nodable/compact-builder';
|
|
2
|
+
import { Expression, ExpressionSet } from 'path-expression-matcher';
|
|
3
|
+
import { ParseError, ErrorCode } from './ParseError.js';
|
|
4
|
+
import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Default handler for property names listed in DANGEROUS_PROPERTY_NAMES:
 * such a name is returned with "__" prepended; any other name is returned
 * unchanged.
 *
 * @param {string} name
 * @returns {string}
 */
const defaultOnDangerousProperty = (name) => (
  DANGEROUS_PROPERTY_NAMES.includes(name) ? "__" + name : name
);
|
|
12
|
+
|
|
13
|
+
// Baseline option values. buildOptions() deep-clones this object and overlays
// the caller-supplied options on the copy, so this object itself is never
// handed out by buildOptions().
export const defaultOptions = {
  // --- skip group ---
  // Controls which node types are excluded from output.
  skip: {
    declaration: false, // Skip the <?xml ... ?> declaration
    pi: false, // Skip processing instructions (other than the declaration)
    attributes: true, // Skip all attributes
    cdata: false, // Exclude CDATA sections from output entirely
    comment: false, // Exclude comments from output entirely
    nsPrefix: false, // Strip namespace prefixes (e.g. ns:tag → tag)
    tags: [], // Tag paths to skip entirely — content is silently dropped from output
  },

  // --- nameFor group ---
  // Property names used when including special nodes in output.
  nameFor: {
    text: '#text', // Property for mixed text content
    cdata: '', // '' = merge CDATA into the text value
    comment: '', // '' = omit comments from output
  },

  // --- attributes group ---
  attributes: {
    booleanType: false, // Allow valueless attributes (treated as boolean true)
    groupBy: '', // Group all attributes under this key; '' = inline with the tag
    prefix: '@_', // Prepended to attribute names in output
    suffix: '', // Appended to attribute names in output
  },

  // --- tags group ---
  tags: {
    unpaired: [], // Tags that never have a closing tag (e.g. br, img, hr)
    stopNodes: [], // Tag paths whose content is captured raw without parsing
  },

  // --- security ---
  strictReservedNames: false,
  onDangerousProperty: defaultOnDangerousProperty,

  // --- filtering (path-expression-matcher) ---
  only: [], // Reserved for future use

  // --- DOCTYPE parsing ---
  // Controls whether DOCTYPE entities are collected, plus read-time security limits.
  //
  // enabled — false (default) → DOCTYPE is read (to consume it) but entities
  //                             are discarded and never forwarded to output builders
  //           true            → collect DOCTYPE entities and forward them to the output builder
  //           Note: the output builder must have an EntitiesValueParser registered
  //           under 'entity', and 'entity' must be in its valueParsers chain,
  //           for replacement to actually happen.
  //
  // Read-time security limits (enforced by DocTypeReader at declaration time):
  //   maxEntityCount — max entities declared in a DOCTYPE (default: 100)
  //   maxEntitySize  — max bytes per entity definition value (default: 10000)
  //
  // Replacement-time limits (maxTotalExpansions, maxExpandedLength) are configured
  // on EntitiesValueParser directly — they are not part of doctypeOptions.
  doctypeOptions: {
    enabled: false,
    maxEntityCount: 100,
    maxEntitySize: 10000,
  },

  // --- autoClose ---
  // Controls parser behaviour when tags are unclosed or mismatched.
  //
  // onEof — what to do when EOF is reached with open tags still on the stack
  //   'throw' (default) → throw an error
  //   'closeAll'        → silently close all remaining open tags
  //
  // onMismatch — what to do when a closing tag doesn't match the current open tag
  //   'throw' (default) → throw an error
  //   'recover'         → pop the stack toward the nearest matching opener;
  //                       if no match is found the tag is discarded
  //   'discard'         → silently ignore the bad closing tag
  //
  // collectErrors — when true, errors are recorded in result.__parseErrors instead
  //   of being silently dropped. Each entry has the shape:
  //   { type, tag, expected, line, col, index }
  //
  // Shorthand: autoClose: 'html' sets onEof:'closeAll', onMismatch:'discard',
  // collectErrors:true, and adds the standard HTML void elements to tags.unpaired.
  // Any other string is taken as the onEof value (onMismatch:'throw',
  // collectErrors:false).
  autoClose: null, // null = feature disabled; throws on any malformed input

  // --- limits (DoS prevention) ---
  // Structural limits that guard against resource exhaustion.
  //
  // maxNestedTags — maximum tag nesting depth; throws when exceeded.
  //                 Prevents stack-overflow attacks via deeply nested XML.
  //                 Must be a positive integer when set.
  //                 Default: null (no limit)
  //
  // maxAttributesPerTag — maximum number of attributes on a single tag.
  //                       Throws when a tag exceeds this count.
  //                       Must be a non-negative integer when set.
  //                       Default: null (no limit)
  //
  limits: {
    maxNestedTags: null,
    maxAttributesPerTag: null,
  },

  // --- feedable (feed/end and parseStream input options) ---
  // Controls buffer behaviour for the FeedableSource and StreamSource.
  //
  // maxBufferSize — maximum number of characters allowed in the buffer at
  //                 any one time. Prevents memory exhaustion when a caller
  //                 feeds data faster than it is consumed.
  //                 Default: 10 * 1024 * 1024 characters
  //
  // autoFlush — when true (default), already-processed characters are
  //             automatically discarded from the front of the buffer
  //             whenever the processed portion exceeds flushThreshold.
  //             Keeps memory usage flat for large documents.
  //
  // flushThreshold — number of processed characters that triggers an auto-
  //                  flush. Lower values free memory sooner but incur more
  //                  string-slice operations. Default: 1024 characters
  //
  feedable: {
    maxBufferSize: 10 * 1024 * 1024,
    autoFlush: true,
    flushThreshold: 1024,
  },

  // --- exitIf ---
  // Stops parsing as soon as the predicate returns true for the current tag.
  //
  // The callback receives a read-only matcher positioned at the just-opened tag:
  //   exitIf(matcher) → boolean
  //
  // When exitIf returns true the parser immediately:
  //   1. Closes all currently open tags (innermost first) by calling addTextNode()
  //      and popTag() for each, so the output builder can finalise its tree.
  //   2. Calls outputBuilder.onExit({ tagDetail, matcher, tagsStack }) so the
  //      builder can record that the parse was intentionally truncated.
  //   3. Breaks the parse loop — no further source characters are read.
  //
  // The parse call returns the partial-but-consistent output as normal.
  // No error is thrown.
  //
  // Must be a function when set (buildOptions rejects anything else).
  // Default: null (feature disabled)
  exitIf: null,

  // --- output ---
  // null → buildOptions() installs a new CompactBuilderFactory.
  OutputBuilder: null, //TODO: accept lower case
};
|
|
159
|
+
|
|
160
|
+
// All names that should never appear as property keys: the union of
// criticalProperties and DANGEROUS_PROPERTY_NAMES (both imported from
// ./util.js), held in a Set for membership checks in validatePropertyName().
const ALL_RESERVED = new Set([...criticalProperties, ...DANGEROUS_PROPERTY_NAMES]);
// Publicly exported under the name RESERVED_JS_NAMES.
export { ALL_RESERVED as RESERVED_JS_NAMES };
|
|
163
|
+
|
|
164
|
+
/**
 * Reject an option value that names a reserved property.
 *
 * Values that are not strings, and the empty string, are ignored.
 *
 * @param {*} value - Candidate property name taken from the user options
 * @param {string} optionName - Option path used in the error message
 * @throws {ParseError} SECURITY_RESERVED_OPTION when `value` is in ALL_RESERVED
 */
function validatePropertyName(value, optionName) {
  const isCandidate = typeof value === 'string' && value !== '';
  if (!isCandidate || !ALL_RESERVED.has(value)) return;

  throw new ParseError(
    `SECURITY: '${value}' is a reserved JavaScript keyword and cannot be used as ${optionName}`,
    ErrorCode.SECURITY_RESERVED_OPTION
  );
}
|
|
173
|
+
|
|
174
|
+
/**
 * Build the fully-resolved parser options.
 *
 * Steps, in order:
 *   1. Validate security-sensitive names and the `limits` group on the raw
 *      user options.
 *   2. Deep-clone defaultOptions and overlay the user options on the clone.
 *   3. Install a CompactBuilderFactory when no OutputBuilder was supplied.
 *   4. Normalise `tags.stopNodes` and `skip.tags` into Expression objects and
 *      attach sealed ExpressionSets as `tags.stopNodesSet` / `skip.tagsSet`.
 *   5. Fall back to the default onDangerousProperty handler when none is set.
 *   6. Validate `exitIf` and resolve `autoClose`.
 *
 * @param {object} [options] - User-supplied options; may be omitted
 * @returns {object} A new options object
 * @throws {ParseError} SECURITY_RESERVED_OPTION for a reserved property name,
 *   INVALID_INPUT for malformed `limits`, `exitIf`, `stopNodes` or `skip.tags`
 */
export const buildOptions = function (options) {
  // Validate security-sensitive option values BEFORE merging.
  // validatePropertyName() ignores non-string and empty values itself.
  if (options) {
    validatePropertyName(options.nameFor?.text, 'nameFor.text');
    validatePropertyName(options.nameFor?.cdata, 'nameFor.cdata');
    validatePropertyName(options.nameFor?.comment, 'nameFor.comment');
    validatePropertyName(options.attributes?.prefix, 'attributes.prefix');
    validatePropertyName(options.attributes?.groupBy, 'attributes.groupBy');

    // Validate limits option
    if (options.limits !== undefined && options.limits !== null) {
      if (typeof options.limits !== 'object') {
        throw new ParseError(`'limits' must be an object, got ${typeof options.limits}`, ErrorCode.INVALID_INPUT);
      }
      const { maxNestedTags, maxAttributesPerTag } = options.limits;
      if (maxNestedTags !== undefined && maxNestedTags !== null &&
        (typeof maxNestedTags !== 'number' || !Number.isInteger(maxNestedTags) || maxNestedTags < 1)) {
        throw new ParseError(`'limits.maxNestedTags' must be a positive integer, got ${maxNestedTags}`, ErrorCode.INVALID_INPUT);
      }
      if (maxAttributesPerTag !== undefined && maxAttributesPerTag !== null &&
        (typeof maxAttributesPerTag !== 'number' || !Number.isInteger(maxAttributesPerTag) || maxAttributesPerTag < 0)) {
        throw new ParseError(`'limits.maxAttributesPerTag' must be a non-negative integer, got ${maxAttributesPerTag}`, ErrorCode.INVALID_INPUT);
      }
    }
  }

  const finalOptions = deepClone(defaultOptions);

  if (options) {
    copyProperties(finalOptions, options);
  }

  if (!finalOptions.OutputBuilder) {
    finalOptions.OutputBuilder = new CompactBuilderFactory();
  }

  // Normalize stopNodes and skip.tags entries into Expression objects with config embedded
  // in Expression.data as { nested, skipEnclosures }, and collect them in a sealed
  // ExpressionSet for matching in the parser.
  //
  // Accepted entry forms (identical for both stopNodes and skip.tags):
  //   "..script"
  //     → Expression("..script", {}, { nested: false, skipEnclosures: [] })
  //
  //   Expression instance
  //     → re-wrapped; `nested` / `skipEnclosures` already present in its data
  //       are kept, missing ones get the defaults
  //
  //   { expression: "..script",  nested?: boolean, skipEnclosures?: [] }
  //   { expression: Expression,  nested?: boolean, skipEnclosures?: [] }
  //     → Expression with the given config embedded in .data
  //
  // `nested` defaults to false; `skipEnclosures` defaults to [].
  // The two flags are fully independent — any combination is valid.
  if (Array.isArray(finalOptions.tags?.stopNodes)) {
    const stopSet = new ExpressionSet();
    finalOptions.tags.stopNodes = finalOptions.tags.stopNodes.map(
      (entry) => normalizeTagEntry(entry, 'stopNodes', stopSet)
    );
    stopSet.seal();
    finalOptions.tags.stopNodesSet = stopSet;
  }

  if (Array.isArray(finalOptions.skip?.tags)) {
    const skipSet = new ExpressionSet();
    finalOptions.skip.tags = finalOptions.skip.tags.map(
      (entry) => normalizeTagEntry(entry, 'skip.tags', skipSet)
    );
    skipSet.seal();
    finalOptions.skip.tagsSet = skipSet;
  }

  // An explicit `onDangerousProperty: undefined` can survive the merge, so
  // treat undefined exactly like null and fall back to the default handler.
  if (finalOptions.onDangerousProperty === null || finalOptions.onDangerousProperty === undefined) {
    finalOptions.onDangerousProperty = defaultOnDangerousProperty;
  }

  // Validate exitIf
  if (finalOptions.exitIf !== null && finalOptions.exitIf !== undefined) {
    if (typeof finalOptions.exitIf !== 'function') {
      throw new ParseError(
        `'exitIf' must be a function, got ${typeof finalOptions.exitIf}`,
        ErrorCode.INVALID_INPUT,
      );
    }
  }

  // Resolve autoClose: expand the 'html' preset and normalise to an object
  finalOptions.autoClose = resolveAutoClose(finalOptions.autoClose, finalOptions);

  return finalOptions;
};
|
|
264
|
+
|
|
265
|
+
/**
 * Standard HTML void elements — never have a closing tag.
 * resolveAutoClose() merges this list into `tags.unpaired` for the
 * `autoClose: 'html'` preset.
 */
const HTML_VOID_ELEMENTS = [
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
  'link', 'meta', 'param', 'source', 'track', 'wbr',
];
|
|
270
|
+
|
|
271
|
+
/**
 * Turn the raw `autoClose` option into null (feature disabled) or an object
 * of the shape `{ onEof, onMismatch, collectErrors }`.
 *
 *   falsy value          → null
 *   'html'               → preset: closeAll / discard / collectErrors, and the
 *                          HTML void elements are merged (de-duplicated) into
 *                          `opts.tags.unpaired`
 *   any other string     → used as `onEof`; onMismatch 'throw', no error collection
 *   object               → per-field values, falsy fields replaced by
 *                          'throw' / 'throw' / false
 *   anything else        → null
 *
 * @param {null|string|object} raw - Value supplied by the user
 * @param {object} opts - The already-merged final options (its `tags` property
 *   is replaced for the html preset)
 * @returns {null|object}
 */
function resolveAutoClose(raw, opts) {
  if (!raw) return null;

  switch (typeof raw) {
    case 'string': {
      if (raw !== 'html') {
        // e.g. autoClose: 'closeAll' — shorthand for onEof
        return { onEof: raw, onMismatch: 'throw', collectErrors: false };
      }

      // HTML preset: extend the caller's unpaired list with the void elements.
      const unpairedSoFar = opts.tags.unpaired || [];
      const unpaired = Array.from(new Set([...unpairedSoFar, ...HTML_VOID_ELEMENTS]));
      opts.tags = { ...opts.tags, unpaired };

      return { onEof: 'closeAll', onMismatch: 'discard', collectErrors: true };
    }

    case 'object':
      return {
        onEof: raw.onEof || 'throw',
        onMismatch: raw.onMismatch || 'throw',
        collectErrors: raw.collectErrors || false,
      };

    default:
      return null;
  }
}
|
|
314
|
+
|
|
315
|
+
/**
 * Convert one `tags.stopNodes` / `skip.tags` entry into a fresh Expression
 * carrying `{ nested, skipEnclosures }` in its data, add it to `set`, and
 * return it.
 *
 * Accepted forms:
 *   string                    → pattern as given; nested false, skipEnclosures []
 *   Expression instance       → pattern from toString(); nested / skipEnclosures
 *                               read from its data, defaulting to false / []
 *   { expression: string|Expression, nested?, skipEnclosures? }
 *                             → nested is true only for a literal `true`;
 *                               skipEnclosures is used only when it is an array
 *
 * @param {string|Expression|object} entry
 * @param {string} optionName - Used in error messages ("stopNodes" or "skip.tags")
 * @param {ExpressionSet} set - The set to register the resulting Expression into
 * @returns {Expression}
 * @throws {ParseError} INVALID_INPUT for an empty pattern or an unsupported entry shape
 */
function normalizeTagEntry(entry, optionName, set) {
  // Shared guard for the two places a pattern string can come from.
  const requireNonEmpty = (text) => {
    if (text.length === 0) throw new ParseError(`${optionName} expression cannot be empty`, ErrorCode.INVALID_INPUT);
    return text;
  };

  let pattern;
  let nested;
  let skipEnclosures;

  if (typeof entry === 'string') {
    pattern = requireNonEmpty(entry);
    nested = false;
    skipEnclosures = [];
  } else if (entry instanceof Expression) {
    // Bare Expression — keep its pattern, apply defaults for missing data fields
    pattern = entry.toString();
    nested = entry.data?.nested ?? false;
    skipEnclosures = entry.data?.skipEnclosures ?? [];
  } else if (entry && typeof entry === 'object' && entry.expression !== undefined) {
    const { expression } = entry;
    if (typeof expression === 'string') {
      pattern = requireNonEmpty(expression);
    } else if (expression instanceof Expression) {
      pattern = expression.toString();
    } else {
      throw new ParseError(`${optionName} expression must be a string or Expression instance`, ErrorCode.INVALID_INPUT);
    }
    nested = entry.nested === true;
    skipEnclosures = Array.isArray(entry.skipEnclosures) ? entry.skipEnclosures : [];
  } else {
    throw new ParseError(
      `Invalid ${optionName} entry: expected a string, Expression, or { expression, nested?, skipEnclosures? } object.`,
      ErrorCode.INVALID_INPUT,
    );
  }

  const normalized = new Expression(pattern, {}, { nested, skipEnclosures });
  set.add(normalized);
  return normalized;
}
|
|
365
|
+
|
|
366
|
+
/**
 * Recursively copy plain objects and arrays.
 *
 * Returned as-is (not copied): primitives, null, functions, RegExp instances
 * and Expression instances. Arrays are cloned element by element; any other
 * object is rebuilt as a plain object from its own enumerable string keys.
 *
 * @param {*} obj
 * @returns {*}
 */
function deepClone(obj) {
  const isObjectLike = obj !== null && typeof obj === 'object';
  if (!isObjectLike) return obj; // primitives and functions pass through

  if (Array.isArray(obj)) {
    return obj.map((item) => deepClone(item));
  }

  // Shared by reference rather than copied.
  if (obj instanceof RegExp || obj instanceof Expression) return obj;

  const copy = {};
  Object.keys(obj).forEach((key) => {
    copy[key] = deepClone(obj[key]);
  });
  return copy;
}
|
|
377
|
+
|
|
378
|
+
/**
 * Recursively overlay the own enumerable properties of `source` onto `target`.
 *
 * Rules per key:
 *   - `__proto__`, `constructor` and `prototype` are never copied (guard
 *     against prototype pollution via option keys).
 *   - A value of `undefined` is skipped, so an option passed as
 *     `{ key: undefined }` keeps the value already present in `target`
 *     instead of wiping it out.
 *   - The `OutputBuilder` key, functions, RegExp instances, arrays, null and
 *     primitives are assigned by reference/value.
 *   - Any other object is merged key by key; when `target[key]` is not
 *     already an object it is first replaced with `{}`.
 *
 * @param {object} target - Object that is mutated
 * @param {object} source - Object whose properties are copied
 */
function copyProperties(target, source) {
  for (const key of Object.keys(source)) {
    // Guard against prototype pollution via option keys
    if (key === '__proto__' || key === 'constructor' || key === 'prototype') continue;

    const value = source[key];

    // An explicitly-undefined option means "not provided": keep what target has.
    if (value === undefined) continue;

    const isMergeableObject =
      key !== 'OutputBuilder' &&
      value !== null &&
      typeof value === 'object' &&
      !(value instanceof RegExp) &&
      !Array.isArray(value);

    if (!isMergeableObject) {
      target[key] = value;
      continue;
    }

    if (typeof target[key] !== 'object' || target[key] === null) {
      target[key] = {};
    }
    copyProperties(target[key], value);
  }
}
|