@nodable/entities 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Built-in entity tables
3
+ // ---------------------------------------------------------------------------
4
+
5
/**
 * Predefined XML entities other than `&amp;`.
 *
 * Each regex matches the named reference together with the decimal and
 * hexadecimal character references for the same character (leading zeros
 * allowed).
 *
 * Every value has the shape { regex: RegExp, val: string }.
 */
const DEFAULT_XML_ENTITIES = Object.fromEntries(
  [
    ['apos', /&(apos|#0*39|#x0*27);/g, "'"],
    ['gt', /&(gt|#0*62|#x0*3[Ee]);/g, '>'],
    ['lt', /&(lt|#0*60|#x0*3[Cc]);/g, '<'],
    ['quot', /&(quot|#0*34|#x0*22);/g, '"'],
  ].map(([name, regex, val]) => [name, { regex, val }]),
);

/**
 * `&amp;` and its numeric forms. Kept out of the table above so it can be
 * expanded in a separate, final pass and its output is never re-scanned.
 */
const AMP_ENTITY = {
  regex: /&(amp|#0*38|#x0*26);/g,
  val: '&',
};
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // Helpers
23
+ // ---------------------------------------------------------------------------
24
+
25
// Characters that may never appear in an entity name: regex metacharacters
// and markup-significant punctuation. (A single backslash is enough — the
// Set de-duplicates anyway.)
const SPECIAL_CHARS = new Set('!?\\/[]$%{}^&*()<>|+');

/**
 * Validate that an entity name is a non-empty string containing no
 * regex-special or otherwise dangerous characters.
 *
 * Non-string and empty names are rejected explicitly: iterating a non-string
 * would otherwise surface as an unrelated TypeError, and an empty name would
 * silently produce a pattern matching the bare text `&;`.
 *
 * @param {string} name
 * @returns {string} the name, unchanged
 * @throws {Error} when `name` is not a non-empty string, or contains a
 *   forbidden character
 */
function validateEntityName(name) {
  if (typeof name !== 'string' || name.length === 0) {
    throw new Error(
      `[EntityReplacer] Entity name must be a non-empty string (received ${typeof name})`
    );
  }
  for (const ch of name) {
    if (SPECIAL_CHARS.has(ch)) {
      throw new Error(`[EntityReplacer] Invalid character '${ch}' in entity name: "${name}"`);
    }
  }
  return name;
}
42
+
43
/**
 * Escape a string so that it matches literally when spliced into a RegExp
 * source (built without the `u` flag).
 *
 * Covers every regex metacharacter, not only the ones that survive entity
 * name validation, so the helper is safe even when a caller has not
 * validated its input first. `-` and `:` are escaped as well, as before.
 *
 * @param {string} str
 * @returns {string} `str` with each special character preceded by a backslash
 */
function escapeForRegex(str) {
  return str.replace(/[.\-+*:?^${}()|[\]\\]/g, '\\$&');
}
49
+
50
/**
 * Turn a constructor option into the entity table to use, or `null` when the
 * category is switched off.
 *
 *   - `true`               → the built-in table
 *   - `undefined`          → the built-in table only if `enabledByDefault`
 *   - `false` / `null`     → disabled
 *   - any other object     → used as the table as-is
 *   - anything else        → disabled
 *
 * @param {*} option
 * @param {object|null} builtIn
 * @param {boolean} [enabledByDefault=false]
 * @returns {object|null}
 */
function resolveTable(option, builtIn, enabledByDefault = false) {
  switch (option) {
    case true:
      return builtIn;
    case undefined:
      return enabledByDefault ? builtIn : null;
    case false:
    case null:
      return null;
    default:
      return typeof option === 'object' ? option : null;
  }
}
60
+
61
/**
 * Normalise the `applyLimitsTo` option.
 *
 *   - `'all'`, or an array that contains `'all'` → the string `'all'`
 *   - any other string                           → a one-element Set
 *   - any other array                            → a Set of its items
 *   - anything else                              → `Set { 'external' }`
 *
 * An array containing `'all'` is collapsed to `'all'` on purpose: consumers
 * test for the string, so a `Set { 'all' }` would match no category and
 * silently switch every limit off.
 *
 * @param {'all'|string|string[]|*} spec
 * @returns {'all'|Set<string>}
 */
function resolveApplyLimitsTo(spec) {
  if (spec === 'all') return 'all';
  if (typeof spec === 'string') return new Set([spec]);
  if (Array.isArray(spec)) {
    return spec.includes('all') ? 'all' : new Set(spec);
  }
  return new Set(['external']);
}
70
+
71
/**
 * Build an entries array from a raw map of name → string | { regex, val }.
 *
 *   - String values containing '&' are skipped (recursive expansion risk).
 *   - Object values must carry `val`; DocTypeReader's `regx` spelling is
 *     normalised to `regex`.
 *   - An object value that supplies neither `regex` nor `regx` gets the same
 *     `&name;` pattern a plain string would. Without this the entry would
 *     carry `regex: undefined`, and `String.prototype.replace(undefined, …)`
 *     replaces the literal text "undefined".
 *   - A null/undefined map yields an empty list instead of throwing.
 *   - Values of any other type are ignored.
 *
 * @param {object|null|undefined} map
 * @returns {Array<[string, {regex: RegExp, val: string|Function}]>}
 * @throws {Error} when a name that needs a generated pattern fails validation
 */
function buildEntries(map) {
  if (map === null || map === undefined) return [];

  // Pattern for a bare entity name; the name is validated before use.
  const patternFor = (name) => {
    validateEntityName(name);
    return new RegExp('&' + escapeForRegex(name) + ';', 'g');
  };

  const entries = [];
  for (const key of Object.keys(map)) {
    const raw = map[key];
    if (typeof raw === 'object' && raw !== null && raw.val !== undefined) {
      // Accept pre-built { regex, val } or DocTypeReader's { regx, val }
      const regex = raw.regex ?? raw.regx ?? patternFor(key);
      entries.push([key, { regex, val: raw.val }]);
    } else if (typeof raw === 'string') {
      if (raw.includes('&')) continue; // skip — would cause recursive expansion
      entries.push([key, { regex: patternFor(key), val: raw }]);
    }
  }
  return entries;
}
97
+
98
+ // ---------------------------------------------------------------------------
99
+ // EntityReplacer
100
+ // ---------------------------------------------------------------------------
101
+
102
/**
 * Standalone, zero-dependency entity replacer for XML/HTML content.
 *
 * Entity categories:
 *   - **persistent external** — configured once, survive across documents.
 *     Set via `setExternalEntities()` or built up via `addExternalEntity()`.
 *   - **input / runtime** — DOCTYPE entities for the *current* document only.
 *     Injected via `addInputEntities()`. Wiped on every `getInstance()` call
 *     so they never leak between documents.
 *
 * Replacement order (fixed):
 *   1. persistent external
 *   2. input / runtime  (DOCTYPE)
 *   3. system           (named entity groups)
 *   4. default          (lt / gt / apos / quot)
 *   5. amp              (&amp; final pass)
 *
 * String replacement values are always inserted literally: `$$`, `$&`, `$1`
 * and the other `String.prototype.replace` substitution patterns have no
 * special meaning. Use a function value when the replacement has to depend
 * on the match.
 *
 * @example
 * const replacer = new EntityReplacer({ default: true, system: COMMON_HTML });
 * replacer.setExternalEntities({ brand: 'Acme' });
 *
 * // Builder factory calls getInstance() before each document:
 * const instance = replacer.getInstance();
 * // Builder calls addInputEntities() if DOCTYPE entities are present:
 * instance.addInputEntities({ version: '1.0' });
 * instance.replace('&brand; v&version; &lt;'); // 'Acme v1.0 <'
 */
export default class EntityReplacer {
  /**
   * @param {object} [options]
   * @param {boolean|object|null} [options.default=true]   `false`/`null` disables the
   *   lt/gt/apos/quot table; an object replaces it.
   * @param {boolean|object|null} [options.amp=true]       `false`/`null` disables the
   *   final `&amp;` pass; every other value leaves it on.
   * @param {boolean|object|null} [options.system=false]   Table of named entity groups;
   *   there is no built-in one, so only an object enables this pass.
   * @param {number} [options.maxTotalExpansions=0]        Cap on counted expansions; 0 = off.
   * @param {number} [options.maxExpandedLength=0]         Cap on counted net growth; 0 = off.
   * @param {'external'|'all'|string[]} [options.applyLimitsTo='external']
   *   Categories (`external`, `system`, `default`) the two caps apply to.
   * @param {((resolved: string, original: string) => string)|null} [options.postCheck=null]
   *   Called with the result and the input whenever `replace()` changed the string.
   */
  constructor(options = {}) {
    // The default parameter only covers `undefined`; tolerate an explicit null too.
    const opts = options ?? {};

    // Configuration resolved once at construction
    this._defaultTable = resolveTable(opts.default, DEFAULT_XML_ENTITIES, true);
    this._systemTable = resolveTable(opts.system, null, false);
    this._ampEnabled = opts.amp !== false && opts.amp !== null;

    this._maxTotalExpansions = opts.maxTotalExpansions || 0;
    this._maxExpandedLength = opts.maxExpandedLength || 0;
    this._applyLimitsTo = resolveApplyLimitsTo(opts.applyLimitsTo ?? 'external');
    this._postCheck = typeof opts.postCheck === 'function' ? opts.postCheck : null;

    // Pre-computed category limit flags
    const isLimited = (category) =>
      this._applyLimitsTo === 'all' ||
      (this._applyLimitsTo instanceof Set && this._applyLimitsTo.has(category));
    this._limitExternal = isLimited('external');
    this._limitSystem = isLimited('system');
    this._limitDefault = isLimited('default');

    // Entry arrays derived from the tables; never reassigned after construction
    this._defaultEntries = this._defaultTable ? Object.entries(this._defaultTable) : [];
    this._systemEntries = this._systemTable ? Object.entries(this._systemTable) : [];

    // Persistent external entities — survive across documents
    /** @type {Array<[string, {regex: RegExp, val: string | Function}]>} */
    this._persistentEntries = [];

    // Input / runtime entities — current document only, reset per getInstance()
    /** @type {Array<[string, {regex: RegExp, val: string | Function}]>} */
    this._inputEntries = [];

    // Per-document counters — reset in getInstance() and addInputEntities()
    this._totalExpansions = 0;
    this._expandedLength = 0;
  }

  // -------------------------------------------------------------------------
  // Persistent external entity registration (survives across documents)
  // -------------------------------------------------------------------------

  /**
   * Replace the full set of persistent external entities.
   * These are never wiped between documents.
   *
   * @param {Record<string, string | { regex: RegExp, val: string | Function }>} map
   */
  setExternalEntities(map) {
    this._persistentEntries = buildEntries(map);
  }

  /**
   * Add a single persistent external entity without disturbing existing ones.
   *
   * The name is always validated. The entity is then registered only when
   * `value` is a string without '&' (which could expand recursively); any
   * other value is ignored without an error.
   *
   * @param {string} key    — bare entity name, e.g. `'copy'`
   * @param {string} value  — replacement string, e.g. `'©'`
   * @throws {Error} when `key` fails entity-name validation
   */
  addExternalEntity(key, value) {
    validateEntityName(key);
    if (typeof value !== 'string' || value.includes('&')) return;
    this._persistentEntries.push([
      key,
      { regex: new RegExp('&' + escapeForRegex(key) + ';', 'g'), val: value },
    ]);
  }

  // -------------------------------------------------------------------------
  // Input / runtime entity registration (per document)
  // -------------------------------------------------------------------------

  /**
   * Inject DOCTYPE (input/runtime) entities for the current document.
   * These are stored separately from persistent entities and wiped on the
   * next `getInstance()` call so they never leak into subsequent documents.
   * Any previously injected input entities are replaced, not merged.
   *
   * Also resets per-document expansion counters.
   *
   * @param {Record<string, string | { regx?: RegExp, regex?: RegExp, val: string | Function }>} map
   */
  addInputEntities(map) {
    this._totalExpansions = 0;
    this._expandedLength = 0;
    this._inputEntries = buildEntries(map);
  }

  // -------------------------------------------------------------------------
  // getInstance — builder factory integration point
  // -------------------------------------------------------------------------

  /**
   * Reset all per-document state (input entities + expansion counters) and
   * return `this`. No new object is created.
   *
   * The builder factory calls this each time it creates a new builder instance
   * so DOCTYPE entities from a previous document are never carried over.
   *
   * @returns {EntityReplacer} `this`, after reset
   */
  getInstance() {
    this._inputEntries = [];
    this._totalExpansions = 0;
    this._expandedLength = 0;
    return this;
  }

  // -------------------------------------------------------------------------
  // Primary API
  // -------------------------------------------------------------------------

  /**
   * Replace all entity references in `str`.
   *
   * Processing order:
   *   1. persistent external
   *   2. input / runtime  (DOCTYPE)
   *   3. system
   *   4. default          (lt/gt/apos/quot)
   *   5. amp
   *   6. postCheck hook   (only when the string actually changed)
   *
   * Non-string and empty inputs are returned untouched.
   *
   * @param {string} str
   * @returns {string}
   * @throws {Error} when a configured expansion-count or length limit is exceeded
   */
  replace(str) {
    if (typeof str !== 'string' || str.length === 0) return str;
    if (!str.includes('&')) return str; // fast path

    const original = str;

    // Passes 1-4, in fixed order. DOCTYPE entities share the 'external' limit flag.
    const passes = [
      [this._persistentEntries, this._limitExternal],
      [this._inputEntries, this._limitExternal],
      [this._systemEntries, this._limitSystem],
      [this._defaultEntries, this._limitDefault],
    ];
    for (const [entries, track] of passes) {
      if (entries.length > 0 && str.includes('&')) {
        str = this._applyEntries(str, entries, track);
      }
    }

    // 5. &amp; — always last, so the '&' it produces is never re-scanned
    if (this._ampEnabled && str.includes('&')) {
      str = str.replace(AMP_ENTITY.regex, AMP_ENTITY.val);
    }

    // 6. postCheck
    if (this._postCheck !== null && str !== original) {
      str = this._postCheck(str, original);
    }

    return str;
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /**
   * Apply every entry of one category to `str`, in order, optionally counting
   * expansions and net growth against the configured limits.
   *
   * The replacement always goes through a function, so a string `val` is
   * inserted verbatim whether or not limits are active.
   *
   * @param {string} str
   * @param {Array<[string, {regex: RegExp, val: string | Function}]>} entries
   * @param {boolean} track  whether this category is subject to the limits
   * @returns {string}
   * @throws {Error} when the running expansion count or the running net
   *   growth exceeds its limit
   */
  _applyEntries(str, entries, track) {
    const limitExpansions = track && this._maxTotalExpansions > 0;
    const limitLength = track && this._maxExpandedLength > 0;

    for (const [, entity] of entries) {
      if (!str.includes('&')) break;

      const { regex, val } = entity;
      const before = str.length;
      let count = 0;
      str = str.replace(regex, (...args) => {
        count++;
        return typeof val === 'function' ? val(...args) : val;
      });

      if (limitExpansions && count > 0) {
        this._totalExpansions += count;
        if (this._totalExpansions > this._maxTotalExpansions) {
          throw new Error(
            `[EntityReplacer] Entity expansion count limit exceeded: ` +
            `${this._totalExpansions} > ${this._maxTotalExpansions}`
          );
        }
      }

      if (limitLength) {
        // Only net growth counts; entities that shrink the text are free.
        const delta = str.length - before;
        if (delta > 0) {
          this._expandedLength += delta;
          if (this._expandedLength > this._maxExpandedLength) {
            throw new Error(
              `[EntityReplacer] Expanded content length limit exceeded: ` +
              `${this._expandedLength} > ${this._maxExpandedLength}`
            );
          }
        }
      }
    }
    return str;
  }
}
377
+
378
+ // Re-export the built-in tables for advanced users who want to extend them
379
+ export { DEFAULT_XML_ENTITIES, AMP_ENTITY };
package/src/groups.js ADDED
@@ -0,0 +1,99 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Named entity groups — importable separately and freely composable.
3
+ // All groups are plain objects; no magic, no classes.
4
+ // ---------------------------------------------------------------------------
5
+
6
/**
 * Twenty frequently used HTML named entities. Each regex matches the name
 * plus the decimal and hexadecimal character references for the same
 * character (leading zeros allowed, hex digits in either case).
 *
 * @type {Record<string, { regex: RegExp, val: string | ((m: string, s: string) => string) }>}
 */
export const COMMON_HTML = Object.fromEntries(
  [
    ['nbsp', /&(nbsp|#0*160|#x0*[Aa]0);/g, '\u00a0'],
    ['copy', /&(copy|#0*169|#x0*[Aa]9);/g, '\u00a9'],
    ['reg', /&(reg|#0*174|#x0*[Aa][Ee]);/g, '\u00ae'],
    ['trade', /&(trade|#0*8482|#x0*2122);/g, '\u2122'],
    ['mdash', /&(mdash|#0*8212|#x0*2014);/g, '\u2014'],
    ['ndash', /&(ndash|#0*8211|#x0*2013);/g, '\u2013'],
    ['hellip', /&(hellip|#0*8230|#x0*2026);/g, '\u2026'],
    ['laquo', /&(laquo|#0*171|#x0*[Aa][Bb]);/g, '\u00ab'],
    ['raquo', /&(raquo|#0*187|#x0*[Bb][Bb]);/g, '\u00bb'],
    ['lsquo', /&(lsquo|#0*8216|#x0*2018);/g, '\u2018'],
    ['rsquo', /&(rsquo|#0*8217|#x0*2019);/g, '\u2019'],
    ['ldquo', /&(ldquo|#0*8220|#x0*201[Cc]);/g, '\u201c'],
    ['rdquo', /&(rdquo|#0*8221|#x0*201[Dd]);/g, '\u201d'],
    ['bull', /&(bull|#0*8226|#x0*2022);/g, '\u2022'],
    ['para', /&(para|#0*182|#x0*[Bb]6);/g, '\u00b6'],
    ['sect', /&(sect|#0*167|#x0*[Aa]7);/g, '\u00a7'],
    ['deg', /&(deg|#0*176|#x0*[Bb]0);/g, '\u00b0'],
    ['frac12', /&(frac12|#0*189|#x0*[Bb][Dd]);/g, '\u00bd'],
    ['frac14', /&(frac14|#0*188|#x0*[Bb][Cc]);/g, '\u00bc'],
    ['frac34', /&(frac34|#0*190|#x0*[Bb][Ee]);/g, '\u00be'],
  ].map(([name, regex, val]) => [name, { regex, val }]),
);
32
+
33
/**
 * Currency symbol entities. Each regex matches the name plus the decimal and
 * hexadecimal character references for the same character.
 */
export const CURRENCY_ENTITIES = Object.fromEntries(
  [
    ['cent', /&(cent|#0*162|#x0*[Aa]2);/g, '\u00a2'],
    ['pound', /&(pound|#0*163|#x0*[Aa]3);/g, '\u00a3'],
    ['yen', /&(yen|#0*165|#x0*[Aa]5);/g, '\u00a5'],
    ['euro', /&(euro|#0*8364|#x0*20[Aa][Cc]);/g, '\u20ac'],
    ['inr', /&(inr|#0*8377|#x0*20[Bb]9);/g, '\u20b9'],
    ['curren', /&(curren|#0*164|#x0*[Aa]4);/g, '\u00a4'],
    ['fnof', /&(fnof|#0*402|#x0*192);/g, '\u0192'],
  ].map(([name, regex, val]) => [name, { regex, val }]),
);
45
+
46
/**
 * Mathematical operator entities. Each regex matches the name plus the
 * decimal and hexadecimal character references for the same character.
 */
export const MATH_ENTITIES = Object.fromEntries(
  [
    ['times', /&(times|#0*215|#x0*[Dd]7);/g, '\u00d7'],
    ['divide', /&(divide|#0*247|#x0*[Ff]7);/g, '\u00f7'],
    ['plusmn', /&(plusmn|#0*177|#x0*[Bb]1);/g, '\u00b1'],
    ['minus', /&(minus|#0*8722|#x0*2212);/g, '\u2212'],
    ['sup2', /&(sup2|#0*178|#x0*[Bb]2);/g, '\u00b2'],
    ['sup3', /&(sup3|#0*179|#x0*[Bb]3);/g, '\u00b3'],
    ['sup1', /&(sup1|#0*185|#x0*[Bb]9);/g, '\u00b9'],
    ['frac12', /&(frac12|#0*189|#x0*[Bb][Dd]);/g, '\u00bd'],
    ['frac14', /&(frac14|#0*188|#x0*[Bb][Cc]);/g, '\u00bc'],
    ['frac34', /&(frac34|#0*190|#x0*[Bb][Ee]);/g, '\u00be'],
    ['permil', /&(permil|#0*8240|#x0*2030);/g, '\u2030'],
    ['infin', /&(infin|#0*8734|#x0*221[Ee]);/g, '\u221e'],
    ['sum', /&(sum|#0*8721|#x0*2211);/g, '\u2211'],
    ['prod', /&(prod|#0*8719|#x0*220[Ff]);/g, '\u220f'],
    ['radic', /&(radic|#0*8730|#x0*221[Aa]);/g, '\u221a'],
    ['ne', /&(ne|#0*8800|#x0*2260);/g, '\u2260'],
    ['le', /&(le|#0*8804|#x0*2264);/g, '\u2264'],
    ['ge', /&(ge|#0*8805|#x0*2265);/g, '\u2265'],
  ].map(([name, regex, val]) => [name, { regex, val }]),
);
69
+
70
/**
 * Arrow entities: the five single arrows (`larr` … `harr`) followed by the
 * five double arrows (`lArr` … `hArr`). Names are case-sensitive. Each regex
 * also matches the decimal and hexadecimal character references for the
 * same character.
 */
export const ARROW_ENTITIES = Object.fromEntries(
  [
    ['larr', /&(larr|#0*8592|#x0*2190);/g, '\u2190'],
    ['uarr', /&(uarr|#0*8593|#x0*2191);/g, '\u2191'],
    ['rarr', /&(rarr|#0*8594|#x0*2192);/g, '\u2192'],
    ['darr', /&(darr|#0*8595|#x0*2193);/g, '\u2193'],
    ['harr', /&(harr|#0*8596|#x0*2194);/g, '\u2194'],
    ['lArr', /&(lArr|#0*8656|#x0*21[Dd]0);/g, '\u21d0'],
    ['uArr', /&(uArr|#0*8657|#x0*21[Dd]1);/g, '\u21d1'],
    ['rArr', /&(rArr|#0*8658|#x0*21[Dd]2);/g, '\u21d2'],
    ['dArr', /&(dArr|#0*8659|#x0*21[Dd]3);/g, '\u21d3'],
    ['hArr', /&(hArr|#0*8660|#x0*21[Dd]4);/g, '\u21d4'],
  ].map(([name, regex, val]) => [name, { regex, val }]),
);
85
+
86
// Highest valid Unicode code point; String.fromCodePoint throws a RangeError
// for anything larger.
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode the digits of one numeric character reference.
 * Returns the matched text unchanged when the value is not a code point that
 * String.fromCodePoint can represent, so malformed input cannot crash the
 * replacer.
 *
 * @param {string} match   the full reference, e.g. `'&#65;'`
 * @param {string} digits  the captured digits, without leading zeros
 * @param {number} radix   10 or 16
 * @returns {string}
 */
function decodeNumericReference(match, digits, radix) {
  const codePoint = Number.parseInt(digits, radix);
  if (Number.isNaN(codePoint) || codePoint > MAX_CODE_POINT) return match;
  return String.fromCodePoint(codePoint);
}

/**
 * Numeric character references — decimal &#NNN; and hex &#xHH;
 * These are function-replacers; they expand any code point up to U+10FFFF.
 * A reference whose value is larger (the patterns admit up to 9999999 and
 * 0xFFFFFF) is left in the text as-is.
 *
 * NOTE(review): `&#38;` / `&#x26;` decode to '&' here, so if this table is
 * applied before other entity passes the result may be expanded again —
 * confirm whether that is intended.
 */
export const NUMERIC_ENTITIES = {
  num_dec: {
    regex: /&#0*([0-9]{1,7});/g,
    val: (match, digits) => decodeNumericReference(match, digits, 10),
  },
  num_hex: {
    regex: /&#x0*([0-9a-fA-F]{1,6});/g,
    val: (match, digits) => decodeNumericReference(match, digits, 16),
  },
};