entities 6.0.0 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/decode.d.ts +3 -0
- package/dist/commonjs/decode-codepoint.d.ts.map +1 -1
- package/dist/commonjs/decode-codepoint.js +2 -2
- package/dist/commonjs/decode-codepoint.js.map +1 -1
- package/dist/commonjs/decode.d.ts +1 -7
- package/dist/commonjs/decode.d.ts.map +1 -1
- package/dist/commonjs/decode.js +105 -48
- package/dist/commonjs/decode.js.map +1 -1
- package/dist/commonjs/encode.d.ts.map +1 -1
- package/dist/commonjs/encode.js +49 -30
- package/dist/commonjs/encode.js.map +1 -1
- package/dist/commonjs/escape.d.ts +7 -4
- package/dist/commonjs/escape.d.ts.map +1 -1
- package/dist/commonjs/escape.js +36 -19
- package/dist/commonjs/escape.js.map +1 -1
- package/dist/commonjs/generated/decode-data-html.d.ts.map +1 -1
- package/dist/commonjs/generated/decode-data-html.js +2 -5
- package/dist/commonjs/generated/decode-data-html.js.map +1 -1
- package/dist/commonjs/generated/decode-data-xml.d.ts.map +1 -1
- package/dist/commonjs/generated/decode-data-xml.js +2 -5
- package/dist/commonjs/generated/decode-data-xml.js.map +1 -1
- package/dist/commonjs/generated/encode-html.d.ts +1 -6
- package/dist/commonjs/generated/encode-html.d.ts.map +1 -1
- package/dist/commonjs/generated/encode-html.js +9 -8
- package/dist/commonjs/generated/encode-html.js.map +1 -1
- package/dist/commonjs/index.d.ts +3 -3
- package/dist/commonjs/index.d.ts.map +1 -1
- package/dist/commonjs/index.js +19 -19
- package/dist/commonjs/index.js.map +1 -1
- package/dist/commonjs/internal/bin-trie-flags.d.ts +17 -0
- package/dist/commonjs/internal/bin-trie-flags.d.ts.map +1 -0
- package/dist/commonjs/internal/bin-trie-flags.js +21 -0
- package/dist/commonjs/internal/bin-trie-flags.js.map +1 -0
- package/dist/commonjs/internal/decode-shared.d.ts +2 -0
- package/dist/commonjs/internal/decode-shared.d.ts.map +1 -0
- package/dist/commonjs/internal/decode-shared.js +31 -0
- package/dist/commonjs/internal/decode-shared.js.map +1 -0
- package/dist/commonjs/internal/encode-shared.d.ts +32 -0
- package/dist/commonjs/internal/encode-shared.d.ts.map +1 -0
- package/dist/commonjs/internal/encode-shared.js +94 -0
- package/dist/commonjs/internal/encode-shared.js.map +1 -0
- package/dist/esm/decode-codepoint.d.ts.map +1 -1
- package/dist/esm/decode-codepoint.js +2 -2
- package/dist/esm/decode-codepoint.js.map +1 -1
- package/dist/esm/decode.d.ts +1 -7
- package/dist/esm/decode.d.ts.map +1 -1
- package/dist/esm/decode.js +96 -39
- package/dist/esm/decode.js.map +1 -1
- package/dist/esm/encode.d.ts.map +1 -1
- package/dist/esm/encode.js +49 -30
- package/dist/esm/encode.js.map +1 -1
- package/dist/esm/escape.d.ts +7 -4
- package/dist/esm/escape.d.ts.map +1 -1
- package/dist/esm/escape.js +35 -18
- package/dist/esm/escape.js.map +1 -1
- package/dist/esm/generated/decode-data-html.d.ts.map +1 -1
- package/dist/esm/generated/decode-data-html.js +2 -5
- package/dist/esm/generated/decode-data-html.js.map +1 -1
- package/dist/esm/generated/decode-data-xml.d.ts.map +1 -1
- package/dist/esm/generated/decode-data-xml.js +2 -5
- package/dist/esm/generated/decode-data-xml.js.map +1 -1
- package/dist/esm/generated/encode-html.d.ts +1 -6
- package/dist/esm/generated/encode-html.d.ts.map +1 -1
- package/dist/esm/generated/encode-html.js +9 -8
- package/dist/esm/generated/encode-html.js.map +1 -1
- package/dist/esm/index.d.ts +3 -3
- package/dist/esm/index.d.ts.map +1 -1
- package/dist/esm/index.js +9 -9
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/internal/bin-trie-flags.d.ts +17 -0
- package/dist/esm/internal/bin-trie-flags.d.ts.map +1 -0
- package/dist/esm/internal/bin-trie-flags.js +18 -0
- package/dist/esm/internal/bin-trie-flags.js.map +1 -0
- package/dist/esm/internal/decode-shared.d.ts +2 -0
- package/dist/esm/internal/decode-shared.d.ts.map +1 -0
- package/dist/esm/internal/decode-shared.js +28 -0
- package/dist/esm/internal/decode-shared.js.map +1 -0
- package/dist/esm/internal/encode-shared.d.ts +32 -0
- package/dist/esm/internal/encode-shared.d.ts.map +1 -0
- package/dist/esm/internal/encode-shared.js +91 -0
- package/dist/esm/internal/encode-shared.js.map +1 -0
- package/escape.d.ts +3 -0
- package/package.json +19 -22
- package/src/decode-codepoint.ts +2 -2
- package/src/decode.spec.ts +44 -1
- package/src/decode.ts +111 -55
- package/src/encode.spec.ts +1 -1
- package/src/encode.ts +47 -31
- package/src/escape.spec.ts +1 -1
- package/src/escape.ts +39 -26
- package/src/generated/decode-data-html.ts +3 -5
- package/src/generated/decode-data-xml.ts +3 -5
- package/src/generated/encode-html.ts +14 -14
- package/src/index.spec.ts +2 -2
- package/src/index.ts +23 -24
- package/src/internal/bin-trie-flags.ts +16 -0
- package/src/internal/decode-shared.ts +30 -0
- package/src/internal/encode-shared.ts +121 -0
package/src/decode.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { fromCodePoint, replaceCodePoint } from "./decode-codepoint.js";
|
|
1
2
|
import { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
2
3
|
import { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
3
|
-
import {
|
|
4
|
+
import { BinTrieFlags } from "./internal/bin-trie-flags.js";
|
|
4
5
|
|
|
5
6
|
const enum CharCodes {
|
|
6
7
|
NUM = 35, // "#"
|
|
@@ -20,12 +21,6 @@ const enum CharCodes {
|
|
|
20
21
|
/** Bit that needs to be set to convert an upper case ASCII character to lower case */
|
|
21
22
|
const TO_LOWER_BIT = 0b10_0000;
|
|
22
23
|
|
|
23
|
-
export enum BinTrieFlags {
|
|
24
|
-
VALUE_LENGTH = 0b1100_0000_0000_0000,
|
|
25
|
-
BRANCH_LENGTH = 0b0011_1111_1000_0000,
|
|
26
|
-
JUMP_TABLE = 0b0000_0000_0111_1111,
|
|
27
|
-
}
|
|
28
|
-
|
|
29
24
|
function isNumber(code: number): boolean {
|
|
30
25
|
return code >= CharCodes.ZERO && code <= CharCodes.NINE;
|
|
31
26
|
}
|
|
@@ -89,6 +84,7 @@ export interface EntityErrorProducer {
|
|
|
89
84
|
export class EntityDecoder {
|
|
90
85
|
constructor(
|
|
91
86
|
/** The tree used to decode entities. */
|
|
87
|
+
// biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
|
|
92
88
|
private readonly decodeTree: Uint16Array,
|
|
93
89
|
/**
|
|
94
90
|
* The function that is called when a codepoint is decoded.
|
|
@@ -198,21 +194,6 @@ export class EntityDecoder {
|
|
|
198
194
|
return this.stateNumericDecimal(input, offset);
|
|
199
195
|
}
|
|
200
196
|
|
|
201
|
-
private addToNumericResult(
|
|
202
|
-
input: string,
|
|
203
|
-
start: number,
|
|
204
|
-
end: number,
|
|
205
|
-
base: number,
|
|
206
|
-
): void {
|
|
207
|
-
if (start !== end) {
|
|
208
|
-
const digitCount = end - start;
|
|
209
|
-
this.result =
|
|
210
|
-
this.result * Math.pow(base, digitCount) +
|
|
211
|
-
Number.parseInt(input.substr(start, digitCount), base);
|
|
212
|
-
this.consumed += digitCount;
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
|
|
216
197
|
/**
|
|
217
198
|
* Parses a hexadecimal numeric entity.
|
|
218
199
|
*
|
|
@@ -223,21 +204,22 @@ export class EntityDecoder {
|
|
|
223
204
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
224
205
|
*/
|
|
225
206
|
private stateNumericHex(input: string, offset: number): number {
|
|
226
|
-
const startIndex = offset;
|
|
227
|
-
|
|
228
207
|
while (offset < input.length) {
|
|
229
208
|
const char = input.charCodeAt(offset);
|
|
230
209
|
if (isNumber(char) || isHexadecimalCharacter(char)) {
|
|
231
|
-
|
|
210
|
+
// Convert hex digit to value (0-15); 'a'/'A' -> 10.
|
|
211
|
+
const digit =
|
|
212
|
+
char <= CharCodes.NINE
|
|
213
|
+
? char - CharCodes.ZERO
|
|
214
|
+
: (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
|
|
215
|
+
this.result = this.result * 16 + digit;
|
|
216
|
+
this.consumed++;
|
|
217
|
+
offset++;
|
|
232
218
|
} else {
|
|
233
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
234
219
|
return this.emitNumericEntity(char, 3);
|
|
235
220
|
}
|
|
236
221
|
}
|
|
237
|
-
|
|
238
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
239
|
-
|
|
240
|
-
return -1;
|
|
222
|
+
return -1; // Incomplete entity
|
|
241
223
|
}
|
|
242
224
|
|
|
243
225
|
/**
|
|
@@ -250,21 +232,17 @@ export class EntityDecoder {
|
|
|
250
232
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
251
233
|
*/
|
|
252
234
|
private stateNumericDecimal(input: string, offset: number): number {
|
|
253
|
-
const startIndex = offset;
|
|
254
|
-
|
|
255
235
|
while (offset < input.length) {
|
|
256
236
|
const char = input.charCodeAt(offset);
|
|
257
237
|
if (isNumber(char)) {
|
|
258
|
-
|
|
238
|
+
this.result = this.result * 10 + (char - CharCodes.ZERO);
|
|
239
|
+
this.consumed++;
|
|
240
|
+
offset++;
|
|
259
241
|
} else {
|
|
260
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
261
242
|
return this.emitNumericEntity(char, 2);
|
|
262
243
|
}
|
|
263
244
|
}
|
|
264
|
-
|
|
265
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
266
|
-
|
|
267
|
-
return -1;
|
|
245
|
+
return -1; // Incomplete entity
|
|
268
246
|
}
|
|
269
247
|
|
|
270
248
|
/**
|
|
@@ -321,12 +299,78 @@ export class EntityDecoder {
|
|
|
321
299
|
private stateNamedEntity(input: string, offset: number): number {
|
|
322
300
|
const { decodeTree } = this;
|
|
323
301
|
let current = decodeTree[this.treeIndex];
|
|
324
|
-
// The
|
|
302
|
+
// The length is the number of bytes of the value, including the current byte.
|
|
325
303
|
let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
326
304
|
|
|
327
|
-
|
|
305
|
+
while (offset < input.length) {
|
|
306
|
+
// Handle compact runs (possibly inline): valueLength == 0 and SEMI_REQUIRED bit set.
|
|
307
|
+
if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
|
|
308
|
+
const runLength =
|
|
309
|
+
(current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
|
|
310
|
+
const firstChar = current & BinTrieFlags.JUMP_TABLE;
|
|
311
|
+
// Fast-fail if we don't have enough remaining input for the full run (incomplete entity)
|
|
312
|
+
if (offset + runLength > input.length) return -1;
|
|
313
|
+
// Verify first char
|
|
314
|
+
if (input.charCodeAt(offset) !== firstChar) {
|
|
315
|
+
return this.result === 0
|
|
316
|
+
? 0
|
|
317
|
+
: this.emitNotTerminatedNamedEntity();
|
|
318
|
+
}
|
|
319
|
+
offset++;
|
|
320
|
+
this.excess++;
|
|
321
|
+
// Remaining characters after the first
|
|
322
|
+
const remaining = runLength - 1;
|
|
323
|
+
// Iterate over packed 2-char words
|
|
324
|
+
for (let runPos = 1; runPos < runLength; runPos += 2) {
|
|
325
|
+
const packedWord =
|
|
326
|
+
decodeTree[this.treeIndex + 1 + ((runPos - 1) >> 1)];
|
|
327
|
+
const low = packedWord & 0xff;
|
|
328
|
+
if (input.charCodeAt(offset) !== low) {
|
|
329
|
+
return this.result === 0
|
|
330
|
+
? 0
|
|
331
|
+
: this.emitNotTerminatedNamedEntity();
|
|
332
|
+
}
|
|
333
|
+
offset++;
|
|
334
|
+
this.excess++;
|
|
335
|
+
const high = (packedWord >> 8) & 0xff;
|
|
336
|
+
if (runPos + 1 < runLength) {
|
|
337
|
+
if (input.charCodeAt(offset) !== high) {
|
|
338
|
+
return this.result === 0
|
|
339
|
+
? 0
|
|
340
|
+
: this.emitNotTerminatedNamedEntity();
|
|
341
|
+
}
|
|
342
|
+
offset++;
|
|
343
|
+
this.excess++;
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
this.treeIndex += 1 + ((remaining + 1) >> 1);
|
|
347
|
+
current = decodeTree[this.treeIndex];
|
|
348
|
+
valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
if (offset >= input.length) break;
|
|
352
|
+
|
|
328
353
|
const char = input.charCodeAt(offset);
|
|
329
354
|
|
|
355
|
+
/*
|
|
356
|
+
* Implicit semicolon handling for nodes that require a semicolon but
|
|
357
|
+
* don't have an explicit ';' branch stored in the trie. If we have
|
|
358
|
+
* a value on the current node, it requires a semicolon, and the
|
|
359
|
+
* current input character is a semicolon, emit the entity using the
|
|
360
|
+
* current node (without descending further).
|
|
361
|
+
*/
|
|
362
|
+
if (
|
|
363
|
+
char === CharCodes.SEMI &&
|
|
364
|
+
valueLength !== 0 &&
|
|
365
|
+
(current & BinTrieFlags.FLAG13) !== 0
|
|
366
|
+
) {
|
|
367
|
+
return this.emitNamedEntityData(
|
|
368
|
+
this.treeIndex,
|
|
369
|
+
valueLength,
|
|
370
|
+
this.consumed + this.excess,
|
|
371
|
+
);
|
|
372
|
+
}
|
|
373
|
+
|
|
330
374
|
this.treeIndex = determineBranch(
|
|
331
375
|
decodeTree,
|
|
332
376
|
current,
|
|
@@ -361,12 +405,18 @@ export class EntityDecoder {
|
|
|
361
405
|
}
|
|
362
406
|
|
|
363
407
|
// If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
|
|
364
|
-
if (
|
|
408
|
+
if (
|
|
409
|
+
this.decodeMode !== DecodingMode.Strict &&
|
|
410
|
+
(current & BinTrieFlags.FLAG13) === 0
|
|
411
|
+
) {
|
|
365
412
|
this.result = this.treeIndex;
|
|
366
413
|
this.consumed += this.excess;
|
|
367
414
|
this.excess = 0;
|
|
368
415
|
}
|
|
369
416
|
}
|
|
417
|
+
// Increment offset & excess for next iteration
|
|
418
|
+
offset++;
|
|
419
|
+
this.excess++;
|
|
370
420
|
}
|
|
371
421
|
|
|
372
422
|
return -1;
|
|
@@ -407,7 +457,8 @@ export class EntityDecoder {
|
|
|
407
457
|
|
|
408
458
|
this.emitCodePoint(
|
|
409
459
|
valueLength === 1
|
|
410
|
-
? decodeTree[result] &
|
|
460
|
+
? decodeTree[result] &
|
|
461
|
+
~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
|
|
411
462
|
: decodeTree[result + 1],
|
|
412
463
|
consumed,
|
|
413
464
|
);
|
|
@@ -540,22 +591,28 @@ export function determineBranch(
|
|
|
540
591
|
: decodeTree[nodeIndex + value] - 1;
|
|
541
592
|
}
|
|
542
593
|
|
|
543
|
-
// Case 3: Multiple branches encoded in dictionary
|
|
594
|
+
// Case 3: Multiple branches encoded in packed dictionary (two keys per uint16)
|
|
595
|
+
const packedKeySlots = (branchCount + 1) >> 1;
|
|
544
596
|
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
597
|
+
/*
|
|
598
|
+
* Treat packed keys as a virtual sorted array of length `branchCount`.
|
|
599
|
+
* Key(i) = low byte for even i, high byte for odd i in slot i>>1.
|
|
600
|
+
*/
|
|
601
|
+
let lo = 0;
|
|
602
|
+
let hi = branchCount - 1;
|
|
548
603
|
|
|
549
604
|
while (lo <= hi) {
|
|
550
605
|
const mid = (lo + hi) >>> 1;
|
|
551
|
-
const
|
|
606
|
+
const slot = mid >> 1;
|
|
607
|
+
const packed = decodeTree[nodeIndex + slot];
|
|
608
|
+
const midKey = (packed >> ((mid & 1) * 8)) & 0xff;
|
|
552
609
|
|
|
553
|
-
if (
|
|
610
|
+
if (midKey < char) {
|
|
554
611
|
lo = mid + 1;
|
|
555
|
-
} else if (
|
|
612
|
+
} else if (midKey > char) {
|
|
556
613
|
hi = mid - 1;
|
|
557
614
|
} else {
|
|
558
|
-
return decodeTree[
|
|
615
|
+
return decodeTree[nodeIndex + packedKeySlots + mid];
|
|
559
616
|
}
|
|
560
617
|
}
|
|
561
618
|
|
|
@@ -609,12 +666,11 @@ export function decodeXML(xmlString: string): string {
|
|
|
609
666
|
return xmlDecoder(xmlString, DecodingMode.Strict);
|
|
610
667
|
}
|
|
611
668
|
|
|
612
|
-
// Re-export for use by eg. htmlparser2
|
|
613
|
-
export { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
614
|
-
export { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
615
|
-
|
|
616
669
|
export {
|
|
617
670
|
decodeCodePoint,
|
|
618
|
-
replaceCodePoint,
|
|
619
671
|
fromCodePoint,
|
|
672
|
+
replaceCodePoint,
|
|
620
673
|
} from "./decode-codepoint.js";
|
|
674
|
+
// Re-export for use by eg. htmlparser2
|
|
675
|
+
export { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
676
|
+
export { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
package/src/encode.spec.ts
CHANGED
package/src/encode.ts
CHANGED
|
@@ -1,7 +1,17 @@
|
|
|
1
|
+
import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
|
|
1
2
|
import { htmlTrie } from "./generated/encode-html.js";
|
|
2
|
-
import { xmlReplacer, getCodePoint } from "./escape.js";
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
/**
|
|
5
|
+
* We store the characters to consider as a compact bitset for fast lookups.
|
|
6
|
+
*/
|
|
7
|
+
const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
|
|
8
|
+
0x16_00, // Bits for 09,0A,0C
|
|
9
|
+
0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
|
|
10
|
+
0xf8_00_00_01, // 64..95 -> 40, 5B-5F
|
|
11
|
+
0x38_00_00_01, // 96..127-> 60, 7B-7D
|
|
12
|
+
]);
|
|
13
|
+
|
|
14
|
+
const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
|
|
5
15
|
|
|
6
16
|
/**
|
|
7
17
|
* Encodes all characters in the input using HTML entities. This includes
|
|
@@ -15,7 +25,7 @@ const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
|
|
|
15
25
|
* (eg. `&#xfc;`) will be used.
|
|
16
26
|
*/
|
|
17
27
|
export function encodeHTML(input: string): string {
|
|
18
|
-
return encodeHTMLTrieRe(
|
|
28
|
+
return encodeHTMLTrieRe(HTML_BITSET, input);
|
|
19
29
|
}
|
|
20
30
|
/**
|
|
21
31
|
* Encodes all non-ASCII characters, as well as characters not valid in HTML
|
|
@@ -26,52 +36,58 @@ export function encodeHTML(input: string): string {
|
|
|
26
36
|
* (eg. `&#xfc;`) will be used.
|
|
27
37
|
*/
|
|
28
38
|
export function encodeNonAsciiHTML(input: string): string {
|
|
29
|
-
return encodeHTMLTrieRe(
|
|
39
|
+
return encodeHTMLTrieRe(XML_BITSET, input);
|
|
30
40
|
}
|
|
31
41
|
|
|
32
|
-
function encodeHTMLTrieRe(
|
|
33
|
-
let
|
|
34
|
-
let
|
|
35
|
-
|
|
42
|
+
function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
|
|
43
|
+
let out: string | undefined;
|
|
44
|
+
let last = 0; // Start of the next untouched slice.
|
|
45
|
+
const { length } = input;
|
|
36
46
|
|
|
37
|
-
|
|
38
|
-
const { index } = match;
|
|
39
|
-
returnValue += input.substring(lastIndex, index);
|
|
47
|
+
for (let index = 0; index < length; index++) {
|
|
40
48
|
const char = input.charCodeAt(index);
|
|
41
|
-
|
|
49
|
+
// Skip ASCII characters that don't need encoding
|
|
50
|
+
if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
|
|
51
|
+
continue;
|
|
52
|
+
}
|
|
42
53
|
|
|
43
|
-
if (
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
if (out === undefined) out = input.substring(0, index);
|
|
55
|
+
else if (last !== index) out += input.substring(last, index);
|
|
56
|
+
|
|
57
|
+
let node = htmlTrie.get(char);
|
|
58
|
+
|
|
59
|
+
if (typeof node === "object") {
|
|
60
|
+
if (index + 1 < length) {
|
|
46
61
|
const nextChar = input.charCodeAt(index + 1);
|
|
47
62
|
const value =
|
|
48
|
-
typeof next
|
|
49
|
-
? next
|
|
50
|
-
?
|
|
63
|
+
typeof node.next === "number"
|
|
64
|
+
? node.next === nextChar
|
|
65
|
+
? node.nextValue
|
|
51
66
|
: undefined
|
|
52
|
-
: next.
|
|
67
|
+
: node.next.get(nextChar);
|
|
53
68
|
|
|
54
69
|
if (value !== undefined) {
|
|
55
|
-
|
|
56
|
-
|
|
70
|
+
out += value;
|
|
71
|
+
index++;
|
|
72
|
+
last = index + 1;
|
|
57
73
|
continue;
|
|
58
74
|
}
|
|
59
75
|
}
|
|
60
|
-
|
|
61
|
-
next = next.v;
|
|
76
|
+
node = node.value;
|
|
62
77
|
}
|
|
63
78
|
|
|
64
|
-
|
|
65
|
-
if (next === undefined) {
|
|
79
|
+
if (node === undefined) {
|
|
66
80
|
const cp = getCodePoint(input, index);
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
81
|
+
out += `&#x${cp.toString(16)};`;
|
|
82
|
+
if (cp !== char) index++;
|
|
83
|
+
last = index + 1;
|
|
70
84
|
} else {
|
|
71
|
-
|
|
72
|
-
|
|
85
|
+
out += node;
|
|
86
|
+
last = index + 1;
|
|
73
87
|
}
|
|
74
88
|
}
|
|
75
89
|
|
|
76
|
-
|
|
90
|
+
if (out === undefined) return input;
|
|
91
|
+
if (last < length) out += input.substr(last);
|
|
92
|
+
return out;
|
|
77
93
|
}
|
package/src/escape.spec.ts
CHANGED
package/src/escape.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
export const xmlReplacer: RegExp = /["$&'<>\u0080-\uFFFF]/g;
|
|
2
|
-
|
|
3
1
|
const xmlCodeMap = new Map([
|
|
4
2
|
[34, "&quot;"],
|
|
5
3
|
[38, "&amp;"],
|
|
@@ -22,39 +20,54 @@ export const getCodePoint: (c: string, index: number) => number =
|
|
|
22
20
|
: // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
|
|
23
21
|
(input: string, index: number): number => input.codePointAt(index)!;
|
|
24
22
|
|
|
23
|
+
/**
|
|
24
|
+
* Bitset for ASCII characters that need to be escaped in XML.
|
|
25
|
+
*/
|
|
26
|
+
export const XML_BITSET_VALUE = 0x50_00_00_c4; // 32..63 -> 34 ("),38 (&),39 ('),60 (<),62 (>)
|
|
27
|
+
|
|
25
28
|
/**
|
|
26
29
|
* Encodes all non-ASCII characters, as well as characters not valid in XML
|
|
27
|
-
* documents using XML entities.
|
|
30
|
+
* documents using XML entities. Uses a fast bitset scan instead of RegExp.
|
|
28
31
|
*
|
|
29
|
-
* If a character has no equivalent entity, a
|
|
30
|
-
*
|
|
32
|
+
* If a character has no equivalent entity, a numeric hexadecimal reference
|
|
33
|
+
* (eg. `&#xfc;`) will be used.
|
|
31
34
|
*/
|
|
32
35
|
export function encodeXML(input: string): string {
|
|
33
|
-
let
|
|
34
|
-
let
|
|
35
|
-
|
|
36
|
+
let out: string | undefined;
|
|
37
|
+
let last = 0;
|
|
38
|
+
const { length } = input;
|
|
36
39
|
|
|
37
|
-
|
|
38
|
-
const { index } = match;
|
|
40
|
+
for (let index = 0; index < length; index++) {
|
|
39
41
|
const char = input.charCodeAt(index);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
if (
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
// Increase by 1 if we have a surrogate pair
|
|
48
|
-
lastIndex = xmlReplacer.lastIndex += Number(
|
|
49
|
-
(char & 0xfc_00) === 0xd8_00,
|
|
50
|
-
);
|
|
51
|
-
} else {
|
|
52
|
-
returnValue += input.substring(lastIndex, index) + next;
|
|
53
|
-
lastIndex = index + 1;
|
|
42
|
+
|
|
43
|
+
// Check for ASCII chars that don't need escaping
|
|
44
|
+
if (
|
|
45
|
+
char < 0x80 &&
|
|
46
|
+
(((XML_BITSET_VALUE >>> char) & 1) === 0 || char >= 64 || char < 32)
|
|
47
|
+
) {
|
|
48
|
+
continue;
|
|
54
49
|
}
|
|
50
|
+
|
|
51
|
+
if (out === undefined) out = input.substring(0, index);
|
|
52
|
+
else if (last !== index) out += input.substring(last, index);
|
|
53
|
+
|
|
54
|
+
if (char < 64) {
|
|
55
|
+
// Known replacement
|
|
56
|
+
out += xmlCodeMap.get(char)!;
|
|
57
|
+
last = index + 1;
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// Non-ASCII: encode as numeric entity (handle surrogate pair)
|
|
62
|
+
const cp = getCodePoint(input, index);
|
|
63
|
+
out += `&#x${cp.toString(16)};`;
|
|
64
|
+
if (cp !== char) index++; // Skip trailing surrogate
|
|
65
|
+
last = index + 1;
|
|
55
66
|
}
|
|
56
67
|
|
|
57
|
-
|
|
68
|
+
if (out === undefined) return input;
|
|
69
|
+
if (last < length) out += input.substr(last);
|
|
70
|
+
return out;
|
|
58
71
|
}
|
|
59
72
|
|
|
60
73
|
/**
|
|
@@ -83,7 +96,7 @@ function getEscaper(
|
|
|
83
96
|
map: Map<number, string>,
|
|
84
97
|
): (data: string) => string {
|
|
85
98
|
return function escape(data: string): string {
|
|
86
|
-
let match;
|
|
99
|
+
let match: RegExpExecArray | null;
|
|
87
100
|
let lastIndex = 0;
|
|
88
101
|
let result = "";
|
|
89
102
|
|