entities 6.0.1 → 7.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/decode.d.ts +2 -0
- package/dist/commonjs/decode-codepoint.d.ts.map +1 -1
- package/dist/commonjs/decode-codepoint.js +2 -2
- package/dist/commonjs/decode-codepoint.js.map +1 -1
- package/dist/commonjs/decode.d.ts +3 -7
- package/dist/commonjs/decode.d.ts.map +1 -1
- package/dist/commonjs/decode.js +105 -48
- package/dist/commonjs/decode.js.map +1 -1
- package/dist/commonjs/encode.d.ts.map +1 -1
- package/dist/commonjs/encode.js +49 -30
- package/dist/commonjs/encode.js.map +1 -1
- package/dist/commonjs/escape.d.ts +7 -4
- package/dist/commonjs/escape.d.ts.map +1 -1
- package/dist/commonjs/escape.js +36 -19
- package/dist/commonjs/escape.js.map +1 -1
- package/dist/commonjs/generated/decode-data-html.d.ts.map +1 -1
- package/dist/commonjs/generated/decode-data-html.js +2 -5
- package/dist/commonjs/generated/decode-data-html.js.map +1 -1
- package/dist/commonjs/generated/decode-data-xml.d.ts.map +1 -1
- package/dist/commonjs/generated/decode-data-xml.js +2 -5
- package/dist/commonjs/generated/decode-data-xml.js.map +1 -1
- package/dist/commonjs/generated/encode-html.d.ts +1 -6
- package/dist/commonjs/generated/encode-html.d.ts.map +1 -1
- package/dist/commonjs/generated/encode-html.js +9 -8
- package/dist/commonjs/generated/encode-html.js.map +1 -1
- package/dist/commonjs/index.d.ts +3 -3
- package/dist/commonjs/index.d.ts.map +1 -1
- package/dist/commonjs/index.js +19 -19
- package/dist/commonjs/index.js.map +1 -1
- package/dist/commonjs/internal/bin-trie-flags.d.ts +17 -0
- package/dist/commonjs/internal/bin-trie-flags.d.ts.map +1 -0
- package/dist/commonjs/internal/bin-trie-flags.js +21 -0
- package/dist/commonjs/internal/bin-trie-flags.js.map +1 -0
- package/dist/commonjs/internal/decode-shared.d.ts +2 -0
- package/dist/commonjs/internal/decode-shared.d.ts.map +1 -0
- package/dist/commonjs/internal/decode-shared.js +31 -0
- package/dist/commonjs/internal/decode-shared.js.map +1 -0
- package/dist/commonjs/internal/encode-shared.d.ts +32 -0
- package/dist/commonjs/internal/encode-shared.d.ts.map +1 -0
- package/dist/commonjs/internal/encode-shared.js +94 -0
- package/dist/commonjs/internal/encode-shared.js.map +1 -0
- package/dist/esm/decode-codepoint.d.ts.map +1 -1
- package/dist/esm/decode-codepoint.js +2 -2
- package/dist/esm/decode-codepoint.js.map +1 -1
- package/dist/esm/decode.d.ts +3 -7
- package/dist/esm/decode.d.ts.map +1 -1
- package/dist/esm/decode.js +96 -39
- package/dist/esm/decode.js.map +1 -1
- package/dist/esm/encode.d.ts.map +1 -1
- package/dist/esm/encode.js +49 -30
- package/dist/esm/encode.js.map +1 -1
- package/dist/esm/escape.d.ts +7 -4
- package/dist/esm/escape.d.ts.map +1 -1
- package/dist/esm/escape.js +35 -18
- package/dist/esm/escape.js.map +1 -1
- package/dist/esm/generated/decode-data-html.d.ts.map +1 -1
- package/dist/esm/generated/decode-data-html.js +2 -5
- package/dist/esm/generated/decode-data-html.js.map +1 -1
- package/dist/esm/generated/decode-data-xml.d.ts.map +1 -1
- package/dist/esm/generated/decode-data-xml.js +2 -5
- package/dist/esm/generated/decode-data-xml.js.map +1 -1
- package/dist/esm/generated/encode-html.d.ts +1 -6
- package/dist/esm/generated/encode-html.d.ts.map +1 -1
- package/dist/esm/generated/encode-html.js +9 -8
- package/dist/esm/generated/encode-html.js.map +1 -1
- package/dist/esm/index.d.ts +3 -3
- package/dist/esm/index.d.ts.map +1 -1
- package/dist/esm/index.js +9 -9
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/internal/bin-trie-flags.d.ts +17 -0
- package/dist/esm/internal/bin-trie-flags.d.ts.map +1 -0
- package/dist/esm/internal/bin-trie-flags.js +18 -0
- package/dist/esm/internal/bin-trie-flags.js.map +1 -0
- package/dist/esm/internal/decode-shared.d.ts +2 -0
- package/dist/esm/internal/decode-shared.d.ts.map +1 -0
- package/dist/esm/internal/decode-shared.js +28 -0
- package/dist/esm/internal/decode-shared.js.map +1 -0
- package/dist/esm/internal/encode-shared.d.ts +32 -0
- package/dist/esm/internal/encode-shared.d.ts.map +1 -0
- package/dist/esm/internal/encode-shared.js +91 -0
- package/dist/esm/internal/encode-shared.js.map +1 -0
- package/escape.d.ts +2 -0
- package/package.json +26 -24
- package/readme.md +32 -11
- package/src/decode-codepoint.ts +2 -2
- package/src/decode.ts +120 -55
- package/src/encode.ts +47 -31
- package/src/escape.ts +39 -26
- package/src/generated/decode-data-html.ts +3 -5
- package/src/generated/decode-data-xml.ts +3 -5
- package/src/generated/encode-html.ts +14 -14
- package/src/index.ts +23 -24
- package/src/internal/bin-trie-flags.ts +16 -0
- package/src/internal/decode-shared.ts +30 -0
- package/src/internal/encode-shared.ts +121 -0
- package/src/decode.spec.ts +0 -320
- package/src/encode.spec.ts +0 -78
- package/src/escape.spec.ts +0 -14
- package/src/index.spec.ts +0 -125
package/src/decode.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { fromCodePoint, replaceCodePoint } from "./decode-codepoint.js";
|
|
1
2
|
import { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
2
3
|
import { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
3
|
-
import {
|
|
4
|
+
import { BinTrieFlags } from "./internal/bin-trie-flags.js";
|
|
4
5
|
|
|
5
6
|
const enum CharCodes {
|
|
6
7
|
NUM = 35, // "#"
|
|
@@ -20,12 +21,6 @@ const enum CharCodes {
|
|
|
20
21
|
/** Bit that needs to be set to convert an upper case ASCII character to lower case */
|
|
21
22
|
const TO_LOWER_BIT = 0b10_0000;
|
|
22
23
|
|
|
23
|
-
export enum BinTrieFlags {
|
|
24
|
-
VALUE_LENGTH = 0b1100_0000_0000_0000,
|
|
25
|
-
BRANCH_LENGTH = 0b0011_1111_1000_0000,
|
|
26
|
-
JUMP_TABLE = 0b0000_0000_0111_1111,
|
|
27
|
-
}
|
|
28
|
-
|
|
29
24
|
function isNumber(code: number): boolean {
|
|
30
25
|
return code >= CharCodes.ZERO && code <= CharCodes.NINE;
|
|
31
26
|
}
|
|
@@ -89,6 +84,7 @@ export interface EntityErrorProducer {
|
|
|
89
84
|
export class EntityDecoder {
|
|
90
85
|
constructor(
|
|
91
86
|
/** The tree used to decode entities. */
|
|
87
|
+
// biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
|
|
92
88
|
private readonly decodeTree: Uint16Array,
|
|
93
89
|
/**
|
|
94
90
|
* The function that is called when a codepoint is decoded.
|
|
@@ -122,6 +118,8 @@ export class EntityDecoder {
|
|
|
122
118
|
private excess = 1;
|
|
123
119
|
/** The mode in which the decoder is operating. */
|
|
124
120
|
private decodeMode = DecodingMode.Strict;
|
|
121
|
+
/** The number of characters that have been consumed in the current run. */
|
|
122
|
+
private runConsumed = 0;
|
|
125
123
|
|
|
126
124
|
/** Resets the instance to make it reusable. */
|
|
127
125
|
startEntity(decodeMode: DecodingMode): void {
|
|
@@ -131,6 +129,7 @@ export class EntityDecoder {
|
|
|
131
129
|
this.treeIndex = 0;
|
|
132
130
|
this.excess = 1;
|
|
133
131
|
this.consumed = 1;
|
|
132
|
+
this.runConsumed = 0;
|
|
134
133
|
}
|
|
135
134
|
|
|
136
135
|
/**
|
|
@@ -198,21 +197,6 @@ export class EntityDecoder {
|
|
|
198
197
|
return this.stateNumericDecimal(input, offset);
|
|
199
198
|
}
|
|
200
199
|
|
|
201
|
-
private addToNumericResult(
|
|
202
|
-
input: string,
|
|
203
|
-
start: number,
|
|
204
|
-
end: number,
|
|
205
|
-
base: number,
|
|
206
|
-
): void {
|
|
207
|
-
if (start !== end) {
|
|
208
|
-
const digitCount = end - start;
|
|
209
|
-
this.result =
|
|
210
|
-
this.result * Math.pow(base, digitCount) +
|
|
211
|
-
Number.parseInt(input.substr(start, digitCount), base);
|
|
212
|
-
this.consumed += digitCount;
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
|
|
216
200
|
/**
|
|
217
201
|
* Parses a hexadecimal numeric entity.
|
|
218
202
|
*
|
|
@@ -223,21 +207,22 @@ export class EntityDecoder {
|
|
|
223
207
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
224
208
|
*/
|
|
225
209
|
private stateNumericHex(input: string, offset: number): number {
|
|
226
|
-
const startIndex = offset;
|
|
227
|
-
|
|
228
210
|
while (offset < input.length) {
|
|
229
211
|
const char = input.charCodeAt(offset);
|
|
230
212
|
if (isNumber(char) || isHexadecimalCharacter(char)) {
|
|
231
|
-
|
|
213
|
+
// Convert hex digit to value (0-15); 'a'/'A' -> 10.
|
|
214
|
+
const digit =
|
|
215
|
+
char <= CharCodes.NINE
|
|
216
|
+
? char - CharCodes.ZERO
|
|
217
|
+
: (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
|
|
218
|
+
this.result = this.result * 16 + digit;
|
|
219
|
+
this.consumed++;
|
|
220
|
+
offset++;
|
|
232
221
|
} else {
|
|
233
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
234
222
|
return this.emitNumericEntity(char, 3);
|
|
235
223
|
}
|
|
236
224
|
}
|
|
237
|
-
|
|
238
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
239
|
-
|
|
240
|
-
return -1;
|
|
225
|
+
return -1; // Incomplete entity
|
|
241
226
|
}
|
|
242
227
|
|
|
243
228
|
/**
|
|
@@ -250,21 +235,17 @@ export class EntityDecoder {
|
|
|
250
235
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
251
236
|
*/
|
|
252
237
|
private stateNumericDecimal(input: string, offset: number): number {
|
|
253
|
-
const startIndex = offset;
|
|
254
|
-
|
|
255
238
|
while (offset < input.length) {
|
|
256
239
|
const char = input.charCodeAt(offset);
|
|
257
240
|
if (isNumber(char)) {
|
|
258
|
-
|
|
241
|
+
this.result = this.result * 10 + (char - CharCodes.ZERO);
|
|
242
|
+
this.consumed++;
|
|
243
|
+
offset++;
|
|
259
244
|
} else {
|
|
260
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
261
245
|
return this.emitNumericEntity(char, 2);
|
|
262
246
|
}
|
|
263
247
|
}
|
|
264
|
-
|
|
265
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
266
|
-
|
|
267
|
-
return -1;
|
|
248
|
+
return -1; // Incomplete entity
|
|
268
249
|
}
|
|
269
250
|
|
|
270
251
|
/**
|
|
@@ -321,12 +302,84 @@ export class EntityDecoder {
|
|
|
321
302
|
private stateNamedEntity(input: string, offset: number): number {
|
|
322
303
|
const { decodeTree } = this;
|
|
323
304
|
let current = decodeTree[this.treeIndex];
|
|
324
|
-
// The
|
|
305
|
+
// The length is the number of bytes of the value, including the current byte.
|
|
325
306
|
let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
326
307
|
|
|
327
|
-
|
|
308
|
+
while (offset < input.length) {
|
|
309
|
+
// Handle compact runs (possibly inline): valueLength == 0 and SEMI_REQUIRED bit set.
|
|
310
|
+
if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
|
|
311
|
+
const runLength =
|
|
312
|
+
(current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
|
|
313
|
+
|
|
314
|
+
// If we are starting a run, check the first char.
|
|
315
|
+
if (this.runConsumed === 0) {
|
|
316
|
+
const firstChar = current & BinTrieFlags.JUMP_TABLE;
|
|
317
|
+
if (input.charCodeAt(offset) !== firstChar) {
|
|
318
|
+
return this.result === 0
|
|
319
|
+
? 0
|
|
320
|
+
: this.emitNotTerminatedNamedEntity();
|
|
321
|
+
}
|
|
322
|
+
offset++;
|
|
323
|
+
this.excess++;
|
|
324
|
+
this.runConsumed++;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
// Check remaining characters in the run.
|
|
328
|
+
while (this.runConsumed < runLength) {
|
|
329
|
+
if (offset >= input.length) {
|
|
330
|
+
return -1;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
const charIndexInPacked = this.runConsumed - 1;
|
|
334
|
+
const packedWord =
|
|
335
|
+
decodeTree[
|
|
336
|
+
this.treeIndex + 1 + (charIndexInPacked >> 1)
|
|
337
|
+
];
|
|
338
|
+
const expectedChar =
|
|
339
|
+
charIndexInPacked % 2 === 0
|
|
340
|
+
? packedWord & 0xff
|
|
341
|
+
: (packedWord >> 8) & 0xff;
|
|
342
|
+
|
|
343
|
+
if (input.charCodeAt(offset) !== expectedChar) {
|
|
344
|
+
this.runConsumed = 0;
|
|
345
|
+
return this.result === 0
|
|
346
|
+
? 0
|
|
347
|
+
: this.emitNotTerminatedNamedEntity();
|
|
348
|
+
}
|
|
349
|
+
offset++;
|
|
350
|
+
this.excess++;
|
|
351
|
+
this.runConsumed++;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
this.runConsumed = 0;
|
|
355
|
+
this.treeIndex += 1 + (runLength >> 1);
|
|
356
|
+
current = decodeTree[this.treeIndex];
|
|
357
|
+
valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if (offset >= input.length) break;
|
|
361
|
+
|
|
328
362
|
const char = input.charCodeAt(offset);
|
|
329
363
|
|
|
364
|
+
/*
|
|
365
|
+
* Implicit semicolon handling for nodes that require a semicolon but
|
|
366
|
+
* don't have an explicit ';' branch stored in the trie. If we have
|
|
367
|
+
* a value on the current node, it requires a semicolon, and the
|
|
368
|
+
* current input character is a semicolon, emit the entity using the
|
|
369
|
+
* current node (without descending further).
|
|
370
|
+
*/
|
|
371
|
+
if (
|
|
372
|
+
char === CharCodes.SEMI &&
|
|
373
|
+
valueLength !== 0 &&
|
|
374
|
+
(current & BinTrieFlags.FLAG13) !== 0
|
|
375
|
+
) {
|
|
376
|
+
return this.emitNamedEntityData(
|
|
377
|
+
this.treeIndex,
|
|
378
|
+
valueLength,
|
|
379
|
+
this.consumed + this.excess,
|
|
380
|
+
);
|
|
381
|
+
}
|
|
382
|
+
|
|
330
383
|
this.treeIndex = determineBranch(
|
|
331
384
|
decodeTree,
|
|
332
385
|
current,
|
|
@@ -361,12 +414,18 @@ export class EntityDecoder {
|
|
|
361
414
|
}
|
|
362
415
|
|
|
363
416
|
// If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
|
|
364
|
-
if (
|
|
417
|
+
if (
|
|
418
|
+
this.decodeMode !== DecodingMode.Strict &&
|
|
419
|
+
(current & BinTrieFlags.FLAG13) === 0
|
|
420
|
+
) {
|
|
365
421
|
this.result = this.treeIndex;
|
|
366
422
|
this.consumed += this.excess;
|
|
367
423
|
this.excess = 0;
|
|
368
424
|
}
|
|
369
425
|
}
|
|
426
|
+
// Increment offset & excess for next iteration
|
|
427
|
+
offset++;
|
|
428
|
+
this.excess++;
|
|
370
429
|
}
|
|
371
430
|
|
|
372
431
|
return -1;
|
|
@@ -407,7 +466,8 @@ export class EntityDecoder {
|
|
|
407
466
|
|
|
408
467
|
this.emitCodePoint(
|
|
409
468
|
valueLength === 1
|
|
410
|
-
? decodeTree[result] &
|
|
469
|
+
? decodeTree[result] &
|
|
470
|
+
~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
|
|
411
471
|
: decodeTree[result + 1],
|
|
412
472
|
consumed,
|
|
413
473
|
);
|
|
@@ -540,22 +600,28 @@ export function determineBranch(
|
|
|
540
600
|
: decodeTree[nodeIndex + value] - 1;
|
|
541
601
|
}
|
|
542
602
|
|
|
543
|
-
// Case 3: Multiple branches encoded in dictionary
|
|
603
|
+
// Case 3: Multiple branches encoded in packed dictionary (two keys per uint16)
|
|
604
|
+
const packedKeySlots = (branchCount + 1) >> 1;
|
|
544
605
|
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
606
|
+
/*
|
|
607
|
+
* Treat packed keys as a virtual sorted array of length `branchCount`.
|
|
608
|
+
* Key(i) = low byte for even i, high byte for odd i in slot i>>1.
|
|
609
|
+
*/
|
|
610
|
+
let lo = 0;
|
|
611
|
+
let hi = branchCount - 1;
|
|
548
612
|
|
|
549
613
|
while (lo <= hi) {
|
|
550
614
|
const mid = (lo + hi) >>> 1;
|
|
551
|
-
const
|
|
615
|
+
const slot = mid >> 1;
|
|
616
|
+
const packed = decodeTree[nodeIndex + slot];
|
|
617
|
+
const midKey = (packed >> ((mid & 1) * 8)) & 0xff;
|
|
552
618
|
|
|
553
|
-
if (
|
|
619
|
+
if (midKey < char) {
|
|
554
620
|
lo = mid + 1;
|
|
555
|
-
} else if (
|
|
621
|
+
} else if (midKey > char) {
|
|
556
622
|
hi = mid - 1;
|
|
557
623
|
} else {
|
|
558
|
-
return decodeTree[
|
|
624
|
+
return decodeTree[nodeIndex + packedKeySlots + mid];
|
|
559
625
|
}
|
|
560
626
|
}
|
|
561
627
|
|
|
@@ -609,12 +675,11 @@ export function decodeXML(xmlString: string): string {
|
|
|
609
675
|
return xmlDecoder(xmlString, DecodingMode.Strict);
|
|
610
676
|
}
|
|
611
677
|
|
|
612
|
-
// Re-export for use by eg. htmlparser2
|
|
613
|
-
export { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
614
|
-
export { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
615
|
-
|
|
616
678
|
export {
|
|
617
679
|
decodeCodePoint,
|
|
618
|
-
replaceCodePoint,
|
|
619
680
|
fromCodePoint,
|
|
681
|
+
replaceCodePoint,
|
|
620
682
|
} from "./decode-codepoint.js";
|
|
683
|
+
// Re-export for use by eg. htmlparser2
|
|
684
|
+
export { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
685
|
+
export { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
package/src/encode.ts
CHANGED
|
@@ -1,7 +1,17 @@
|
|
|
1
|
+
import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
|
|
1
2
|
import { htmlTrie } from "./generated/encode-html.js";
|
|
2
|
-
import { xmlReplacer, getCodePoint } from "./escape.js";
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
/**
|
|
5
|
+
* We store the characters to consider as a compact bitset for fast lookups.
|
|
6
|
+
*/
|
|
7
|
+
const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
|
|
8
|
+
0x16_00, // Bits for 09,0A,0C
|
|
9
|
+
0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
|
|
10
|
+
0xf8_00_00_01, // 64..95 -> 40, 5B-5F
|
|
11
|
+
0x38_00_00_01, // 96..127-> 60, 7B-7D
|
|
12
|
+
]);
|
|
13
|
+
|
|
14
|
+
const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
|
|
5
15
|
|
|
6
16
|
/**
|
|
7
17
|
* Encodes all characters in the input using HTML entities. This includes
|
|
@@ -15,7 +25,7 @@ const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
|
|
|
15
25
|
* (eg. `&#xfc;`) will be used.
|
|
16
26
|
*/
|
|
17
27
|
export function encodeHTML(input: string): string {
|
|
18
|
-
return encodeHTMLTrieRe(
|
|
28
|
+
return encodeHTMLTrieRe(HTML_BITSET, input);
|
|
19
29
|
}
|
|
20
30
|
/**
|
|
21
31
|
* Encodes all non-ASCII characters, as well as characters not valid in HTML
|
|
@@ -26,52 +36,58 @@ export function encodeHTML(input: string): string {
|
|
|
26
36
|
* (eg. `&#xfc;`) will be used.
|
|
27
37
|
*/
|
|
28
38
|
export function encodeNonAsciiHTML(input: string): string {
|
|
29
|
-
return encodeHTMLTrieRe(
|
|
39
|
+
return encodeHTMLTrieRe(XML_BITSET, input);
|
|
30
40
|
}
|
|
31
41
|
|
|
32
|
-
function encodeHTMLTrieRe(
|
|
33
|
-
let
|
|
34
|
-
let
|
|
35
|
-
|
|
42
|
+
function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
|
|
43
|
+
let out: string | undefined;
|
|
44
|
+
let last = 0; // Start of the next untouched slice.
|
|
45
|
+
const { length } = input;
|
|
36
46
|
|
|
37
|
-
|
|
38
|
-
const { index } = match;
|
|
39
|
-
returnValue += input.substring(lastIndex, index);
|
|
47
|
+
for (let index = 0; index < length; index++) {
|
|
40
48
|
const char = input.charCodeAt(index);
|
|
41
|
-
|
|
49
|
+
// Skip ASCII characters that don't need encoding
|
|
50
|
+
if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
|
|
51
|
+
continue;
|
|
52
|
+
}
|
|
42
53
|
|
|
43
|
-
if (
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
if (out === undefined) out = input.substring(0, index);
|
|
55
|
+
else if (last !== index) out += input.substring(last, index);
|
|
56
|
+
|
|
57
|
+
let node = htmlTrie.get(char);
|
|
58
|
+
|
|
59
|
+
if (typeof node === "object") {
|
|
60
|
+
if (index + 1 < length) {
|
|
46
61
|
const nextChar = input.charCodeAt(index + 1);
|
|
47
62
|
const value =
|
|
48
|
-
typeof next
|
|
49
|
-
? next
|
|
50
|
-
?
|
|
63
|
+
typeof node.next === "number"
|
|
64
|
+
? node.next === nextChar
|
|
65
|
+
? node.nextValue
|
|
51
66
|
: undefined
|
|
52
|
-
: next.
|
|
67
|
+
: node.next.get(nextChar);
|
|
53
68
|
|
|
54
69
|
if (value !== undefined) {
|
|
55
|
-
|
|
56
|
-
|
|
70
|
+
out += value;
|
|
71
|
+
index++;
|
|
72
|
+
last = index + 1;
|
|
57
73
|
continue;
|
|
58
74
|
}
|
|
59
75
|
}
|
|
60
|
-
|
|
61
|
-
next = next.v;
|
|
76
|
+
node = node.value;
|
|
62
77
|
}
|
|
63
78
|
|
|
64
|
-
|
|
65
|
-
if (next === undefined) {
|
|
79
|
+
if (node === undefined) {
|
|
66
80
|
const cp = getCodePoint(input, index);
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
81
|
+
out += `&#x${cp.toString(16)};`;
|
|
82
|
+
if (cp !== char) index++;
|
|
83
|
+
last = index + 1;
|
|
70
84
|
} else {
|
|
71
|
-
|
|
72
|
-
|
|
85
|
+
out += node;
|
|
86
|
+
last = index + 1;
|
|
73
87
|
}
|
|
74
88
|
}
|
|
75
89
|
|
|
76
|
-
|
|
90
|
+
if (out === undefined) return input;
|
|
91
|
+
if (last < length) out += input.substr(last);
|
|
92
|
+
return out;
|
|
77
93
|
}
|
package/src/escape.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
export const xmlReplacer: RegExp = /["$&'<>\u0080-\uFFFF]/g;
|
|
2
|
-
|
|
3
1
|
const xmlCodeMap = new Map([
|
|
4
2
|
[34, "&quot;"],
|
|
5
3
|
[38, "&amp;"],
|
|
@@ -22,39 +20,54 @@ export const getCodePoint: (c: string, index: number) => number =
|
|
|
22
20
|
: // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
|
|
23
21
|
(input: string, index: number): number => input.codePointAt(index)!;
|
|
24
22
|
|
|
23
|
+
/**
|
|
24
|
+
* Bitset for ASCII characters that need to be escaped in XML.
|
|
25
|
+
*/
|
|
26
|
+
export const XML_BITSET_VALUE = 0x50_00_00_c4; // 32..63 -> 34 ("),38 (&),39 ('),60 (<),62 (>)
|
|
27
|
+
|
|
25
28
|
/**
|
|
26
29
|
* Encodes all non-ASCII characters, as well as characters not valid in XML
|
|
27
|
-
* documents using XML entities.
|
|
30
|
+
* documents using XML entities. Uses a fast bitset scan instead of RegExp.
|
|
28
31
|
*
|
|
29
|
-
* If a character has no equivalent entity, a
|
|
30
|
-
*
|
|
32
|
+
* If a character has no equivalent entity, a numeric hexadecimal reference
|
|
33
|
+
* (eg. `&#xfc;`) will be used.
|
|
31
34
|
*/
|
|
32
35
|
export function encodeXML(input: string): string {
|
|
33
|
-
let
|
|
34
|
-
let
|
|
35
|
-
|
|
36
|
+
let out: string | undefined;
|
|
37
|
+
let last = 0;
|
|
38
|
+
const { length } = input;
|
|
36
39
|
|
|
37
|
-
|
|
38
|
-
const { index } = match;
|
|
40
|
+
for (let index = 0; index < length; index++) {
|
|
39
41
|
const char = input.charCodeAt(index);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
if (
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
// Increase by 1 if we have a surrogate pair
|
|
48
|
-
lastIndex = xmlReplacer.lastIndex += Number(
|
|
49
|
-
(char & 0xfc_00) === 0xd8_00,
|
|
50
|
-
);
|
|
51
|
-
} else {
|
|
52
|
-
returnValue += input.substring(lastIndex, index) + next;
|
|
53
|
-
lastIndex = index + 1;
|
|
42
|
+
|
|
43
|
+
// Check for ASCII chars that don't need escaping
|
|
44
|
+
if (
|
|
45
|
+
char < 0x80 &&
|
|
46
|
+
(((XML_BITSET_VALUE >>> char) & 1) === 0 || char >= 64 || char < 32)
|
|
47
|
+
) {
|
|
48
|
+
continue;
|
|
54
49
|
}
|
|
50
|
+
|
|
51
|
+
if (out === undefined) out = input.substring(0, index);
|
|
52
|
+
else if (last !== index) out += input.substring(last, index);
|
|
53
|
+
|
|
54
|
+
if (char < 64) {
|
|
55
|
+
// Known replacement
|
|
56
|
+
out += xmlCodeMap.get(char)!;
|
|
57
|
+
last = index + 1;
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// Non-ASCII: encode as numeric entity (handle surrogate pair)
|
|
62
|
+
const cp = getCodePoint(input, index);
|
|
63
|
+
out += `&#x${cp.toString(16)};`;
|
|
64
|
+
if (cp !== char) index++; // Skip trailing surrogate
|
|
65
|
+
last = index + 1;
|
|
55
66
|
}
|
|
56
67
|
|
|
57
|
-
|
|
68
|
+
if (out === undefined) return input;
|
|
69
|
+
if (last < length) out += input.substr(last);
|
|
70
|
+
return out;
|
|
58
71
|
}
|
|
59
72
|
|
|
60
73
|
/**
|
|
@@ -83,7 +96,7 @@ function getEscaper(
|
|
|
83
96
|
map: Map<number, string>,
|
|
84
97
|
): (data: string) => string {
|
|
85
98
|
return function escape(data: string): string {
|
|
86
|
-
let match;
|
|
99
|
+
let match: RegExpExecArray | null;
|
|
87
100
|
let lastIndex = 0;
|
|
88
101
|
let result = "";
|
|
89
102
|
|