entities 6.0.1 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/decode-codepoint.d.ts +8 -0
- package/dist/decode-codepoint.d.ts.map +1 -0
- package/dist/decode-codepoint.js +46 -0
- package/dist/decode-codepoint.js.map +1 -0
- package/dist/{esm/decode.d.ts → decode.d.ts} +11 -26
- package/dist/decode.d.ts.map +1 -0
- package/dist/{esm/decode.js → decode.js} +130 -83
- package/dist/decode.js.map +1 -0
- package/dist/{commonjs/encode.d.ts → encode.d.ts} +2 -0
- package/dist/encode.d.ts.map +1 -0
- package/dist/encode.js +90 -0
- package/dist/encode.js.map +1 -0
- package/dist/{esm/escape.d.ts → escape.d.ts} +13 -8
- package/dist/escape.d.ts.map +1 -0
- package/dist/{esm/escape.js → escape.js} +49 -34
- package/dist/escape.js.map +1 -0
- package/dist/generated/decode-data-html.d.ts +3 -0
- package/dist/generated/decode-data-html.d.ts.map +1 -0
- package/dist/generated/decode-data-html.js +5 -0
- package/dist/generated/decode-data-html.js.map +1 -0
- package/dist/generated/decode-data-xml.d.ts +3 -0
- package/dist/generated/decode-data-xml.d.ts.map +1 -0
- package/dist/generated/decode-data-xml.js +5 -0
- package/dist/generated/decode-data-xml.js.map +1 -0
- package/dist/generated/encode-html.d.ts +5 -0
- package/dist/generated/encode-html.d.ts.map +1 -0
- package/dist/generated/encode-html.js +12 -0
- package/dist/generated/encode-html.js.map +1 -0
- package/dist/{commonjs/index.d.ts → index.d.ts} +10 -17
- package/dist/index.d.ts.map +1 -0
- package/dist/{esm/index.js → index.js} +9 -25
- package/dist/index.js.map +1 -0
- package/dist/internal/bin-trie-flags.d.ts +17 -0
- package/dist/internal/bin-trie-flags.d.ts.map +1 -0
- package/dist/internal/bin-trie-flags.js +18 -0
- package/dist/internal/bin-trie-flags.js.map +1 -0
- package/dist/internal/decode-shared.d.ts +7 -0
- package/dist/internal/decode-shared.d.ts.map +1 -0
- package/dist/internal/decode-shared.js +17 -0
- package/dist/internal/decode-shared.js.map +1 -0
- package/dist/internal/encode-shared.d.ts +33 -0
- package/dist/internal/encode-shared.d.ts.map +1 -0
- package/dist/internal/encode-shared.js +93 -0
- package/dist/internal/encode-shared.js.map +1 -0
- package/package.json +38 -73
- package/readme.md +36 -27
- package/src/decode-codepoint.ts +1 -32
- package/src/decode.ts +127 -76
- package/src/encode.ts +49 -31
- package/src/escape.ts +50 -38
- package/src/generated/decode-data-html.ts +4 -5
- package/src/generated/decode-data-xml.ts +4 -5
- package/src/generated/encode-html.ts +15 -14
- package/src/index.ts +23 -49
- package/src/internal/bin-trie-flags.ts +16 -0
- package/src/internal/decode-shared.ts +18 -0
- package/src/internal/encode-shared.ts +123 -0
- package/decode.d.ts +0 -1
- package/decode.js +0 -3
- package/dist/commonjs/decode-codepoint.d.ts +0 -19
- package/dist/commonjs/decode-codepoint.d.ts.map +0 -1
- package/dist/commonjs/decode-codepoint.js +0 -77
- package/dist/commonjs/decode-codepoint.js.map +0 -1
- package/dist/commonjs/decode.d.ts +0 -209
- package/dist/commonjs/decode.d.ts.map +0 -1
- package/dist/commonjs/decode.js +0 -511
- package/dist/commonjs/decode.js.map +0 -1
- package/dist/commonjs/encode.d.ts.map +0 -1
- package/dist/commonjs/encode.js +0 -73
- package/dist/commonjs/encode.js.map +0 -1
- package/dist/commonjs/escape.d.ts +0 -43
- package/dist/commonjs/escape.d.ts.map +0 -1
- package/dist/commonjs/escape.js +0 -121
- package/dist/commonjs/escape.js.map +0 -1
- package/dist/commonjs/generated/decode-data-html.d.ts +0 -2
- package/dist/commonjs/generated/decode-data-html.d.ts.map +0 -1
- package/dist/commonjs/generated/decode-data-html.js +0 -10
- package/dist/commonjs/generated/decode-data-html.js.map +0 -1
- package/dist/commonjs/generated/decode-data-xml.d.ts +0 -2
- package/dist/commonjs/generated/decode-data-xml.d.ts.map +0 -1
- package/dist/commonjs/generated/decode-data-xml.js +0 -10
- package/dist/commonjs/generated/decode-data-xml.js.map +0 -1
- package/dist/commonjs/generated/encode-html.d.ts +0 -8
- package/dist/commonjs/generated/encode-html.d.ts.map +0 -1
- package/dist/commonjs/generated/encode-html.js +0 -13
- package/dist/commonjs/generated/encode-html.js.map +0 -1
- package/dist/commonjs/index.d.ts.map +0 -1
- package/dist/commonjs/index.js +0 -131
- package/dist/commonjs/index.js.map +0 -1
- package/dist/commonjs/package.json +0 -3
- package/dist/esm/decode-codepoint.d.ts +0 -19
- package/dist/esm/decode-codepoint.d.ts.map +0 -1
- package/dist/esm/decode-codepoint.js +0 -72
- package/dist/esm/decode-codepoint.js.map +0 -1
- package/dist/esm/decode.d.ts.map +0 -1
- package/dist/esm/decode.js.map +0 -1
- package/dist/esm/encode.d.ts +0 -22
- package/dist/esm/encode.d.ts.map +0 -1
- package/dist/esm/encode.js +0 -69
- package/dist/esm/encode.js.map +0 -1
- package/dist/esm/escape.d.ts.map +0 -1
- package/dist/esm/escape.js.map +0 -1
- package/dist/esm/generated/decode-data-html.d.ts +0 -2
- package/dist/esm/generated/decode-data-html.d.ts.map +0 -1
- package/dist/esm/generated/decode-data-html.js +0 -7
- package/dist/esm/generated/decode-data-html.js.map +0 -1
- package/dist/esm/generated/decode-data-xml.d.ts +0 -2
- package/dist/esm/generated/decode-data-xml.d.ts.map +0 -1
- package/dist/esm/generated/decode-data-xml.js +0 -7
- package/dist/esm/generated/decode-data-xml.js.map +0 -1
- package/dist/esm/generated/encode-html.d.ts +0 -8
- package/dist/esm/generated/encode-html.d.ts.map +0 -1
- package/dist/esm/generated/encode-html.js +0 -10
- package/dist/esm/generated/encode-html.js.map +0 -1
- package/dist/esm/index.d.ts +0 -96
- package/dist/esm/index.d.ts.map +0 -1
- package/dist/esm/index.js.map +0 -1
- package/dist/esm/package.json +0 -3
- package/escape.d.ts +0 -1
- package/escape.js +0 -3
- package/src/decode.spec.ts +0 -320
- package/src/encode.spec.ts +0 -78
- package/src/escape.spec.ts +0 -14
- package/src/generated/.eslintrc.json +0 -10
- package/src/index.spec.ts +0 -125
package/src/decode.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { replaceCodePoint } from "./decode-codepoint.js";
|
|
1
2
|
import { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
2
3
|
import { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
3
|
-
import {
|
|
4
|
+
import { BinTrieFlags } from "./internal/bin-trie-flags.js";
|
|
4
5
|
|
|
5
6
|
const enum CharCodes {
|
|
6
7
|
NUM = 35, // "#"
|
|
@@ -20,12 +21,6 @@ const enum CharCodes {
|
|
|
20
21
|
/** Bit that needs to be set to convert an upper case ASCII character to lower case */
|
|
21
22
|
const TO_LOWER_BIT = 0b10_0000;
|
|
22
23
|
|
|
23
|
-
export enum BinTrieFlags {
|
|
24
|
-
VALUE_LENGTH = 0b1100_0000_0000_0000,
|
|
25
|
-
BRANCH_LENGTH = 0b0011_1111_1000_0000,
|
|
26
|
-
JUMP_TABLE = 0b0000_0000_0111_1111,
|
|
27
|
-
}
|
|
28
|
-
|
|
29
24
|
function isNumber(code: number): boolean {
|
|
30
25
|
return code >= CharCodes.ZERO && code <= CharCodes.NINE;
|
|
31
26
|
}
|
|
@@ -50,6 +45,7 @@ function isAsciiAlphaNumeric(code: number): boolean {
|
|
|
50
45
|
*
|
|
51
46
|
* Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
|
|
52
47
|
* See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
|
48
|
+
* @param code Code point to decode.
|
|
53
49
|
*/
|
|
54
50
|
function isEntityInAttributeInvalidEnd(code: number): boolean {
|
|
55
51
|
return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
|
|
@@ -63,6 +59,9 @@ const enum EntityDecoderState {
|
|
|
63
59
|
NamedEntity,
|
|
64
60
|
}
|
|
65
61
|
|
|
62
|
+
/**
|
|
63
|
+
* Decoding mode for named entities.
|
|
64
|
+
*/
|
|
66
65
|
export enum DecodingMode {
|
|
67
66
|
/** Entities in text nodes that can end with any character. */
|
|
68
67
|
Legacy = 0,
|
|
@@ -89,13 +88,13 @@ export interface EntityErrorProducer {
|
|
|
89
88
|
export class EntityDecoder {
|
|
90
89
|
constructor(
|
|
91
90
|
/** The tree used to decode entities. */
|
|
91
|
+
// biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
|
|
92
92
|
private readonly decodeTree: Uint16Array,
|
|
93
93
|
/**
|
|
94
94
|
* The function that is called when a codepoint is decoded.
|
|
95
95
|
*
|
|
96
96
|
* For multi-byte named entities, this will be called multiple times,
|
|
97
97
|
* with the second codepoint, and the same `consumed` value.
|
|
98
|
-
*
|
|
99
98
|
* @param codepoint The decoded codepoint.
|
|
100
99
|
* @param consumed The number of bytes consumed by the decoder.
|
|
101
100
|
*/
|
|
@@ -122,8 +121,13 @@ export class EntityDecoder {
|
|
|
122
121
|
private excess = 1;
|
|
123
122
|
/** The mode in which the decoder is operating. */
|
|
124
123
|
private decodeMode = DecodingMode.Strict;
|
|
124
|
+
/** The number of characters that have been consumed in the current run. */
|
|
125
|
+
private runConsumed = 0;
|
|
125
126
|
|
|
126
|
-
/**
|
|
127
|
+
/**
|
|
128
|
+
* Resets the instance to make it reusable.
|
|
129
|
+
* @param decodeMode Entity decoding mode to use.
|
|
130
|
+
*/
|
|
127
131
|
startEntity(decodeMode: DecodingMode): void {
|
|
128
132
|
this.decodeMode = decodeMode;
|
|
129
133
|
this.state = EntityDecoderState.EntityStart;
|
|
@@ -131,6 +135,7 @@ export class EntityDecoder {
|
|
|
131
135
|
this.treeIndex = 0;
|
|
132
136
|
this.excess = 1;
|
|
133
137
|
this.consumed = 1;
|
|
138
|
+
this.runConsumed = 0;
|
|
134
139
|
}
|
|
135
140
|
|
|
136
141
|
/**
|
|
@@ -139,7 +144,6 @@ export class EntityDecoder {
|
|
|
139
144
|
*
|
|
140
145
|
* Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
|
|
141
146
|
* entity is incomplete, and resume when the next string is written.
|
|
142
|
-
*
|
|
143
147
|
* @param input The string containing the entity (or a continuation of the entity).
|
|
144
148
|
* @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
|
|
145
149
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
@@ -178,7 +182,6 @@ export class EntityDecoder {
|
|
|
178
182
|
* Switches between the numeric decimal and hexadecimal states.
|
|
179
183
|
*
|
|
180
184
|
* Equivalent to the `Numeric character reference state` in the HTML spec.
|
|
181
|
-
*
|
|
182
185
|
* @param input The string containing the entity (or a continuation of the entity).
|
|
183
186
|
* @param offset The current offset.
|
|
184
187
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
@@ -198,73 +201,53 @@ export class EntityDecoder {
|
|
|
198
201
|
return this.stateNumericDecimal(input, offset);
|
|
199
202
|
}
|
|
200
203
|
|
|
201
|
-
private addToNumericResult(
|
|
202
|
-
input: string,
|
|
203
|
-
start: number,
|
|
204
|
-
end: number,
|
|
205
|
-
base: number,
|
|
206
|
-
): void {
|
|
207
|
-
if (start !== end) {
|
|
208
|
-
const digitCount = end - start;
|
|
209
|
-
this.result =
|
|
210
|
-
this.result * Math.pow(base, digitCount) +
|
|
211
|
-
Number.parseInt(input.substr(start, digitCount), base);
|
|
212
|
-
this.consumed += digitCount;
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
|
|
216
204
|
/**
|
|
217
205
|
* Parses a hexadecimal numeric entity.
|
|
218
206
|
*
|
|
219
207
|
* Equivalent to the `Hexadecimal character reference state` in the HTML spec.
|
|
220
|
-
*
|
|
221
208
|
* @param input The string containing the entity (or a continuation of the entity).
|
|
222
209
|
* @param offset The current offset.
|
|
223
210
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
224
211
|
*/
|
|
225
212
|
private stateNumericHex(input: string, offset: number): number {
|
|
226
|
-
const startIndex = offset;
|
|
227
|
-
|
|
228
213
|
while (offset < input.length) {
|
|
229
214
|
const char = input.charCodeAt(offset);
|
|
230
215
|
if (isNumber(char) || isHexadecimalCharacter(char)) {
|
|
231
|
-
|
|
216
|
+
// Convert hex digit to value (0-15); 'a'/'A' -> 10.
|
|
217
|
+
const digit =
|
|
218
|
+
char <= CharCodes.NINE
|
|
219
|
+
? char - CharCodes.ZERO
|
|
220
|
+
: (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
|
|
221
|
+
this.result = this.result * 16 + digit;
|
|
222
|
+
this.consumed++;
|
|
223
|
+
offset++;
|
|
232
224
|
} else {
|
|
233
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
234
225
|
return this.emitNumericEntity(char, 3);
|
|
235
226
|
}
|
|
236
227
|
}
|
|
237
|
-
|
|
238
|
-
this.addToNumericResult(input, startIndex, offset, 16);
|
|
239
|
-
|
|
240
|
-
return -1;
|
|
228
|
+
return -1; // Incomplete entity
|
|
241
229
|
}
|
|
242
230
|
|
|
243
231
|
/**
|
|
244
232
|
* Parses a decimal numeric entity.
|
|
245
233
|
*
|
|
246
234
|
* Equivalent to the `Decimal character reference state` in the HTML spec.
|
|
247
|
-
*
|
|
248
235
|
* @param input The string containing the entity (or a continuation of the entity).
|
|
249
236
|
* @param offset The current offset.
|
|
250
237
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
251
238
|
*/
|
|
252
239
|
private stateNumericDecimal(input: string, offset: number): number {
|
|
253
|
-
const startIndex = offset;
|
|
254
|
-
|
|
255
240
|
while (offset < input.length) {
|
|
256
241
|
const char = input.charCodeAt(offset);
|
|
257
242
|
if (isNumber(char)) {
|
|
258
|
-
|
|
243
|
+
this.result = this.result * 10 + (char - CharCodes.ZERO);
|
|
244
|
+
this.consumed++;
|
|
245
|
+
offset++;
|
|
259
246
|
} else {
|
|
260
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
261
247
|
return this.emitNumericEntity(char, 2);
|
|
262
248
|
}
|
|
263
249
|
}
|
|
264
|
-
|
|
265
|
-
this.addToNumericResult(input, startIndex, offset, 10);
|
|
266
|
-
|
|
267
|
-
return -1;
|
|
250
|
+
return -1; // Incomplete entity
|
|
268
251
|
}
|
|
269
252
|
|
|
270
253
|
/**
|
|
@@ -272,7 +255,6 @@ export class EntityDecoder {
|
|
|
272
255
|
*
|
|
273
256
|
* Implements the logic from the `Hexadecimal character reference start
|
|
274
257
|
* state` and `Numeric character reference end state` in the HTML spec.
|
|
275
|
-
*
|
|
276
258
|
* @param lastCp The last code point of the entity. Used to see if the
|
|
277
259
|
* entity was terminated with a semicolon.
|
|
278
260
|
* @param expectedLength The minimum number of characters that should be
|
|
@@ -313,7 +295,6 @@ export class EntityDecoder {
|
|
|
313
295
|
* Parses a named entity.
|
|
314
296
|
*
|
|
315
297
|
* Equivalent to the `Named character reference state` in the HTML spec.
|
|
316
|
-
*
|
|
317
298
|
* @param input The string containing the entity (or a continuation of the entity).
|
|
318
299
|
* @param offset The current offset.
|
|
319
300
|
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
|
|
@@ -321,12 +302,84 @@ export class EntityDecoder {
|
|
|
321
302
|
private stateNamedEntity(input: string, offset: number): number {
|
|
322
303
|
const { decodeTree } = this;
|
|
323
304
|
let current = decodeTree[this.treeIndex];
|
|
324
|
-
// The
|
|
305
|
+
// The length is the number of bytes of the value, including the current byte.
|
|
325
306
|
let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
326
307
|
|
|
327
|
-
|
|
308
|
+
while (offset < input.length) {
|
|
309
|
+
// Handle compact runs (possibly inline): valueLength == 0 and SEMI_REQUIRED bit set.
|
|
310
|
+
if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
|
|
311
|
+
const runLength =
|
|
312
|
+
(current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
|
|
313
|
+
|
|
314
|
+
// If we are starting a run, check the first char.
|
|
315
|
+
if (this.runConsumed === 0) {
|
|
316
|
+
const firstChar = current & BinTrieFlags.JUMP_TABLE;
|
|
317
|
+
if (input.charCodeAt(offset) !== firstChar) {
|
|
318
|
+
return this.result === 0
|
|
319
|
+
? 0
|
|
320
|
+
: this.emitNotTerminatedNamedEntity();
|
|
321
|
+
}
|
|
322
|
+
offset++;
|
|
323
|
+
this.excess++;
|
|
324
|
+
this.runConsumed++;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
// Check remaining characters in the run.
|
|
328
|
+
while (this.runConsumed < runLength) {
|
|
329
|
+
if (offset >= input.length) {
|
|
330
|
+
return -1;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
const charIndexInPacked = this.runConsumed - 1;
|
|
334
|
+
const packedWord =
|
|
335
|
+
decodeTree[
|
|
336
|
+
this.treeIndex + 1 + (charIndexInPacked >> 1)
|
|
337
|
+
];
|
|
338
|
+
const expectedChar =
|
|
339
|
+
charIndexInPacked % 2 === 0
|
|
340
|
+
? packedWord & 0xff
|
|
341
|
+
: (packedWord >> 8) & 0xff;
|
|
342
|
+
|
|
343
|
+
if (input.charCodeAt(offset) !== expectedChar) {
|
|
344
|
+
this.runConsumed = 0;
|
|
345
|
+
return this.result === 0
|
|
346
|
+
? 0
|
|
347
|
+
: this.emitNotTerminatedNamedEntity();
|
|
348
|
+
}
|
|
349
|
+
offset++;
|
|
350
|
+
this.excess++;
|
|
351
|
+
this.runConsumed++;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
this.runConsumed = 0;
|
|
355
|
+
this.treeIndex += 1 + (runLength >> 1);
|
|
356
|
+
current = decodeTree[this.treeIndex];
|
|
357
|
+
valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if (offset >= input.length) break;
|
|
361
|
+
|
|
328
362
|
const char = input.charCodeAt(offset);
|
|
329
363
|
|
|
364
|
+
/*
|
|
365
|
+
* Implicit semicolon handling for nodes that require a semicolon but
|
|
366
|
+
* don't have an explicit ';' branch stored in the trie. If we have
|
|
367
|
+
* a value on the current node, it requires a semicolon, and the
|
|
368
|
+
* current input character is a semicolon, emit the entity using the
|
|
369
|
+
* current node (without descending further).
|
|
370
|
+
*/
|
|
371
|
+
if (
|
|
372
|
+
char === CharCodes.SEMI &&
|
|
373
|
+
valueLength !== 0 &&
|
|
374
|
+
(current & BinTrieFlags.FLAG13) !== 0
|
|
375
|
+
) {
|
|
376
|
+
return this.emitNamedEntityData(
|
|
377
|
+
this.treeIndex,
|
|
378
|
+
valueLength,
|
|
379
|
+
this.consumed + this.excess,
|
|
380
|
+
);
|
|
381
|
+
}
|
|
382
|
+
|
|
330
383
|
this.treeIndex = determineBranch(
|
|
331
384
|
decodeTree,
|
|
332
385
|
current,
|
|
@@ -361,12 +414,18 @@ export class EntityDecoder {
|
|
|
361
414
|
}
|
|
362
415
|
|
|
363
416
|
// If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
|
|
364
|
-
if (
|
|
417
|
+
if (
|
|
418
|
+
this.decodeMode !== DecodingMode.Strict &&
|
|
419
|
+
(current & BinTrieFlags.FLAG13) === 0
|
|
420
|
+
) {
|
|
365
421
|
this.result = this.treeIndex;
|
|
366
422
|
this.consumed += this.excess;
|
|
367
423
|
this.excess = 0;
|
|
368
424
|
}
|
|
369
425
|
}
|
|
426
|
+
// Increment offset & excess for next iteration
|
|
427
|
+
offset++;
|
|
428
|
+
this.excess++;
|
|
370
429
|
}
|
|
371
430
|
|
|
372
431
|
return -1;
|
|
@@ -374,7 +433,6 @@ export class EntityDecoder {
|
|
|
374
433
|
|
|
375
434
|
/**
|
|
376
435
|
* Emit a named entity that was not terminated with a semicolon.
|
|
377
|
-
*
|
|
378
436
|
* @returns The number of characters consumed.
|
|
379
437
|
*/
|
|
380
438
|
private emitNotTerminatedNamedEntity(): number {
|
|
@@ -391,11 +449,9 @@ export class EntityDecoder {
|
|
|
391
449
|
|
|
392
450
|
/**
|
|
393
451
|
* Emit a named entity.
|
|
394
|
-
*
|
|
395
452
|
* @param result The index of the entity in the decode tree.
|
|
396
453
|
* @param valueLength The number of bytes in the entity.
|
|
397
454
|
* @param consumed The number of characters consumed.
|
|
398
|
-
*
|
|
399
455
|
* @returns The number of characters consumed.
|
|
400
456
|
*/
|
|
401
457
|
private emitNamedEntityData(
|
|
@@ -407,7 +463,8 @@ export class EntityDecoder {
|
|
|
407
463
|
|
|
408
464
|
this.emitCodePoint(
|
|
409
465
|
valueLength === 1
|
|
410
|
-
? decodeTree[result] &
|
|
466
|
+
? decodeTree[result] &
|
|
467
|
+
~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
|
|
411
468
|
: decodeTree[result + 1],
|
|
412
469
|
consumed,
|
|
413
470
|
);
|
|
@@ -423,7 +480,6 @@ export class EntityDecoder {
|
|
|
423
480
|
* Signal to the parser that the end of the input was reached.
|
|
424
481
|
*
|
|
425
482
|
* Remaining data will be emitted and relevant errors will be produced.
|
|
426
|
-
*
|
|
427
483
|
* @returns The number of characters consumed.
|
|
428
484
|
*/
|
|
429
485
|
end(): number {
|
|
@@ -459,7 +515,6 @@ export class EntityDecoder {
|
|
|
459
515
|
|
|
460
516
|
/**
|
|
461
517
|
* Creates a function that decodes entities in a string.
|
|
462
|
-
*
|
|
463
518
|
* @param decodeTree The decode tree.
|
|
464
519
|
* @returns A function that decodes entities in a string.
|
|
465
520
|
*/
|
|
@@ -467,7 +522,7 @@ function getDecoder(decodeTree: Uint16Array) {
|
|
|
467
522
|
let returnValue = "";
|
|
468
523
|
const decoder = new EntityDecoder(
|
|
469
524
|
decodeTree,
|
|
470
|
-
(data) => (returnValue += fromCodePoint(data)),
|
|
525
|
+
(data) => (returnValue += String.fromCodePoint(data)),
|
|
471
526
|
);
|
|
472
527
|
|
|
473
528
|
return function decodeWithTrie(
|
|
@@ -510,10 +565,9 @@ function getDecoder(decodeTree: Uint16Array) {
|
|
|
510
565
|
/**
|
|
511
566
|
* Determines the branch of the current node that is taken given the current
|
|
512
567
|
* character. This function is used to traverse the trie.
|
|
513
|
-
*
|
|
514
568
|
* @param decodeTree The trie.
|
|
515
569
|
* @param current The current node.
|
|
516
|
-
* @param
|
|
570
|
+
* @param nodeIndex Index immediately after the current node header.
|
|
517
571
|
* @param char The current character.
|
|
518
572
|
* @returns The index of the next node, or -1 if no branch is taken.
|
|
519
573
|
*/
|
|
@@ -540,22 +594,28 @@ export function determineBranch(
|
|
|
540
594
|
: decodeTree[nodeIndex + value] - 1;
|
|
541
595
|
}
|
|
542
596
|
|
|
543
|
-
// Case 3: Multiple branches encoded in dictionary
|
|
597
|
+
// Case 3: Multiple branches encoded in packed dictionary (two keys per uint16)
|
|
598
|
+
const packedKeySlots = (branchCount + 1) >> 1;
|
|
544
599
|
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
600
|
+
/*
|
|
601
|
+
* Treat packed keys as a virtual sorted array of length `branchCount`.
|
|
602
|
+
* Key(i) = low byte for even i, high byte for odd i in slot i>>1.
|
|
603
|
+
*/
|
|
604
|
+
let lo = 0;
|
|
605
|
+
let hi = branchCount - 1;
|
|
548
606
|
|
|
549
607
|
while (lo <= hi) {
|
|
550
608
|
const mid = (lo + hi) >>> 1;
|
|
551
|
-
const
|
|
609
|
+
const slot = mid >> 1;
|
|
610
|
+
const packed = decodeTree[nodeIndex + slot];
|
|
611
|
+
const midKey = (packed >> ((mid & 1) * 8)) & 0xff;
|
|
552
612
|
|
|
553
|
-
if (
|
|
613
|
+
if (midKey < char) {
|
|
554
614
|
lo = mid + 1;
|
|
555
|
-
} else if (
|
|
615
|
+
} else if (midKey > char) {
|
|
556
616
|
hi = mid - 1;
|
|
557
617
|
} else {
|
|
558
|
-
return decodeTree[
|
|
618
|
+
return decodeTree[nodeIndex + packedKeySlots + mid];
|
|
559
619
|
}
|
|
560
620
|
}
|
|
561
621
|
|
|
@@ -567,7 +627,6 @@ const xmlDecoder = /* #__PURE__ */ getDecoder(xmlDecodeTree);
|
|
|
567
627
|
|
|
568
628
|
/**
|
|
569
629
|
* Decodes an HTML string.
|
|
570
|
-
*
|
|
571
630
|
* @param htmlString The string to decode.
|
|
572
631
|
* @param mode The decoding mode.
|
|
573
632
|
* @returns The decoded string.
|
|
@@ -581,7 +640,6 @@ export function decodeHTML(
|
|
|
581
640
|
|
|
582
641
|
/**
|
|
583
642
|
* Decodes an HTML string in an attribute.
|
|
584
|
-
*
|
|
585
643
|
* @param htmlAttribute The string to decode.
|
|
586
644
|
* @returns The decoded string.
|
|
587
645
|
*/
|
|
@@ -591,7 +649,6 @@ export function decodeHTMLAttribute(htmlAttribute: string): string {
|
|
|
591
649
|
|
|
592
650
|
/**
|
|
593
651
|
* Decodes an HTML string, requiring all entities to be terminated by a semicolon.
|
|
594
|
-
*
|
|
595
652
|
* @param htmlString The string to decode.
|
|
596
653
|
* @returns The decoded string.
|
|
597
654
|
*/
|
|
@@ -601,7 +658,6 @@ export function decodeHTMLStrict(htmlString: string): string {
|
|
|
601
658
|
|
|
602
659
|
/**
|
|
603
660
|
* Decodes an XML string, requiring all entities to be terminated by a semicolon.
|
|
604
|
-
*
|
|
605
661
|
* @param xmlString The string to decode.
|
|
606
662
|
* @returns The decoded string.
|
|
607
663
|
*/
|
|
@@ -609,12 +665,7 @@ export function decodeXML(xmlString: string): string {
|
|
|
609
665
|
return xmlDecoder(xmlString, DecodingMode.Strict);
|
|
610
666
|
}
|
|
611
667
|
|
|
668
|
+
export { replaceCodePoint } from "./decode-codepoint.js";
|
|
612
669
|
// Re-export for use by eg. htmlparser2
|
|
613
670
|
export { htmlDecodeTree } from "./generated/decode-data-html.js";
|
|
614
671
|
export { xmlDecodeTree } from "./generated/decode-data-xml.js";
|
|
615
|
-
|
|
616
|
-
export {
|
|
617
|
-
decodeCodePoint,
|
|
618
|
-
replaceCodePoint,
|
|
619
|
-
fromCodePoint,
|
|
620
|
-
} from "./decode-codepoint.js";
|
package/src/encode.ts
CHANGED
|
@@ -1,7 +1,17 @@
|
|
|
1
|
+
import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
|
|
1
2
|
import { htmlTrie } from "./generated/encode-html.js";
|
|
2
|
-
import { xmlReplacer, getCodePoint } from "./escape.js";
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
/**
|
|
5
|
+
* We store the characters to consider as a compact bitset for fast lookups.
|
|
6
|
+
*/
|
|
7
|
+
const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
|
|
8
|
+
0x16_00, // Bits for 09,0A,0C
|
|
9
|
+
0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
|
|
10
|
+
0xf8_00_00_01, // 64..95 -> 40, 5B-5F
|
|
11
|
+
0x38_00_00_01, // 96..127-> 60, 7B-7D
|
|
12
|
+
]);
|
|
13
|
+
|
|
14
|
+
const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
|
|
5
15
|
|
|
6
16
|
/**
|
|
7
17
|
* Encodes all characters in the input using HTML entities. This includes
|
|
@@ -13,9 +23,10 @@ const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
|
|
|
13
23
|
*
|
|
14
24
|
* If a character has no equivalent entity, a numeric hexadecimal reference
|
|
15
25
|
* (eg. `ü`) will be used.
|
|
26
|
+
* @param input Input string to encode.
|
|
16
27
|
*/
|
|
17
28
|
export function encodeHTML(input: string): string {
|
|
18
|
-
return encodeHTMLTrieRe(
|
|
29
|
+
return encodeHTMLTrieRe(HTML_BITSET, input);
|
|
19
30
|
}
|
|
20
31
|
/**
|
|
21
32
|
* Encodes all non-ASCII characters, as well as characters not valid in HTML
|
|
@@ -24,54 +35,61 @@ export function encodeHTML(input: string): string {
|
|
|
24
35
|
*
|
|
25
36
|
* If a character has no equivalent entity, a numeric hexadecimal reference
|
|
26
37
|
* (eg. `ü`) will be used.
|
|
38
|
+
* @param input Input string to encode.
|
|
27
39
|
*/
|
|
28
40
|
export function encodeNonAsciiHTML(input: string): string {
|
|
29
|
-
return encodeHTMLTrieRe(
|
|
41
|
+
return encodeHTMLTrieRe(XML_BITSET, input);
|
|
30
42
|
}
|
|
31
43
|
|
|
32
|
-
function encodeHTMLTrieRe(
|
|
33
|
-
let
|
|
34
|
-
let
|
|
35
|
-
|
|
44
|
+
function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
|
|
45
|
+
let out: string | undefined;
|
|
46
|
+
let last = 0; // Start of the next untouched slice.
|
|
47
|
+
const { length } = input;
|
|
36
48
|
|
|
37
|
-
|
|
38
|
-
const { index } = match;
|
|
39
|
-
returnValue += input.substring(lastIndex, index);
|
|
49
|
+
for (let index = 0; index < length; index++) {
|
|
40
50
|
const char = input.charCodeAt(index);
|
|
41
|
-
|
|
51
|
+
// Skip ASCII characters that don't need encoding
|
|
52
|
+
if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
|
|
53
|
+
continue;
|
|
54
|
+
}
|
|
42
55
|
|
|
43
|
-
if (
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
if (out === undefined) out = input.substring(0, index);
|
|
57
|
+
else if (last !== index) out += input.substring(last, index);
|
|
58
|
+
|
|
59
|
+
let node = htmlTrie.get(char);
|
|
60
|
+
|
|
61
|
+
if (typeof node === "object") {
|
|
62
|
+
if (index + 1 < length) {
|
|
46
63
|
const nextChar = input.charCodeAt(index + 1);
|
|
47
64
|
const value =
|
|
48
|
-
typeof next
|
|
49
|
-
? next
|
|
50
|
-
?
|
|
65
|
+
typeof node.next === "number"
|
|
66
|
+
? node.next === nextChar
|
|
67
|
+
? node.nextValue
|
|
51
68
|
: undefined
|
|
52
|
-
: next.
|
|
69
|
+
: node.next.get(nextChar);
|
|
53
70
|
|
|
54
71
|
if (value !== undefined) {
|
|
55
|
-
|
|
56
|
-
|
|
72
|
+
out += value;
|
|
73
|
+
index++;
|
|
74
|
+
last = index + 1;
|
|
57
75
|
continue;
|
|
58
76
|
}
|
|
59
77
|
}
|
|
60
|
-
|
|
61
|
-
next = next.v;
|
|
78
|
+
node = node.value;
|
|
62
79
|
}
|
|
63
80
|
|
|
64
|
-
|
|
65
|
-
if (next === undefined) {
|
|
81
|
+
if (node === undefined) {
|
|
66
82
|
const cp = getCodePoint(input, index);
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
83
|
+
out += `&#x${cp.toString(16)};`;
|
|
84
|
+
if (cp !== char) index++;
|
|
85
|
+
last = index + 1;
|
|
70
86
|
} else {
|
|
71
|
-
|
|
72
|
-
|
|
87
|
+
out += node;
|
|
88
|
+
last = index + 1;
|
|
73
89
|
}
|
|
74
90
|
}
|
|
75
91
|
|
|
76
|
-
|
|
92
|
+
if (out === undefined) return input;
|
|
93
|
+
if (last < length) out += input.substr(last);
|
|
94
|
+
return out;
|
|
77
95
|
}
|