entities 4.4.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/dist/commonjs/decode-codepoint.d.ts +19 -0
  2. package/dist/commonjs/decode-codepoint.d.ts.map +1 -0
  3. package/{lib/decode_codepoint.js → dist/commonjs/decode-codepoint.js} +30 -13
  4. package/dist/commonjs/decode-codepoint.js.map +1 -0
  5. package/dist/commonjs/decode.d.ts +209 -0
  6. package/dist/commonjs/decode.d.ts.map +1 -0
  7. package/dist/commonjs/decode.js +514 -0
  8. package/dist/commonjs/decode.js.map +1 -0
  9. package/{lib/esm → dist/commonjs}/encode.d.ts +2 -2
  10. package/dist/commonjs/encode.d.ts.map +1 -0
  11. package/{lib → dist/commonjs}/encode.js +32 -33
  12. package/dist/commonjs/encode.js.map +1 -0
  13. package/{lib/esm → dist/commonjs}/escape.d.ts +2 -2
  14. package/dist/commonjs/escape.d.ts.map +1 -0
  15. package/{lib → dist/commonjs}/escape.js +46 -38
  16. package/dist/commonjs/escape.js.map +1 -0
  17. package/dist/commonjs/generated/decode-data-html.d.ts.map +1 -0
  18. package/{lib → dist/commonjs}/generated/decode-data-html.js +1 -1
  19. package/dist/commonjs/generated/decode-data-html.js.map +1 -0
  20. package/dist/commonjs/generated/decode-data-xml.d.ts.map +1 -0
  21. package/{lib → dist/commonjs}/generated/decode-data-xml.js +1 -1
  22. package/dist/commonjs/generated/decode-data-xml.js.map +1 -0
  23. package/{lib/esm → dist/commonjs}/generated/encode-html.d.ts +1 -1
  24. package/dist/commonjs/generated/encode-html.d.ts.map +1 -0
  25. package/{lib → dist/commonjs}/generated/encode-html.js +4 -4
  26. package/dist/commonjs/generated/encode-html.js.map +1 -0
  27. package/{lib/esm → dist/commonjs}/index.d.ts +9 -15
  28. package/dist/commonjs/index.d.ts.map +1 -0
  29. package/{lib → dist/commonjs}/index.js +46 -55
  30. package/dist/commonjs/index.js.map +1 -0
  31. package/dist/commonjs/package.json +3 -0
  32. package/dist/esm/decode-codepoint.d.ts +19 -0
  33. package/dist/esm/decode-codepoint.d.ts.map +1 -0
  34. package/{lib/esm/decode_codepoint.js → dist/esm/decode-codepoint.js} +25 -8
  35. package/dist/esm/decode-codepoint.js.map +1 -0
  36. package/dist/esm/decode.d.ts +209 -0
  37. package/dist/esm/decode.d.ts.map +1 -0
  38. package/dist/esm/decode.js +497 -0
  39. package/dist/esm/decode.js.map +1 -0
  40. package/{lib → dist/esm}/encode.d.ts +2 -2
  41. package/dist/esm/encode.d.ts.map +1 -0
  42. package/{lib → dist}/esm/encode.js +25 -25
  43. package/dist/esm/encode.js.map +1 -0
  44. package/{lib → dist/esm}/escape.d.ts +2 -2
  45. package/dist/esm/escape.d.ts.map +1 -0
  46. package/{lib → dist}/esm/escape.js +39 -29
  47. package/dist/esm/escape.js.map +1 -0
  48. package/dist/esm/generated/decode-data-html.d.ts.map +1 -0
  49. package/dist/esm/generated/decode-data-html.js.map +1 -0
  50. package/dist/esm/generated/decode-data-xml.d.ts.map +1 -0
  51. package/dist/esm/generated/decode-data-xml.js.map +1 -0
  52. package/{lib → dist/esm}/generated/encode-html.d.ts +1 -1
  53. package/dist/esm/generated/encode-html.d.ts.map +1 -0
  54. package/{lib → dist}/esm/generated/encode-html.js +4 -4
  55. package/dist/esm/generated/encode-html.js.map +1 -0
  56. package/{lib → dist/esm}/index.d.ts +9 -15
  57. package/dist/esm/index.d.ts.map +1 -0
  58. package/{lib → dist}/esm/index.js +36 -45
  59. package/dist/esm/index.js.map +1 -0
  60. package/dist/esm/package.json +3 -0
  61. package/package.json +96 -66
  62. package/readme.md +5 -5
  63. package/src/decode-codepoint.ts +81 -0
  64. package/src/decode.spec.ts +320 -0
  65. package/src/decode.ts +620 -0
  66. package/src/encode.spec.ts +78 -0
  67. package/src/encode.ts +77 -0
  68. package/src/escape.spec.ts +14 -0
  69. package/src/escape.ts +144 -0
  70. package/src/generated/.eslintrc.json +10 -0
  71. package/src/generated/decode-data-html.ts +8 -0
  72. package/src/generated/decode-data-xml.ts +8 -0
  73. package/src/generated/encode-html.ts +17 -0
  74. package/src/index.spec.ts +125 -0
  75. package/src/index.ts +185 -0
  76. package/lib/decode.d.ts +0 -33
  77. package/lib/decode.d.ts.map +0 -1
  78. package/lib/decode.js +0 -179
  79. package/lib/decode.js.map +0 -1
  80. package/lib/decode_codepoint.d.ts +0 -4
  81. package/lib/decode_codepoint.d.ts.map +0 -1
  82. package/lib/decode_codepoint.js.map +0 -1
  83. package/lib/encode.d.ts.map +0 -1
  84. package/lib/encode.js.map +0 -1
  85. package/lib/escape.d.ts.map +0 -1
  86. package/lib/escape.js.map +0 -1
  87. package/lib/esm/decode.d.ts +0 -33
  88. package/lib/esm/decode.d.ts.map +0 -1
  89. package/lib/esm/decode.js +0 -166
  90. package/lib/esm/decode.js.map +0 -1
  91. package/lib/esm/decode_codepoint.d.ts +0 -4
  92. package/lib/esm/decode_codepoint.d.ts.map +0 -1
  93. package/lib/esm/decode_codepoint.js.map +0 -1
  94. package/lib/esm/encode.d.ts.map +0 -1
  95. package/lib/esm/encode.js.map +0 -1
  96. package/lib/esm/escape.d.ts.map +0 -1
  97. package/lib/esm/escape.js.map +0 -1
  98. package/lib/esm/generated/decode-data-html.d.ts.map +0 -1
  99. package/lib/esm/generated/decode-data-html.js.map +0 -1
  100. package/lib/esm/generated/decode-data-xml.d.ts.map +0 -1
  101. package/lib/esm/generated/decode-data-xml.js.map +0 -1
  102. package/lib/esm/generated/encode-html.d.ts.map +0 -1
  103. package/lib/esm/generated/encode-html.js.map +0 -1
  104. package/lib/esm/index.d.ts.map +0 -1
  105. package/lib/esm/index.js.map +0 -1
  106. package/lib/esm/package.json +0 -1
  107. package/lib/generated/decode-data-html.d.ts.map +0 -1
  108. package/lib/generated/decode-data-html.js.map +0 -1
  109. package/lib/generated/decode-data-xml.d.ts.map +0 -1
  110. package/lib/generated/decode-data-xml.js.map +0 -1
  111. package/lib/generated/encode-html.d.ts.map +0 -1
  112. package/lib/generated/encode-html.js.map +0 -1
  113. package/lib/index.d.ts.map +0 -1
  114. package/lib/index.js.map +0 -1
  115. /package/{lib/esm → dist/commonjs}/generated/decode-data-html.d.ts +0 -0
  116. /package/{lib/esm → dist/commonjs}/generated/decode-data-xml.d.ts +0 -0
  117. /package/{lib → dist/esm}/generated/decode-data-html.d.ts +0 -0
  118. /package/{lib → dist}/esm/generated/decode-data-html.js +0 -0
  119. /package/{lib → dist/esm}/generated/decode-data-xml.d.ts +0 -0
  120. /package/{lib → dist}/esm/generated/decode-data-xml.js +0 -0
package/src/decode.ts ADDED
@@ -0,0 +1,620 @@
1
+ import htmlDecodeTree from "./generated/decode-data-html.js";
2
+ import xmlDecodeTree from "./generated/decode-data-xml.js";
3
+ import { replaceCodePoint, fromCodePoint } from "./decode-codepoint.js";
4
+
5
+ const enum CharCodes {
6
+ NUM = 35, // "#"
7
+ SEMI = 59, // ";"
8
+ EQUALS = 61, // "="
9
+ ZERO = 48, // "0"
10
+ NINE = 57, // "9"
11
+ LOWER_A = 97, // "a"
12
+ LOWER_F = 102, // "f"
13
+ LOWER_X = 120, // "x"
14
+ LOWER_Z = 122, // "z"
15
+ UPPER_A = 65, // "A"
16
+ UPPER_F = 70, // "F"
17
+ UPPER_Z = 90, // "Z"
18
+ }
19
+
20
+ /** Bit that needs to be set to convert an upper case ASCII character to lower case */
21
+ const TO_LOWER_BIT = 0b10_0000;
22
+
23
+ export enum BinTrieFlags {
24
+ VALUE_LENGTH = 0b1100_0000_0000_0000,
25
+ BRANCH_LENGTH = 0b0011_1111_1000_0000,
26
+ JUMP_TABLE = 0b0000_0000_0111_1111,
27
+ }
28
+
29
+ function isNumber(code: number): boolean {
30
+ return code >= CharCodes.ZERO && code <= CharCodes.NINE;
31
+ }
32
+
33
+ function isHexadecimalCharacter(code: number): boolean {
34
+ return (
35
+ (code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F) ||
36
+ (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F)
37
+ );
38
+ }
39
+
40
+ function isAsciiAlphaNumeric(code: number): boolean {
41
+ return (
42
+ (code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z) ||
43
+ (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z) ||
44
+ isNumber(code)
45
+ );
46
+ }
47
+
48
+ /**
49
+ * Checks if the given character is an invalid end character for an entity in an attribute.
50
+ *
51
+ * Entities in attribute values that are followed by `=` or an ASCII alphanumeric character are left undecoded, and shouldn't lead to a parser error.
52
+ * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
53
+ */
54
+ function isEntityInAttributeInvalidEnd(code: number): boolean {
55
+ return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
56
+ }
57
+
58
+ const enum EntityDecoderState {
59
+ EntityStart,
60
+ NumericStart,
61
+ NumericDecimal,
62
+ NumericHex,
63
+ NamedEntity,
64
+ }
65
+
66
+ export enum DecodingMode {
67
+ /** Entities in text nodes that can end with any character. */
68
+ Legacy = 0,
69
+ /** Only allow entities terminated with a semicolon. */
70
+ Strict = 1,
71
+ /** Entities in attributes have limitations on ending characters. */
72
+ Attribute = 2,
73
+ }
74
+
75
+ /**
76
+ * Producers for character reference errors as defined in the HTML spec.
77
+ */
78
+ export interface EntityErrorProducer {
79
+ missingSemicolonAfterCharacterReference(): void;
80
+ absenceOfDigitsInNumericCharacterReference(
81
+ consumedCharacters: number,
82
+ ): void;
83
+ validateNumericCharacterReference(code: number): void;
84
+ }
85
+
86
+ /**
87
+ * Token decoder with support of writing partial entities.
88
+ */
89
+ export class EntityDecoder {
90
+ constructor(
91
+ /** The tree used to decode entities. */
92
+ private readonly decodeTree: Uint16Array,
93
+ /**
94
+ * The function that is called when a codepoint is decoded.
95
+ *
96
+ * For named entities that decode to two codepoints, this will be called
97
+ * twice, once per codepoint, each time with the same `consumed` value.
98
+ *
99
+ * @param codepoint The decoded codepoint.
100
+ * @param consumed The number of characters consumed by the decoder.
101
+ */
102
+ private readonly emitCodePoint: (cp: number, consumed: number) => void,
103
+ /** An object that is used to produce errors. */
104
+ private readonly errors?: EntityErrorProducer,
105
+ ) {}
106
+
107
+ /** The current state of the decoder. */
108
+ private state = EntityDecoderState.EntityStart;
109
+ /** Characters that were consumed while parsing an entity. */
110
+ private consumed = 1;
111
+ /**
112
+ * The result of the entity.
113
+ *
114
+ * Either the decode tree index of a named entity's value, or the
115
+ * codepoint of a numeric entity.
116
+ */
117
+ private result = 0;
118
+
119
+ /** The current index in the decode tree. */
120
+ private treeIndex = 0;
121
+ /** The number of characters that were consumed in excess. */
122
+ private excess = 1;
123
+ /** The mode in which the decoder is operating. */
124
+ private decodeMode = DecodingMode.Strict;
125
+
126
+ /** Resets the instance to make it reusable. */
127
+ startEntity(decodeMode: DecodingMode): void {
128
+ this.decodeMode = decodeMode;
129
+ this.state = EntityDecoderState.EntityStart;
130
+ this.result = 0;
131
+ this.treeIndex = 0;
132
+ this.excess = 1;
133
+ this.consumed = 1;
134
+ }
135
+
136
+ /**
137
+ * Write an entity to the decoder. This can be called multiple times with partial entities.
138
+ * If the entity is incomplete, the decoder will return -1.
139
+ *
140
+ * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
141
+ * entity is incomplete, and resume when the next string is written.
142
+ *
143
+ * @param input The string containing the entity (or a continuation of the entity).
144
+ * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
145
+ * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
146
+ */
147
+ write(input: string, offset: number): number {
148
+ switch (this.state) {
149
+ case EntityDecoderState.EntityStart: {
150
+ if (input.charCodeAt(offset) === CharCodes.NUM) {
151
+ this.state = EntityDecoderState.NumericStart;
152
+ this.consumed += 1;
153
+ return this.stateNumericStart(input, offset + 1);
154
+ }
155
+ this.state = EntityDecoderState.NamedEntity;
156
+ return this.stateNamedEntity(input, offset);
157
+ }
158
+
159
+ case EntityDecoderState.NumericStart: {
160
+ return this.stateNumericStart(input, offset);
161
+ }
162
+
163
+ case EntityDecoderState.NumericDecimal: {
164
+ return this.stateNumericDecimal(input, offset);
165
+ }
166
+
167
+ case EntityDecoderState.NumericHex: {
168
+ return this.stateNumericHex(input, offset);
169
+ }
170
+
171
+ case EntityDecoderState.NamedEntity: {
172
+ return this.stateNamedEntity(input, offset);
173
+ }
174
+ }
175
+ }
176
+
177
+ /**
178
+ * Switches between the numeric decimal and hexadecimal states.
179
+ *
180
+ * Equivalent to the `Numeric character reference state` in the HTML spec.
181
+ *
182
+ * @param input The string containing the entity (or a continuation of the entity).
183
+ * @param offset The current offset.
184
+ * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
185
+ */
186
+ private stateNumericStart(input: string, offset: number): number {
187
+ if (offset >= input.length) {
188
+ return -1;
189
+ }
190
+
191
+ if ((input.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
192
+ this.state = EntityDecoderState.NumericHex;
193
+ this.consumed += 1;
194
+ return this.stateNumericHex(input, offset + 1);
195
+ }
196
+
197
+ this.state = EntityDecoderState.NumericDecimal;
198
+ return this.stateNumericDecimal(input, offset);
199
+ }
200
+
201
+ private addToNumericResult(
202
+ input: string,
203
+ start: number,
204
+ end: number,
205
+ base: number,
206
+ ): void {
207
+ if (start !== end) {
208
+ const digitCount = end - start;
209
+ this.result =
210
+ this.result * Math.pow(base, digitCount) +
211
+ Number.parseInt(input.substr(start, digitCount), base);
212
+ this.consumed += digitCount;
213
+ }
214
+ }
215
+
216
+ /**
217
+ * Parses a hexadecimal numeric entity.
218
+ *
219
+ * Equivalent to the `Hexadecimal character reference state` in the HTML spec.
220
+ *
221
+ * @param input The string containing the entity (or a continuation of the entity).
222
+ * @param offset The current offset.
223
+ * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
224
+ */
225
+ private stateNumericHex(input: string, offset: number): number {
226
+ const startIndex = offset;
227
+
228
+ while (offset < input.length) {
229
+ const char = input.charCodeAt(offset);
230
+ if (isNumber(char) || isHexadecimalCharacter(char)) {
231
+ offset += 1;
232
+ } else {
233
+ this.addToNumericResult(input, startIndex, offset, 16);
234
+ return this.emitNumericEntity(char, 3);
235
+ }
236
+ }
237
+
238
+ this.addToNumericResult(input, startIndex, offset, 16);
239
+
240
+ return -1;
241
+ }
242
+
243
+ /**
244
+ * Parses a decimal numeric entity.
245
+ *
246
+ * Equivalent to the `Decimal character reference state` in the HTML spec.
247
+ *
248
+ * @param input The string containing the entity (or a continuation of the entity).
249
+ * @param offset The current offset.
250
+ * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
251
+ */
252
+ private stateNumericDecimal(input: string, offset: number): number {
253
+ const startIndex = offset;
254
+
255
+ while (offset < input.length) {
256
+ const char = input.charCodeAt(offset);
257
+ if (isNumber(char)) {
258
+ offset += 1;
259
+ } else {
260
+ this.addToNumericResult(input, startIndex, offset, 10);
261
+ return this.emitNumericEntity(char, 2);
262
+ }
263
+ }
264
+
265
+ this.addToNumericResult(input, startIndex, offset, 10);
266
+
267
+ return -1;
268
+ }
269
+
270
+ /**
271
+ * Validate and emit a numeric entity.
272
+ *
273
+ * Implements the logic from the `Hexadecimal character reference start
274
+ * state` and `Numeric character reference end state` in the HTML spec.
275
+ *
276
+ * @param lastCp The last code point of the entity. Used to see if the
277
+ * entity was terminated with a semicolon.
278
+ * @param expectedLength The minimum number of characters that should be
279
+ * consumed. Used to validate that at least one digit
280
+ * was consumed.
281
+ * @returns The number of characters that were consumed.
282
+ */
283
+ private emitNumericEntity(lastCp: number, expectedLength: number): number {
284
+ // Ensure we consumed at least one digit.
285
+ if (this.consumed <= expectedLength) {
286
+ this.errors?.absenceOfDigitsInNumericCharacterReference(
287
+ this.consumed,
288
+ );
289
+ return 0;
290
+ }
291
+
292
+ // Figure out if this is a legit end of the entity
293
+ if (lastCp === CharCodes.SEMI) {
294
+ this.consumed += 1;
295
+ } else if (this.decodeMode === DecodingMode.Strict) {
296
+ return 0;
297
+ }
298
+
299
+ this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
300
+
301
+ if (this.errors) {
302
+ if (lastCp !== CharCodes.SEMI) {
303
+ this.errors.missingSemicolonAfterCharacterReference();
304
+ }
305
+
306
+ this.errors.validateNumericCharacterReference(this.result);
307
+ }
308
+
309
+ return this.consumed;
310
+ }
311
+
312
+ /**
313
+ * Parses a named entity.
314
+ *
315
+ * Equivalent to the `Named character reference state` in the HTML spec.
316
+ *
317
+ * @param input The string containing the entity (or a continuation of the entity).
318
+ * @param offset The current offset.
319
+ * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
320
+ */
321
+ private stateNamedEntity(input: string, offset: number): number {
322
+ const { decodeTree } = this;
323
+ let current = decodeTree[this.treeIndex];
324
+ // The mask is the number of bytes of the value, including the current byte.
325
+ let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
326
+
327
+ for (; offset < input.length; offset++, this.excess++) {
328
+ const char = input.charCodeAt(offset);
329
+
330
+ this.treeIndex = determineBranch(
331
+ decodeTree,
332
+ current,
333
+ this.treeIndex + Math.max(1, valueLength),
334
+ char,
335
+ );
336
+
337
+ if (this.treeIndex < 0) {
338
+ return this.result === 0 ||
339
+ // If we are parsing an attribute
340
+ (this.decodeMode === DecodingMode.Attribute &&
341
+ // We shouldn't have consumed any characters after the entity,
342
+ (valueLength === 0 ||
343
+ // And there should be no invalid characters.
344
+ isEntityInAttributeInvalidEnd(char)))
345
+ ? 0
346
+ : this.emitNotTerminatedNamedEntity();
347
+ }
348
+
349
+ current = decodeTree[this.treeIndex];
350
+ valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
351
+
352
+ // If the branch is a value, store it and continue
353
+ if (valueLength !== 0) {
354
+ // If the entity is terminated by a semicolon, we are done.
355
+ if (char === CharCodes.SEMI) {
356
+ return this.emitNamedEntityData(
357
+ this.treeIndex,
358
+ valueLength,
359
+ this.consumed + this.excess,
360
+ );
361
+ }
362
+
363
+ // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
364
+ if (this.decodeMode !== DecodingMode.Strict) {
365
+ this.result = this.treeIndex;
366
+ this.consumed += this.excess;
367
+ this.excess = 0;
368
+ }
369
+ }
370
+ }
371
+
372
+ return -1;
373
+ }
374
+
375
+ /**
376
+ * Emit a named entity that was not terminated with a semicolon.
377
+ *
378
+ * @returns The number of characters consumed.
379
+ */
380
+ private emitNotTerminatedNamedEntity(): number {
381
+ const { result, decodeTree } = this;
382
+
383
+ const valueLength =
384
+ (decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
385
+
386
+ this.emitNamedEntityData(result, valueLength, this.consumed);
387
+ this.errors?.missingSemicolonAfterCharacterReference();
388
+
389
+ return this.consumed;
390
+ }
391
+
392
+ /**
393
+ * Emit a named entity.
394
+ *
395
+ * @param result The index of the entity in the decode tree.
396
+ * @param valueLength The number of bytes in the entity.
397
+ * @param consumed The number of characters consumed.
398
+ *
399
+ * @returns The number of characters consumed.
400
+ */
401
+ private emitNamedEntityData(
402
+ result: number,
403
+ valueLength: number,
404
+ consumed: number,
405
+ ): number {
406
+ const { decodeTree } = this;
407
+
408
+ this.emitCodePoint(
409
+ valueLength === 1
410
+ ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
411
+ : decodeTree[result + 1],
412
+ consumed,
413
+ );
414
+ if (valueLength === 3) {
415
+ // For multi-byte values, we need to emit the second byte.
416
+ this.emitCodePoint(decodeTree[result + 2], consumed);
417
+ }
418
+
419
+ return consumed;
420
+ }
421
+
422
+ /**
423
+ * Signal to the parser that the end of the input was reached.
424
+ *
425
+ * Remaining data will be emitted and relevant errors will be produced.
426
+ *
427
+ * @returns The number of characters consumed.
428
+ */
429
+ end(): number {
430
+ switch (this.state) {
431
+ case EntityDecoderState.NamedEntity: {
432
+ // Emit a named entity if we have one.
433
+ return this.result !== 0 &&
434
+ (this.decodeMode !== DecodingMode.Attribute ||
435
+ this.result === this.treeIndex)
436
+ ? this.emitNotTerminatedNamedEntity()
437
+ : 0;
438
+ }
439
+ // Otherwise, emit a numeric entity if we have one.
440
+ case EntityDecoderState.NumericDecimal: {
441
+ return this.emitNumericEntity(0, 2);
442
+ }
443
+ case EntityDecoderState.NumericHex: {
444
+ return this.emitNumericEntity(0, 3);
445
+ }
446
+ case EntityDecoderState.NumericStart: {
447
+ this.errors?.absenceOfDigitsInNumericCharacterReference(
448
+ this.consumed,
449
+ );
450
+ return 0;
451
+ }
452
+ case EntityDecoderState.EntityStart: {
453
+ // Return 0 if we have no entity.
454
+ return 0;
455
+ }
456
+ }
457
+ }
458
+ }
459
+
460
+ /**
461
+ * Creates a function that decodes entities in a string.
462
+ *
463
+ * @param decodeTree The decode tree.
464
+ * @returns A function that decodes entities in a string.
465
+ */
466
+ function getDecoder(decodeTree: Uint16Array) {
467
+ let returnValue = "";
468
+ const decoder = new EntityDecoder(
469
+ decodeTree,
470
+ (data) => (returnValue += fromCodePoint(data)),
471
+ );
472
+
473
+ return function decodeWithTrie(
474
+ input: string,
475
+ decodeMode: DecodingMode,
476
+ ): string {
477
+ let lastIndex = 0;
478
+ let offset = 0;
479
+
480
+ while ((offset = input.indexOf("&", offset)) >= 0) {
481
+ returnValue += input.slice(lastIndex, offset);
482
+
483
+ decoder.startEntity(decodeMode);
484
+
485
+ const length = decoder.write(
486
+ input,
487
+ // Skip the "&"
488
+ offset + 1,
489
+ );
490
+
491
+ if (length < 0) {
492
+ lastIndex = offset + decoder.end();
493
+ break;
494
+ }
495
+
496
+ lastIndex = offset + length;
497
+ // If `length` is 0, skip the current `&` and continue.
498
+ offset = length === 0 ? lastIndex + 1 : lastIndex;
499
+ }
500
+
501
+ const result = returnValue + input.slice(lastIndex);
502
+
503
+ // Make sure we don't keep a reference to the final string.
504
+ returnValue = "";
505
+
506
+ return result;
507
+ };
508
+ }
509
+
510
+ /**
511
+ * Determines the branch of the current node that is taken given the current
512
+ * character. This function is used to traverse the trie.
513
+ *
514
+ * @param decodeTree The trie.
515
+ * @param current The current node.
516
+ * @param nodeIndex The index right after the current node and its value.
517
+ * @param char The current character.
518
+ * @returns The index of the next node, or -1 if no branch is taken.
519
+ */
520
+ export function determineBranch(
521
+ decodeTree: Uint16Array,
522
+ current: number,
523
+ nodeIndex: number,
524
+ char: number,
525
+ ): number {
526
+ const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
527
+ const jumpOffset = current & BinTrieFlags.JUMP_TABLE;
528
+
529
+ // Case 1: Single branch encoded in jump offset
530
+ if (branchCount === 0) {
531
+ return jumpOffset !== 0 && char === jumpOffset ? nodeIndex : -1;
532
+ }
533
+
534
+ // Case 2: Multiple branches encoded in jump table
535
+ if (jumpOffset) {
536
+ const value = char - jumpOffset;
537
+
538
+ return value < 0 || value >= branchCount
539
+ ? -1
540
+ : decodeTree[nodeIndex + value] - 1;
541
+ }
542
+
543
+ // Case 3: Multiple branches encoded in dictionary
544
+
545
+ // Binary search for the character.
546
+ let lo = nodeIndex;
547
+ let hi = lo + branchCount - 1;
548
+
549
+ while (lo <= hi) {
550
+ const mid = (lo + hi) >>> 1;
551
+ const midValue = decodeTree[mid];
552
+
553
+ if (midValue < char) {
554
+ lo = mid + 1;
555
+ } else if (midValue > char) {
556
+ hi = mid - 1;
557
+ } else {
558
+ return decodeTree[mid + branchCount];
559
+ }
560
+ }
561
+
562
+ return -1;
563
+ }
564
+
565
+ const htmlDecoder = getDecoder(htmlDecodeTree);
566
+ const xmlDecoder = getDecoder(xmlDecodeTree);
567
+
568
+ /**
569
+ * Decodes an HTML string.
570
+ *
571
+ * @param htmlString The string to decode.
572
+ * @param mode The decoding mode.
573
+ * @returns The decoded string.
574
+ */
575
+ export function decodeHTML(
576
+ htmlString: string,
577
+ mode = DecodingMode.Legacy,
578
+ ): string {
579
+ return htmlDecoder(htmlString, mode);
580
+ }
581
+
582
+ /**
583
+ * Decodes an HTML string in an attribute.
584
+ *
585
+ * @param htmlAttribute The string to decode.
586
+ * @returns The decoded string.
587
+ */
588
+ export function decodeHTMLAttribute(htmlAttribute: string): string {
589
+ return htmlDecoder(htmlAttribute, DecodingMode.Attribute);
590
+ }
591
+
592
+ /**
593
+ * Decodes an HTML string, requiring all entities to be terminated by a semicolon.
594
+ *
595
+ * @param htmlString The string to decode.
596
+ * @returns The decoded string.
597
+ */
598
+ export function decodeHTMLStrict(htmlString: string): string {
599
+ return htmlDecoder(htmlString, DecodingMode.Strict);
600
+ }
601
+
602
+ /**
603
+ * Decodes an XML string, requiring all entities to be terminated by a semicolon.
604
+ *
605
+ * @param xmlString The string to decode.
606
+ * @returns The decoded string.
607
+ */
608
+ export function decodeXML(xmlString: string): string {
609
+ return xmlDecoder(xmlString, DecodingMode.Strict);
610
+ }
611
+
612
+ // Re-export for use by eg. htmlparser2
613
+ export { default as htmlDecodeTree } from "./generated/decode-data-html.js";
614
+ export { default as xmlDecodeTree } from "./generated/decode-data-xml.js";
615
+
616
+ export {
617
+ default as decodeCodePoint,
618
+ replaceCodePoint,
619
+ fromCodePoint,
620
+ } from "./decode-codepoint.js";
package/src/encode.spec.ts ADDED
@@ -0,0 +1,78 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import * as entities from "./index.js";
3
+
4
+ describe("Encode->decode test", () => {
5
+ const testcases = [
6
+ {
7
+ input: "asdf & ÿ ü '",
8
+ xml: "asdf &amp; &#xff; &#xfc; &apos;",
9
+ html: "asdf &amp; &yuml; &uuml; &apos;",
10
+ },
11
+ {
12
+ input: "&#38;",
13
+ xml: "&amp;#38;",
14
+ html: "&amp;&num;38&semi;",
15
+ },
16
+ ];
17
+
18
+ for (const { input, xml, html } of testcases) {
19
+ const encodedXML = entities.encodeXML(input);
20
+ it(`should XML encode ${input}`, () => expect(encodedXML).toBe(xml));
21
+ it(`should default to XML encode ${input}`, () =>
22
+ expect(entities.encode(input)).toBe(xml));
23
+ it(`should XML decode ${encodedXML}`, () =>
24
+ expect(entities.decodeXML(encodedXML)).toBe(input));
25
+ it(`should default to XML encode ${encodedXML}`, () =>
26
+ expect(entities.decode(encodedXML)).toBe(input));
27
+ it(`should default strict to XML encode ${encodedXML}`, () =>
28
+ expect(entities.decodeStrict(encodedXML)).toBe(input));
29
+
30
+ const encodedHTML5 = entities.encodeHTML5(input);
31
+ it(`should HTML5 encode ${input}`, () =>
32
+ expect(encodedHTML5).toBe(html));
33
+ it(`should HTML5 decode ${encodedHTML5}`, () =>
34
+ expect(entities.decodeHTML(encodedHTML5)).toBe(input));
35
+ it("should encode emojis", () =>
36
+ expect(entities.encodeHTML5("😄🍾🥳💥😇")).toBe(
37
+ "&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
38
+ ));
39
+ }
40
+
41
+ it("should encode data URIs (issue #16)", () => {
42
+ const data =
43
+ "";
44
+ expect(entities.decode(entities.encode(data))).toBe(data);
45
+ });
46
+
47
+ it("should HTML encode all ASCII characters", () => {
48
+ for (let index = 0; index < 128; index++) {
49
+ const char = String.fromCharCode(index);
50
+ const encoded = entities.encodeHTML(char);
51
+ const decoded = entities.decodeHTML(encoded);
52
+ expect(decoded).toBe(char);
53
+ }
54
+ });
55
+
56
+ it("should encode trailing parts of entities", () =>
57
+ expect(entities.encodeHTML("\uD835")).toBe("&#xd835;"));
58
+
59
+ it("should encode surrogate pair with first surrogate equivalent of entity, without corresponding entity", () =>
60
+ expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#x1d4a4;"));
61
+ });
62
+
63
+ describe("encodeNonAsciiHTML", () => {
64
+ it("should encode all non-ASCII characters", () =>
65
+ expect(entities.encodeNonAsciiHTML("<test> #123! übermaßen")).toBe(
66
+ "&lt;test&gt; #123! &uuml;berma&szlig;en",
67
+ ));
68
+
69
+ it("should encode emojis", () =>
70
+ expect(entities.encodeNonAsciiHTML("😄🍾🥳💥😇")).toBe(
71
+ "&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
72
+ ));
73
+
74
+ it("should encode chars above surrogates", () =>
75
+ expect(entities.encodeNonAsciiHTML("♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️")).toBe(
76
+ "&#x2652;&#xfe0f;&#x2653;&#xfe0f;&#x2648;&#xfe0f;&#x2649;&#xfe0f;&#x264a;&#xfe0f;&#x264b;&#xfe0f;&#x264c;&#xfe0f;&#x264d;&#xfe0f;&#x264e;&#xfe0f;&#x264f;&#xfe0f;&#x2650;&#xfe0f;&#x2651;&#xfe0f;",
77
+ ));
78
+ });