entities 6.0.1 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/dist/decode-codepoint.d.ts +8 -0
  2. package/dist/decode-codepoint.d.ts.map +1 -0
  3. package/dist/decode-codepoint.js +46 -0
  4. package/dist/decode-codepoint.js.map +1 -0
  5. package/dist/{esm/decode.d.ts → decode.d.ts} +11 -26
  6. package/dist/decode.d.ts.map +1 -0
  7. package/dist/{esm/decode.js → decode.js} +130 -83
  8. package/dist/decode.js.map +1 -0
  9. package/dist/{commonjs/encode.d.ts → encode.d.ts} +2 -0
  10. package/dist/encode.d.ts.map +1 -0
  11. package/dist/encode.js +90 -0
  12. package/dist/encode.js.map +1 -0
  13. package/dist/{esm/escape.d.ts → escape.d.ts} +13 -8
  14. package/dist/escape.d.ts.map +1 -0
  15. package/dist/{esm/escape.js → escape.js} +49 -34
  16. package/dist/escape.js.map +1 -0
  17. package/dist/generated/decode-data-html.d.ts +3 -0
  18. package/dist/generated/decode-data-html.d.ts.map +1 -0
  19. package/dist/generated/decode-data-html.js +5 -0
  20. package/dist/generated/decode-data-html.js.map +1 -0
  21. package/dist/generated/decode-data-xml.d.ts +3 -0
  22. package/dist/generated/decode-data-xml.d.ts.map +1 -0
  23. package/dist/generated/decode-data-xml.js +5 -0
  24. package/dist/generated/decode-data-xml.js.map +1 -0
  25. package/dist/generated/encode-html.d.ts +5 -0
  26. package/dist/generated/encode-html.d.ts.map +1 -0
  27. package/dist/generated/encode-html.js +12 -0
  28. package/dist/generated/encode-html.js.map +1 -0
  29. package/dist/{commonjs/index.d.ts → index.d.ts} +10 -17
  30. package/dist/index.d.ts.map +1 -0
  31. package/dist/{esm/index.js → index.js} +9 -25
  32. package/dist/index.js.map +1 -0
  33. package/dist/internal/bin-trie-flags.d.ts +17 -0
  34. package/dist/internal/bin-trie-flags.d.ts.map +1 -0
  35. package/dist/internal/bin-trie-flags.js +18 -0
  36. package/dist/internal/bin-trie-flags.js.map +1 -0
  37. package/dist/internal/decode-shared.d.ts +7 -0
  38. package/dist/internal/decode-shared.d.ts.map +1 -0
  39. package/dist/internal/decode-shared.js +17 -0
  40. package/dist/internal/decode-shared.js.map +1 -0
  41. package/dist/internal/encode-shared.d.ts +33 -0
  42. package/dist/internal/encode-shared.d.ts.map +1 -0
  43. package/dist/internal/encode-shared.js +93 -0
  44. package/dist/internal/encode-shared.js.map +1 -0
  45. package/package.json +38 -73
  46. package/readme.md +36 -27
  47. package/src/decode-codepoint.ts +1 -32
  48. package/src/decode.ts +127 -76
  49. package/src/encode.ts +49 -31
  50. package/src/escape.ts +50 -38
  51. package/src/generated/decode-data-html.ts +4 -5
  52. package/src/generated/decode-data-xml.ts +4 -5
  53. package/src/generated/encode-html.ts +15 -14
  54. package/src/index.ts +23 -49
  55. package/src/internal/bin-trie-flags.ts +16 -0
  56. package/src/internal/decode-shared.ts +18 -0
  57. package/src/internal/encode-shared.ts +123 -0
  58. package/decode.d.ts +0 -1
  59. package/decode.js +0 -3
  60. package/dist/commonjs/decode-codepoint.d.ts +0 -19
  61. package/dist/commonjs/decode-codepoint.d.ts.map +0 -1
  62. package/dist/commonjs/decode-codepoint.js +0 -77
  63. package/dist/commonjs/decode-codepoint.js.map +0 -1
  64. package/dist/commonjs/decode.d.ts +0 -209
  65. package/dist/commonjs/decode.d.ts.map +0 -1
  66. package/dist/commonjs/decode.js +0 -511
  67. package/dist/commonjs/decode.js.map +0 -1
  68. package/dist/commonjs/encode.d.ts.map +0 -1
  69. package/dist/commonjs/encode.js +0 -73
  70. package/dist/commonjs/encode.js.map +0 -1
  71. package/dist/commonjs/escape.d.ts +0 -43
  72. package/dist/commonjs/escape.d.ts.map +0 -1
  73. package/dist/commonjs/escape.js +0 -121
  74. package/dist/commonjs/escape.js.map +0 -1
  75. package/dist/commonjs/generated/decode-data-html.d.ts +0 -2
  76. package/dist/commonjs/generated/decode-data-html.d.ts.map +0 -1
  77. package/dist/commonjs/generated/decode-data-html.js +0 -10
  78. package/dist/commonjs/generated/decode-data-html.js.map +0 -1
  79. package/dist/commonjs/generated/decode-data-xml.d.ts +0 -2
  80. package/dist/commonjs/generated/decode-data-xml.d.ts.map +0 -1
  81. package/dist/commonjs/generated/decode-data-xml.js +0 -10
  82. package/dist/commonjs/generated/decode-data-xml.js.map +0 -1
  83. package/dist/commonjs/generated/encode-html.d.ts +0 -8
  84. package/dist/commonjs/generated/encode-html.d.ts.map +0 -1
  85. package/dist/commonjs/generated/encode-html.js +0 -13
  86. package/dist/commonjs/generated/encode-html.js.map +0 -1
  87. package/dist/commonjs/index.d.ts.map +0 -1
  88. package/dist/commonjs/index.js +0 -131
  89. package/dist/commonjs/index.js.map +0 -1
  90. package/dist/commonjs/package.json +0 -3
  91. package/dist/esm/decode-codepoint.d.ts +0 -19
  92. package/dist/esm/decode-codepoint.d.ts.map +0 -1
  93. package/dist/esm/decode-codepoint.js +0 -72
  94. package/dist/esm/decode-codepoint.js.map +0 -1
  95. package/dist/esm/decode.d.ts.map +0 -1
  96. package/dist/esm/decode.js.map +0 -1
  97. package/dist/esm/encode.d.ts +0 -22
  98. package/dist/esm/encode.d.ts.map +0 -1
  99. package/dist/esm/encode.js +0 -69
  100. package/dist/esm/encode.js.map +0 -1
  101. package/dist/esm/escape.d.ts.map +0 -1
  102. package/dist/esm/escape.js.map +0 -1
  103. package/dist/esm/generated/decode-data-html.d.ts +0 -2
  104. package/dist/esm/generated/decode-data-html.d.ts.map +0 -1
  105. package/dist/esm/generated/decode-data-html.js +0 -7
  106. package/dist/esm/generated/decode-data-html.js.map +0 -1
  107. package/dist/esm/generated/decode-data-xml.d.ts +0 -2
  108. package/dist/esm/generated/decode-data-xml.d.ts.map +0 -1
  109. package/dist/esm/generated/decode-data-xml.js +0 -7
  110. package/dist/esm/generated/decode-data-xml.js.map +0 -1
  111. package/dist/esm/generated/encode-html.d.ts +0 -8
  112. package/dist/esm/generated/encode-html.d.ts.map +0 -1
  113. package/dist/esm/generated/encode-html.js +0 -10
  114. package/dist/esm/generated/encode-html.js.map +0 -1
  115. package/dist/esm/index.d.ts +0 -96
  116. package/dist/esm/index.d.ts.map +0 -1
  117. package/dist/esm/index.js.map +0 -1
  118. package/dist/esm/package.json +0 -3
  119. package/escape.d.ts +0 -1
  120. package/escape.js +0 -3
  121. package/src/decode.spec.ts +0 -320
  122. package/src/encode.spec.ts +0 -78
  123. package/src/escape.spec.ts +0 -14
  124. package/src/generated/.eslintrc.json +0 -10
  125. package/src/index.spec.ts +0 -125
package/src/decode.ts CHANGED
@@ -1,6 +1,7 @@
1
+ import { replaceCodePoint } from "./decode-codepoint.js";
1
2
  import { htmlDecodeTree } from "./generated/decode-data-html.js";
2
3
  import { xmlDecodeTree } from "./generated/decode-data-xml.js";
3
- import { replaceCodePoint, fromCodePoint } from "./decode-codepoint.js";
4
+ import { BinTrieFlags } from "./internal/bin-trie-flags.js";
4
5
 
5
6
  const enum CharCodes {
6
7
  NUM = 35, // "#"
@@ -20,12 +21,6 @@ const enum CharCodes {
20
21
  /** Bit that needs to be set to convert an upper case ASCII character to lower case */
21
22
  const TO_LOWER_BIT = 0b10_0000;
22
23
 
23
- export enum BinTrieFlags {
24
- VALUE_LENGTH = 0b1100_0000_0000_0000,
25
- BRANCH_LENGTH = 0b0011_1111_1000_0000,
26
- JUMP_TABLE = 0b0000_0000_0111_1111,
27
- }
28
-
29
24
  function isNumber(code: number): boolean {
30
25
  return code >= CharCodes.ZERO && code <= CharCodes.NINE;
31
26
  }
@@ -50,6 +45,7 @@ function isAsciiAlphaNumeric(code: number): boolean {
50
45
  *
51
46
  * Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
52
47
  * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
48
+ * @param code Code point to decode.
53
49
  */
54
50
  function isEntityInAttributeInvalidEnd(code: number): boolean {
55
51
  return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
@@ -63,6 +59,9 @@ const enum EntityDecoderState {
63
59
  NamedEntity,
64
60
  }
65
61
 
62
+ /**
63
+ * Decoding mode for named entities.
64
+ */
66
65
  export enum DecodingMode {
67
66
  /** Entities in text nodes that can end with any character. */
68
67
  Legacy = 0,
@@ -89,13 +88,13 @@ export interface EntityErrorProducer {
89
88
  export class EntityDecoder {
90
89
  constructor(
91
90
  /** The tree used to decode entities. */
91
+ // biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
92
92
  private readonly decodeTree: Uint16Array,
93
93
  /**
94
94
  * The function that is called when a codepoint is decoded.
95
95
  *
96
96
  * For multi-byte named entities, this will be called multiple times,
97
97
  * with the second codepoint, and the same `consumed` value.
98
- *
99
98
  * @param codepoint The decoded codepoint.
100
99
  * @param consumed The number of bytes consumed by the decoder.
101
100
  */
@@ -122,8 +121,13 @@ export class EntityDecoder {
122
121
  private excess = 1;
123
122
  /** The mode in which the decoder is operating. */
124
123
  private decodeMode = DecodingMode.Strict;
124
+ /** The number of characters that have been consumed in the current run. */
125
+ private runConsumed = 0;
125
126
 
126
- /** Resets the instance to make it reusable. */
127
+ /**
128
+ * Resets the instance to make it reusable.
129
+ * @param decodeMode Entity decoding mode to use.
130
+ */
127
131
  startEntity(decodeMode: DecodingMode): void {
128
132
  this.decodeMode = decodeMode;
129
133
  this.state = EntityDecoderState.EntityStart;
@@ -131,6 +135,7 @@ export class EntityDecoder {
131
135
  this.treeIndex = 0;
132
136
  this.excess = 1;
133
137
  this.consumed = 1;
138
+ this.runConsumed = 0;
134
139
  }
135
140
 
136
141
  /**
@@ -139,7 +144,6 @@ export class EntityDecoder {
139
144
  *
140
145
  * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
141
146
  * entity is incomplete, and resume when the next string is written.
142
- *
143
147
  * @param input The string containing the entity (or a continuation of the entity).
144
148
  * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
145
149
  * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
@@ -178,7 +182,6 @@ export class EntityDecoder {
178
182
  * Switches between the numeric decimal and hexadecimal states.
179
183
  *
180
184
  * Equivalent to the `Numeric character reference state` in the HTML spec.
181
- *
182
185
  * @param input The string containing the entity (or a continuation of the entity).
183
186
  * @param offset The current offset.
184
187
  * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
@@ -198,73 +201,53 @@ export class EntityDecoder {
198
201
  return this.stateNumericDecimal(input, offset);
199
202
  }
200
203
 
201
- private addToNumericResult(
202
- input: string,
203
- start: number,
204
- end: number,
205
- base: number,
206
- ): void {
207
- if (start !== end) {
208
- const digitCount = end - start;
209
- this.result =
210
- this.result * Math.pow(base, digitCount) +
211
- Number.parseInt(input.substr(start, digitCount), base);
212
- this.consumed += digitCount;
213
- }
214
- }
215
-
216
204
  /**
217
205
  * Parses a hexadecimal numeric entity.
218
206
  *
219
207
  * Equivalent to the `Hexadecimal character reference state` in the HTML spec.
220
- *
221
208
  * @param input The string containing the entity (or a continuation of the entity).
222
209
  * @param offset The current offset.
223
210
  * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
224
211
  */
225
212
  private stateNumericHex(input: string, offset: number): number {
226
- const startIndex = offset;
227
-
228
213
  while (offset < input.length) {
229
214
  const char = input.charCodeAt(offset);
230
215
  if (isNumber(char) || isHexadecimalCharacter(char)) {
231
- offset += 1;
216
+ // Convert hex digit to value (0-15); 'a'/'A' -> 10.
217
+ const digit =
218
+ char <= CharCodes.NINE
219
+ ? char - CharCodes.ZERO
220
+ : (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
221
+ this.result = this.result * 16 + digit;
222
+ this.consumed++;
223
+ offset++;
232
224
  } else {
233
- this.addToNumericResult(input, startIndex, offset, 16);
234
225
  return this.emitNumericEntity(char, 3);
235
226
  }
236
227
  }
237
-
238
- this.addToNumericResult(input, startIndex, offset, 16);
239
-
240
- return -1;
228
+ return -1; // Incomplete entity
241
229
  }
242
230
 
243
231
  /**
244
232
  * Parses a decimal numeric entity.
245
233
  *
246
234
  * Equivalent to the `Decimal character reference state` in the HTML spec.
247
- *
248
235
  * @param input The string containing the entity (or a continuation of the entity).
249
236
  * @param offset The current offset.
250
237
  * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
251
238
  */
252
239
  private stateNumericDecimal(input: string, offset: number): number {
253
- const startIndex = offset;
254
-
255
240
  while (offset < input.length) {
256
241
  const char = input.charCodeAt(offset);
257
242
  if (isNumber(char)) {
258
- offset += 1;
243
+ this.result = this.result * 10 + (char - CharCodes.ZERO);
244
+ this.consumed++;
245
+ offset++;
259
246
  } else {
260
- this.addToNumericResult(input, startIndex, offset, 10);
261
247
  return this.emitNumericEntity(char, 2);
262
248
  }
263
249
  }
264
-
265
- this.addToNumericResult(input, startIndex, offset, 10);
266
-
267
- return -1;
250
+ return -1; // Incomplete entity
268
251
  }
269
252
 
270
253
  /**
@@ -272,7 +255,6 @@ export class EntityDecoder {
272
255
  *
273
256
  * Implements the logic from the `Hexadecimal character reference start
274
257
  * state` and `Numeric character reference end state` in the HTML spec.
275
- *
276
258
  * @param lastCp The last code point of the entity. Used to see if the
277
259
  * entity was terminated with a semicolon.
278
260
  * @param expectedLength The minimum number of characters that should be
@@ -313,7 +295,6 @@ export class EntityDecoder {
313
295
  * Parses a named entity.
314
296
  *
315
297
  * Equivalent to the `Named character reference state` in the HTML spec.
316
- *
317
298
  * @param input The string containing the entity (or a continuation of the entity).
318
299
  * @param offset The current offset.
319
300
  * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
@@ -321,12 +302,84 @@ export class EntityDecoder {
321
302
  private stateNamedEntity(input: string, offset: number): number {
322
303
  const { decodeTree } = this;
323
304
  let current = decodeTree[this.treeIndex];
324
- // The mask is the number of bytes of the value, including the current byte.
305
+ // The length is the number of bytes of the value, including the current byte.
325
306
  let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
326
307
 
327
- for (; offset < input.length; offset++, this.excess++) {
308
+ while (offset < input.length) {
309
+ // Handle compact runs (possibly inline): valueLength == 0 and the FLAG13 bit set.
310
+ if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
311
+ const runLength =
312
+ (current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
313
+
314
+ // If we are starting a run, check the first char.
315
+ if (this.runConsumed === 0) {
316
+ const firstChar = current & BinTrieFlags.JUMP_TABLE;
317
+ if (input.charCodeAt(offset) !== firstChar) {
318
+ return this.result === 0
319
+ ? 0
320
+ : this.emitNotTerminatedNamedEntity();
321
+ }
322
+ offset++;
323
+ this.excess++;
324
+ this.runConsumed++;
325
+ }
326
+
327
+ // Check remaining characters in the run.
328
+ while (this.runConsumed < runLength) {
329
+ if (offset >= input.length) {
330
+ return -1;
331
+ }
332
+
333
+ const charIndexInPacked = this.runConsumed - 1;
334
+ const packedWord =
335
+ decodeTree[
336
+ this.treeIndex + 1 + (charIndexInPacked >> 1)
337
+ ];
338
+ const expectedChar =
339
+ charIndexInPacked % 2 === 0
340
+ ? packedWord & 0xff
341
+ : (packedWord >> 8) & 0xff;
342
+
343
+ if (input.charCodeAt(offset) !== expectedChar) {
344
+ this.runConsumed = 0;
345
+ return this.result === 0
346
+ ? 0
347
+ : this.emitNotTerminatedNamedEntity();
348
+ }
349
+ offset++;
350
+ this.excess++;
351
+ this.runConsumed++;
352
+ }
353
+
354
+ this.runConsumed = 0;
355
+ this.treeIndex += 1 + (runLength >> 1);
356
+ current = decodeTree[this.treeIndex];
357
+ valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
358
+ }
359
+
360
+ if (offset >= input.length) break;
361
+
328
362
  const char = input.charCodeAt(offset);
329
363
 
364
+ /*
365
+ * Implicit semicolon handling for nodes that require a semicolon but
366
+ * don't have an explicit ';' branch stored in the trie. If we have
367
+ * a value on the current node, it requires a semicolon, and the
368
+ * current input character is a semicolon, emit the entity using the
369
+ * current node (without descending further).
370
+ */
371
+ if (
372
+ char === CharCodes.SEMI &&
373
+ valueLength !== 0 &&
374
+ (current & BinTrieFlags.FLAG13) !== 0
375
+ ) {
376
+ return this.emitNamedEntityData(
377
+ this.treeIndex,
378
+ valueLength,
379
+ this.consumed + this.excess,
380
+ );
381
+ }
382
+
330
383
  this.treeIndex = determineBranch(
331
384
  decodeTree,
332
385
  current,
@@ -361,12 +414,18 @@ export class EntityDecoder {
361
414
  }
362
415
 
363
416
  // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
364
- if (this.decodeMode !== DecodingMode.Strict) {
417
+ if (
418
+ this.decodeMode !== DecodingMode.Strict &&
419
+ (current & BinTrieFlags.FLAG13) === 0
420
+ ) {
365
421
  this.result = this.treeIndex;
366
422
  this.consumed += this.excess;
367
423
  this.excess = 0;
368
424
  }
369
425
  }
426
+ // Increment offset & excess for next iteration
427
+ offset++;
428
+ this.excess++;
370
429
  }
371
430
 
372
431
  return -1;
@@ -374,7 +433,6 @@ export class EntityDecoder {
374
433
 
375
434
  /**
376
435
  * Emit a named entity that was not terminated with a semicolon.
377
- *
378
436
  * @returns The number of characters consumed.
379
437
  */
380
438
  private emitNotTerminatedNamedEntity(): number {
@@ -391,11 +449,9 @@ export class EntityDecoder {
391
449
 
392
450
  /**
393
451
  * Emit a named entity.
394
- *
395
452
  * @param result The index of the entity in the decode tree.
396
453
  * @param valueLength The number of bytes in the entity.
397
454
  * @param consumed The number of characters consumed.
398
- *
399
455
  * @returns The number of characters consumed.
400
456
  */
401
457
  private emitNamedEntityData(
@@ -407,7 +463,8 @@ export class EntityDecoder {
407
463
 
408
464
  this.emitCodePoint(
409
465
  valueLength === 1
410
- ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
466
+ ? decodeTree[result] &
467
+ ~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
411
468
  : decodeTree[result + 1],
412
469
  consumed,
413
470
  );
@@ -423,7 +480,6 @@ export class EntityDecoder {
423
480
  * Signal to the parser that the end of the input was reached.
424
481
  *
425
482
  * Remaining data will be emitted and relevant errors will be produced.
426
- *
427
483
  * @returns The number of characters consumed.
428
484
  */
429
485
  end(): number {
@@ -459,7 +515,6 @@ export class EntityDecoder {
459
515
 
460
516
  /**
461
517
  * Creates a function that decodes entities in a string.
462
- *
463
518
  * @param decodeTree The decode tree.
464
519
  * @returns A function that decodes entities in a string.
465
520
  */
@@ -467,7 +522,7 @@ function getDecoder(decodeTree: Uint16Array) {
467
522
  let returnValue = "";
468
523
  const decoder = new EntityDecoder(
469
524
  decodeTree,
470
- (data) => (returnValue += fromCodePoint(data)),
525
+ (data) => (returnValue += String.fromCodePoint(data)),
471
526
  );
472
527
 
473
528
  return function decodeWithTrie(
@@ -510,10 +565,9 @@ function getDecoder(decodeTree: Uint16Array) {
510
565
  /**
511
566
  * Determines the branch of the current node that is taken given the current
512
567
  * character. This function is used to traverse the trie.
513
- *
514
568
  * @param decodeTree The trie.
515
569
  * @param current The current node.
516
- * @param nodeIdx The index right after the current node and its value.
570
+ * @param nodeIndex Index immediately after the current node header.
517
571
  * @param char The current character.
518
572
  * @returns The index of the next node, or -1 if no branch is taken.
519
573
  */
@@ -540,22 +594,28 @@ export function determineBranch(
540
594
  : decodeTree[nodeIndex + value] - 1;
541
595
  }
542
596
 
543
- // Case 3: Multiple branches encoded in dictionary
597
+ // Case 3: Multiple branches encoded in packed dictionary (two keys per uint16)
598
+ const packedKeySlots = (branchCount + 1) >> 1;
544
599
 
545
- // Binary search for the character.
546
- let lo = nodeIndex;
547
- let hi = lo + branchCount - 1;
600
+ /*
601
+ * Treat packed keys as a virtual sorted array of length `branchCount`.
602
+ * Key(i) = low byte for even i, high byte for odd i in slot i>>1.
603
+ */
604
+ let lo = 0;
605
+ let hi = branchCount - 1;
548
606
 
549
607
  while (lo <= hi) {
550
608
  const mid = (lo + hi) >>> 1;
551
- const midValue = decodeTree[mid];
609
+ const slot = mid >> 1;
610
+ const packed = decodeTree[nodeIndex + slot];
611
+ const midKey = (packed >> ((mid & 1) * 8)) & 0xff;
552
612
 
553
- if (midValue < char) {
613
+ if (midKey < char) {
554
614
  lo = mid + 1;
555
- } else if (midValue > char) {
615
+ } else if (midKey > char) {
556
616
  hi = mid - 1;
557
617
  } else {
558
- return decodeTree[mid + branchCount];
618
+ return decodeTree[nodeIndex + packedKeySlots + mid];
559
619
  }
560
620
  }
561
621
 
@@ -567,7 +627,6 @@ const xmlDecoder = /* #__PURE__ */ getDecoder(xmlDecodeTree);
567
627
 
568
628
  /**
569
629
  * Decodes an HTML string.
570
- *
571
630
  * @param htmlString The string to decode.
572
631
  * @param mode The decoding mode.
573
632
  * @returns The decoded string.
@@ -581,7 +640,6 @@ export function decodeHTML(
581
640
 
582
641
  /**
583
642
  * Decodes an HTML string in an attribute.
584
- *
585
643
  * @param htmlAttribute The string to decode.
586
644
  * @returns The decoded string.
587
645
  */
@@ -591,7 +649,6 @@ export function decodeHTMLAttribute(htmlAttribute: string): string {
591
649
 
592
650
  /**
593
651
  * Decodes an HTML string, requiring all entities to be terminated by a semicolon.
594
- *
595
652
  * @param htmlString The string to decode.
596
653
  * @returns The decoded string.
597
654
  */
@@ -601,7 +658,6 @@ export function decodeHTMLStrict(htmlString: string): string {
601
658
 
602
659
  /**
603
660
  * Decodes an XML string, requiring all entities to be terminated by a semicolon.
604
- *
605
661
  * @param xmlString The string to decode.
606
662
  * @returns The decoded string.
607
663
  */
@@ -609,12 +665,7 @@ export function decodeXML(xmlString: string): string {
609
665
  return xmlDecoder(xmlString, DecodingMode.Strict);
610
666
  }
611
667
 
668
+ export { replaceCodePoint } from "./decode-codepoint.js";
612
669
  // Re-export for use by eg. htmlparser2
613
670
  export { htmlDecodeTree } from "./generated/decode-data-html.js";
614
671
  export { xmlDecodeTree } from "./generated/decode-data-xml.js";
615
-
616
- export {
617
- decodeCodePoint,
618
- replaceCodePoint,
619
- fromCodePoint,
620
- } from "./decode-codepoint.js";
package/src/encode.ts CHANGED
@@ -1,7 +1,17 @@
1
+ import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
1
2
  import { htmlTrie } from "./generated/encode-html.js";
2
- import { xmlReplacer, getCodePoint } from "./escape.js";
3
3
 
4
- const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
4
+ /**
5
+ * We store the characters to consider as a compact bitset for fast lookups.
6
+ */
7
+ const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
8
+ 0x16_00, // Bits for 09,0A,0C
9
+ 0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
10
+ 0xf8_00_00_01, // 64..95 -> 40, 5B-5F
11
+ 0x38_00_00_01, // 96..127-> 60, 7B-7D
12
+ ]);
13
+
14
+ const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
5
15
 
6
16
  /**
7
17
  * Encodes all characters in the input using HTML entities. This includes
@@ -13,9 +23,10 @@ const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
13
23
  *
14
24
  * If a character has no equivalent entity, a numeric hexadecimal reference
15
25
  * (eg. `&#xfc;`) will be used.
26
+ * @param input Input string to encode.
16
27
  */
17
28
  export function encodeHTML(input: string): string {
18
- return encodeHTMLTrieRe(htmlReplacer, input);
29
+ return encodeHTMLTrieRe(HTML_BITSET, input);
19
30
  }
20
31
  /**
21
32
  * Encodes all non-ASCII characters, as well as characters not valid in HTML
@@ -24,54 +35,61 @@ export function encodeHTML(input: string): string {
24
35
  *
25
36
  * If a character has no equivalent entity, a numeric hexadecimal reference
26
37
  * (eg. `&#xfc;`) will be used.
38
+ * @param input Input string to encode.
27
39
  */
28
40
  export function encodeNonAsciiHTML(input: string): string {
29
- return encodeHTMLTrieRe(xmlReplacer, input);
41
+ return encodeHTMLTrieRe(XML_BITSET, input);
30
42
  }
31
43
 
32
- function encodeHTMLTrieRe(regExp: RegExp, input: string): string {
33
- let returnValue = "";
34
- let lastIndex = 0;
35
- let match;
44
+ function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
45
+ let out: string | undefined;
46
+ let last = 0; // Start of the next untouched slice.
47
+ const { length } = input;
36
48
 
37
- while ((match = regExp.exec(input)) !== null) {
38
- const { index } = match;
39
- returnValue += input.substring(lastIndex, index);
49
+ for (let index = 0; index < length; index++) {
40
50
  const char = input.charCodeAt(index);
41
- let next = htmlTrie.get(char);
51
+ // Skip ASCII characters that don't need encoding
52
+ if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
53
+ continue;
54
+ }
42
55
 
43
- if (typeof next === "object") {
44
- // We are in a branch. Try to match the next char.
45
- if (index + 1 < input.length) {
56
+ if (out === undefined) out = input.substring(0, index);
57
+ else if (last !== index) out += input.substring(last, index);
58
+
59
+ let node = htmlTrie.get(char);
60
+
61
+ if (typeof node === "object") {
62
+ if (index + 1 < length) {
46
63
  const nextChar = input.charCodeAt(index + 1);
47
64
  const value =
48
- typeof next.n === "number"
49
- ? next.n === nextChar
50
- ? next.o
65
+ typeof node.next === "number"
66
+ ? node.next === nextChar
67
+ ? node.nextValue
51
68
  : undefined
52
- : next.n.get(nextChar);
69
+ : node.next.get(nextChar);
53
70
 
54
71
  if (value !== undefined) {
55
- returnValue += value;
56
- lastIndex = regExp.lastIndex += 1;
72
+ out += value;
73
+ index++;
74
+ last = index + 1;
57
75
  continue;
58
76
  }
59
77
  }
60
-
61
- next = next.v;
78
+ node = node.value;
62
79
  }
63
80
 
64
- // We might have a tree node without a value; skip and use a numeric entity.
65
- if (next === undefined) {
81
+ if (node === undefined) {
66
82
  const cp = getCodePoint(input, index);
67
- returnValue += `&#x${cp.toString(16)};`;
68
- // Increase by 1 if we have a surrogate pair
69
- lastIndex = regExp.lastIndex += Number(cp !== char);
83
+ out += `&#x${cp.toString(16)};`;
84
+ if (cp !== char) index++;
85
+ last = index + 1;
70
86
  } else {
71
- returnValue += next;
72
- lastIndex = index + 1;
87
+ out += node;
88
+ last = index + 1;
73
89
  }
74
90
  }
75
91
 
76
- return returnValue + input.substr(lastIndex);
92
+ if (out === undefined) return input;
93
+ if (last < length) out += input.substr(last);
94
+ return out;
77
95
  }