re2js 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.4.0
5
+ * @version v2.0.1
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -365,6 +365,11 @@
365
365
  // Checked during test.
366
366
  static MIN_FOLD = 0x0041;
367
367
  static MAX_FOLD = 0x1e943;
368
+ static MIN_HIGH_SURROGATE = 0xd800;
369
+ static MAX_HIGH_SURROGATE = 0xdbff;
370
+ static MIN_LOW_SURROGATE = 0xdc00;
371
+ static MAX_LOW_SURROGATE = 0xdfff;
372
+ static MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
368
373
 
369
374
  // is32 uses binary search to test whether rune is in the specified
370
375
  // slice of 32-bit ranges.
@@ -671,9 +676,9 @@
671
676
  } else if (c < 2048) {
672
677
  out[p++] = c >> 6 | 192;
673
678
  out[p++] = c & 63 | 128;
674
- } else if ((c & 0xfc00) === 0xd800 && i + 1 < str.length && (str.charCodeAt(i + 1) & 0xfc00) === 0xdc00) {
679
+ } else if ((c & 0xfc00) === Unicode.MIN_HIGH_SURROGATE && i + 1 < str.length && (str.charCodeAt(i + 1) & 0xfc00) === Unicode.MIN_LOW_SURROGATE) {
675
680
  // Surrogate Pair
676
- c = 0x10000 + ((c & 0x03ff) << 10) + (str.charCodeAt(++i) & 0x03ff);
681
+ c = Unicode.MIN_SUPPLEMENTARY_CODE_POINT + ((c & 0x03ff) << 10) + (str.charCodeAt(++i) & 0x03ff);
677
682
  out[p++] = c >> 18 | 240;
678
683
  out[p++] = c >> 12 & 63 | 128;
679
684
  out[p++] = c >> 6 & 63 | 128;
@@ -707,9 +712,9 @@
707
712
  let c2 = bytes[pos++];
708
713
  let c3 = bytes[pos++];
709
714
  let c4 = bytes[pos++];
710
- let u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
711
- out[c++] = String.fromCharCode(0xd800 + (u >> 10));
712
- out[c++] = String.fromCharCode(0xdc00 + (u & 1023));
715
+ let u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - Unicode.MIN_SUPPLEMENTARY_CODE_POINT;
716
+ out[c++] = String.fromCharCode(Unicode.MIN_HIGH_SURROGATE + (u >> 10));
717
+ out[c++] = String.fromCharCode(Unicode.MIN_LOW_SURROGATE + (u & 1023));
713
718
  } else {
714
719
  let c2 = bytes[pos++];
715
720
  let c3 = bytes[pos++];
@@ -883,38 +888,34 @@
883
888
  // the lower 3 bits, and the rune (Unicode code point) in the high
884
889
  // bits. Never negative, except for EOF which is represented as -1
885
890
  // << 3 | 0.
886
- step(i) {
887
- i += this.start;
888
- if (i >= this.end) {
891
+ step(pos) {
892
+ pos += this.start;
893
+ if (pos >= this.end) {
889
894
  return MachineInputBase.EOF();
890
895
  }
891
- let x = this.bytes[i++] & 255;
892
- if ((x & 128) === 0) {
893
- return x << 3 | 1;
894
- } else if ((x & 224) === 192) {
895
- x = x & 31;
896
- if (i >= this.end) {
897
- return MachineInputBase.EOF();
898
- }
899
- x = x << 6 | this.bytes[i++] & 63;
900
- return x << 3 | 2;
901
- } else if ((x & 240) === 224) {
902
- x = x & 15;
903
- if (i + 1 >= this.end) {
904
- return MachineInputBase.EOF();
905
- }
906
- x = x << 6 | this.bytes[i++] & 63;
907
- x = x << 6 | this.bytes[i++] & 63;
908
- return x << 3 | 3;
896
+
897
+ // Read UTF-8 bytes to extract the Rune and its width
898
+ const c = this.bytes[pos] & 0xff;
899
+ if (c < 0x80) {
900
+ return c << 3 | 1;
901
+ } else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
902
+ const c1 = this.bytes[pos + 1] & 0xff;
903
+ const rune = (c & 0x1f) << 6 | c1 & 0x3f;
904
+ return rune << 3 | 2;
905
+ } else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
906
+ const c1 = this.bytes[pos + 1] & 0xff;
907
+ const c2 = this.bytes[pos + 2] & 0xff;
908
+ const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
909
+ return rune << 3 | 3;
910
+ } else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
911
+ const c1 = this.bytes[pos + 1] & 0xff;
912
+ const c2 = this.bytes[pos + 2] & 0xff;
913
+ const c3 = this.bytes[pos + 3] & 0xff;
914
+ const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
915
+ return rune << 3 | 4;
909
916
  } else {
910
- x = x & 7;
911
- if (i + 2 >= this.end) {
912
- return MachineInputBase.EOF();
913
- }
914
- x = x << 6 | this.bytes[i++] & 63;
915
- x = x << 6 | this.bytes[i++] & 63;
916
- x = x << 6 | this.bytes[i++] & 63;
917
- return x << 3 | 4;
917
+ // Invalid sequence fallback
918
+ return c << 3 | 1;
918
919
  }
919
920
  }
920
921
 
@@ -989,12 +990,25 @@
989
990
  // << 3 | 0.
990
991
  step(pos) {
991
992
  pos += this.start;
992
- if (pos < this.end) {
993
- const rune = this.charSequence.codePointAt(pos);
994
- return rune << 3 | Utils.charCount(rune);
995
- } else {
993
+ if (pos >= this.end) {
996
994
  return MachineInputBase.EOF();
997
995
  }
996
+ const c1 = this.charSequence.charCodeAt(pos);
997
+
998
+ // Fast path: standard BMP character (not a high surrogate)
999
+ if (c1 < Unicode.MIN_HIGH_SURROGATE || c1 > Unicode.MAX_HIGH_SURROGATE || pos + 1 >= this.end) {
1000
+ return c1 << 3 | 1;
1001
+ }
1002
+
1003
+ // Slow path: Calculate surrogate pair manually
1004
+ const c2 = this.charSequence.charCodeAt(pos + 1);
1005
+ if (c2 >= Unicode.MIN_LOW_SURROGATE && c2 <= Unicode.MAX_LOW_SURROGATE) {
1006
+ const rune = (c1 - Unicode.MIN_HIGH_SURROGATE) * 0x400 + (c2 - Unicode.MIN_LOW_SURROGATE) + Unicode.MIN_SUPPLEMENTARY_CODE_POINT;
1007
+ return rune << 3 | 2;
1008
+ }
1009
+
1010
+ // Invalid surrogate pair fallback
1011
+ return c1 << 3 | 1;
998
1012
  }
999
1013
 
1000
1014
  // Returns the index relative to |pos| at which |re2.prefix| is found
@@ -1742,7 +1756,7 @@
1742
1756
  let lo = 0;
1743
1757
  let hi = this.runes.length / 2 | 0;
1744
1758
  while (lo < hi) {
1745
- const m = lo + ((hi - lo) / 2 | 0);
1759
+ const m = lo + hi >> 1; // native cpu instruction for "lo + (((hi - lo) / 2) | 0)"
1746
1760
  const c = this.runes[2 * m];
1747
1761
  if (c <= r) {
1748
1762
  if (r <= this.runes[2 * m + 1]) {
@@ -1803,10 +1817,10 @@
1803
1817
  // A queue is a 'sparse array' holding pending threads of execution. See:
1804
1818
  // research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
1805
1819
  class Queue {
1806
- constructor() {
1807
- this.sparse = []; // may contain stale but in-bounds values.
1808
- this.densePcs = []; // may contain stale pc in slots >= size
1809
- this.denseThreads = []; // may contain stale Thread in slots >= size
1820
+ constructor(numInst) {
1821
+ this.sparse = new Int32Array(numInst); // may contain stale but in-bounds values.
1822
+ this.densePcs = new Int32Array(numInst); // may contain stale pc in slots >= size
1823
+ this.denseThreads = new Array(numInst); // may contain stale Thread in slots >= size
1810
1824
  this.size = 0;
1811
1825
  }
1812
1826
  contains(pc) {
@@ -2307,7 +2321,7 @@
2307
2321
  if (width === 0) {
2308
2322
  break;
2309
2323
  }
2310
- currentState = this.step(currentState, rune, anchor);
2324
+ currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_ASCII && currentState.nextAscii[rune] || this.step(currentState, rune, anchor);
2311
2325
 
2312
2326
  // If we hit an unrecoverable DFA error or bailout, signal fallback
2313
2327
  if (currentState === null) return null;