re2js 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.3.1
5
+ * @version v2.3.2
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -633,6 +633,31 @@
633
633
  }
634
634
  }
635
635
 
636
+ /**
637
+ * Size of the precomputed single-byte lookup table.
638
+ * Covers standard ASCII and Latin-1 characters for fast-path execution.
639
+ */
640
+ const FAST_PATH_TABLE_SIZE = 256;
641
+ /**
642
+ * Precomputed lookup table for Word Boundary (\b, \B) assertions.
643
+ * By precomputing the boolean results for standard ASCII word ranges
644
+ * ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
645
+ * branches from the NFA's hot execution loop. This prevents costly
646
+ * CPU branch mispredictions when scanning large strings.
647
+ */
648
+ const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
649
+ for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
650
+ WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
651
+ // 'a' - 'z'
652
+ 65 <= i && i <= 90 ||
653
+ // 'A' - 'Z'
654
+ 48 <= i && i <= 57 ||
655
+ // '0' - '9'
656
+ i === 95 // '_'
657
+ ? 1 : 0;
658
+ }
659
+ let cachedNativeEncoder = null;
660
+ let cachedNativeDecoder = null;
636
661
  /**
637
662
  * Various constants and helper utilities.
638
663
  */
@@ -731,12 +756,21 @@
731
756
  return out;
732
757
  }
733
758
 
734
- // Returns the array of runes in the specified Java UTF-16 string.
759
+ // Returns the array of runes in the specified JS UTF-16 string.
735
760
  static stringToRunes(str) {
736
- return Array.from(String(str)).map(s => s.codePointAt(0));
761
+ const string = String(str);
762
+ const runes = [];
763
+ let i = 0;
764
+ while (i < string.length) {
765
+ const cp = string.codePointAt(i);
766
+ runes.push(cp);
767
+ // Code points above 0xFFFF (emoji, etc.) occupy two UTF-16 code units (a surrogate pair)
768
+ i += cp > Unicode.MAX_BMP ? 2 : 1;
769
+ }
770
+ return runes;
737
771
  }
738
772
 
739
- // Returns the Java UTF-16 string containing the single rune |r|.
773
+ // Returns the JS UTF-16 string containing the single rune |r|.
740
774
  static runeToString(r) {
741
775
  return String.fromCodePoint(r);
742
776
  }
@@ -745,7 +779,7 @@
745
779
  // during the evaluation of the \b and \B zero-width assertions.
746
780
  // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
747
781
  static isWordRune(r) {
748
- return Codepoint.CODES.get('a') <= r && r <= Codepoint.CODES.get('z') || Codepoint.CODES.get('A') <= r && r <= Codepoint.CODES.get('Z') || Codepoint.CODES.get('0') <= r && r <= Codepoint.CODES.get('9') || r === Codepoint.CODES.get('_');
782
+ return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
749
783
  }
750
784
 
751
785
  // emptyOpContext returns the zero-width assertions satisfied at the position
@@ -758,21 +792,24 @@
758
792
  static emptyOpContext(r1, r2) {
759
793
  let op = 0;
760
794
  if (r1 < 0) {
761
- op |= this.EMPTY_BEGIN_TEXT | this.EMPTY_BEGIN_LINE;
795
+ op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
762
796
  }
763
- if (r1 === Codepoint.CODES.get('\n')) {
764
- op |= this.EMPTY_BEGIN_LINE;
797
+ // Hardcode 10 for '\n'
798
+ if (r1 === 10) {
799
+ op |= Utils.EMPTY_BEGIN_LINE;
765
800
  }
766
801
  if (r2 < 0) {
767
- op |= this.EMPTY_END_TEXT | this.EMPTY_END_LINE;
802
+ op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
768
803
  }
769
- if (r2 === Codepoint.CODES.get('\n')) {
770
- op |= this.EMPTY_END_LINE;
804
+
805
+ // Hardcode 10 for '\n'
806
+ if (r2 === 10) {
807
+ op |= Utils.EMPTY_END_LINE;
771
808
  }
772
- if (this.isWordRune(r1) !== this.isWordRune(r2)) {
773
- op |= this.EMPTY_WORD_BOUNDARY;
809
+ if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
810
+ op |= Utils.EMPTY_WORD_BOUNDARY;
774
811
  } else {
775
- op |= this.EMPTY_NO_WORD_BOUNDARY;
812
+ op |= Utils.EMPTY_NO_WORD_BOUNDARY;
776
813
  }
777
814
  return op;
778
815
  }
@@ -796,9 +833,23 @@
796
833
  static charCount(codePoint) {
797
834
  return codePoint > Unicode.MAX_BMP ? 2 : 1;
798
835
  }
836
+
837
+ /**
838
+ * High-speed conversion from TypedArrays to standard JS Arrays.
839
+ * Bypasses the expensive Symbol.iterator overhead of Array.from()
840
+ */
841
+ static toArray(typedArray) {
842
+ const len = typedArray.length;
843
+ const res = new Array(len);
844
+ for (let i = 0; i < len; i++) {
845
+ res[i] = typedArray[i];
846
+ }
847
+ return res;
848
+ }
799
849
  static stringToUtf8ByteArray(str) {
800
850
  if (globalThis.TextEncoder) {
801
- return Array.from(new TextEncoder().encode(str));
851
+ if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
852
+ return Utils.toArray(cachedNativeEncoder.encode(str));
802
853
  } else {
803
854
  // fallback, if no TextEncoder
804
855
  let out = [],
@@ -828,7 +879,9 @@
828
879
  }
829
880
  static utf8ByteArrayToString(bytes) {
830
881
  if (globalThis.TextDecoder) {
831
- return new TextDecoder('utf-8').decode(new Uint8Array(bytes));
882
+ if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
883
+ const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
884
+ return cachedNativeDecoder.decode(view);
832
885
  } else {
833
886
  // fallback, if no TextDecoder
834
887
  let out = [],
@@ -1125,15 +1178,34 @@
1125
1178
  if (targetLength === 0) {
1126
1179
  return fromIndex <= this.end ? fromIndex : -1;
1127
1180
  }
1181
+ const firstByte = target[0];
1128
1182
  let limit = this.end - targetLength;
1129
- for (let i = fromIndex; i <= limit; i++) {
1130
- for (let j = 0; j < targetLength; j++) {
1183
+ // Feature detection: Native TypedArray indexOf (ES2015)
1184
+ const hasNativeIndexOf = typeof source.indexOf === 'function';
1185
+ let i = fromIndex;
1186
+ while (i <= limit) {
1187
+ // Fast-forward to the first matching byte using the built-in indexOf when available
1188
+ if (hasNativeIndexOf) {
1189
+ i = source.indexOf(firstByte, i);
1190
+ if (i === -1 || i > limit) return -1;
1191
+ } else {
1192
+ // Fallback: Manual loop
1193
+ while (i <= limit && source[i] !== firstByte) i++;
1194
+ if (i > limit) return -1;
1195
+ }
1196
+
1197
+ // First byte matches, verify the rest of the target sequence
1198
+ let match = true;
1199
+ for (let j = 1; j < targetLength; j++) {
1131
1200
  if (source[i + j] !== target[j]) {
1201
+ match = false;
1132
1202
  break;
1133
- } else if (j === targetLength - 1) {
1134
- return i;
1135
1203
  }
1136
1204
  }
1205
+ if (match) {
1206
+ return i;
1207
+ }
1208
+ i++;
1137
1209
  }
1138
1210
  return -1;
1139
1211
  }
@@ -1205,8 +1277,10 @@
1205
1277
  // Returns a bitmask of EMPTY_* flags.
1206
1278
  context(pos) {
1207
1279
  pos += this.start;
1208
- const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1209
- const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1280
+
1281
+ // We only care about ASCII word runes and newlines for context boundaries
1282
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
1283
+ const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
1210
1284
  return Utils.emptyOpContext(r1, r2);
1211
1285
  }
1212
1286
  prefixLength(re2) {
@@ -2213,7 +2287,7 @@
2213
2287
  return Utils.emptyInts();
2214
2288
  }
2215
2289
  // Use subarray() to create a zero-allocation view before converting
2216
- return Array.from(this.matchcap.subarray(0, this.ncap));
2290
+ return Utils.toArray(this.matchcap.subarray(0, this.ncap));
2217
2291
  }
2218
2292
 
2219
2293
  // alloc() allocates a new thread with the given instruction.
@@ -3175,7 +3249,7 @@
3175
3249
  }
3176
3250
 
3177
3251
  // Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
3178
- const result = ncap === 0 ? [] : Array.from(b.matchcap.subarray(0, ncap));
3252
+ const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
3179
3253
  bitStatePool.push(b);
3180
3254
  return result;
3181
3255
  }
@@ -3551,7 +3625,7 @@
3551
3625
  matchcap[0] = 0;
3552
3626
  matchcap[1] = pos;
3553
3627
  }
3554
- return ncap === 0 ? [] : Array.from(matchcap);
3628
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3555
3629
  }
3556
3630
  case Inst.RUNE:
3557
3631
  if (!inst.matchRune(rune)) return null;
@@ -3600,7 +3674,7 @@
3600
3674
  }
3601
3675
  }
3602
3676
  if (!matched) return null;
3603
- return ncap === 0 ? [] : Array.from(matchcap);
3677
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3604
3678
  }
3605
3679
  }
3606
3680
 
@@ -5354,7 +5428,7 @@
5354
5428
  return r;
5355
5429
  }
5356
5430
  lookingAt(s) {
5357
- return this.rest().startsWith(s);
5431
+ return this.str.startsWith(s, this.position);
5358
5432
  }
5359
5433
 
5360
5434
  // Returns the rest of the pattern as a Java UTF-16 string.