re2js 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -615,23 +615,23 @@ Because RE2JS's Lazy DFA, Prefilter, and OnePass engines operate efficiently wit
615
615
 
616
616
  Here is a benchmark running 30,000 items through both engines using their respective `.test()` fast-paths (averages of multiple runs):
617
617
 
618
- | Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
619
- |:--------------------------|:---------------------------|:----------------|:---------------|:-----------------------------|
620
- | **Simple Literal** | `/damage/` | **~5.82 ms** | ~14.08 ms | `re2js` is **~2.42x faster** |
621
- | **Greedy Wildcard** | `/enters.*battlefield/` | **~8.44 ms** | ~13.32 ms | `re2js` is **~1.58x faster** |
622
- | **Lazy Wildcard** | `/enters.*?battlefield/` | **~8.43 ms** | ~13.33 ms | `re2js` is **~1.58x faster** |
623
- | **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **~7.71 ms** | ~16.08 ms | `re2js` is **~2.09x faster** |
624
- | **Massive Alternation** | `/White\|Blue\|Black.../` | **~11.62 ms** | ~14.99 ms | `re2js` is **~1.29x faster** |
625
- | **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | **~12.20 ms** | ~13.77 ms | `re2js` is **~1.13x faster** |
626
- | **ReDoS Attempt** | `/(a+)+!/` | **~5.68 ms** | ~16.25 ms | `re2js` is **~2.86x faster** |
627
- | **Case Insensitive** | `/(?i)swamp/` | ~18.71 ms | **~16.22 ms** | `re2-node` is ~1.15x faster |
628
- | **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | ~57.24 ms | **~15.66 ms** | `re2-node` is ~3.66x faster |
618
+ | Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
619
+ |:--------------------------|:---------------------------|:----------------|:---------------|:----------------------------|
620
+ | **ReDoS Attempt** | `/(a+)+!/` | **7.28 ms** | 12.74 ms | `re2js` is **1.75x** faster |
621
+ | **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **8.78 ms** | 12.56 ms | `re2js` is **1.43x** faster |
622
+ | **Simple Literal** | `/damage/` | **7.04 ms** | 9.59 ms | `re2js` is **1.36x** faster |
623
+ | **Lazy Wildcard** | `/enters.*?battlefield/` | **9.36 ms** | 10.27 ms | `re2js` is **1.10x** faster |
624
+ | **Greedy Wildcard** | `/enters.*battlefield/` | **9.47 ms** | 10.03 ms | `re2js` is **1.06x** faster |
625
+ | **Massive Alternation** | `/White\|Blue\|Black.../` | 11.69 ms | **11.28 ms** | `re2-node` is 1.04x faster |
626
+ | **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | 12.68 ms | **10.64 ms** | `re2-node` is 1.19x faster |
627
+ | **Case Insensitive** | `/(?i)swamp/` | 18.58 ms | **12.64 ms** | `re2-node` is 1.47x faster |
628
+ | **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | 30.45 ms | **12.22 ms** | `re2-node` is 2.49x faster |
629
629
 
630
630
  **Takeaways:**
631
- * **The Literal & Prefilter Advantage (JS wins):** For simple text searches like literals and wildcards, RE2JS's Literal Fast-Path and Prefilter Engine leverage highly optimized native JavaScript `indexOf` string scanning. By bypassing the regex state machines completely, pure JavaScript now outperforms native C++ bindings by **~1.5x to 2.4x**.
632
- * **State-Heavy Tasks (JS wins):** For complex state machines, massive alternations, and catastrophic backtracking (ReDoS) attempts, RE2JS operates entirely within V8's highly optimized JIT. Avoiding the JS-to-C++ N-API bridge overhead allows pure JavaScript to beat native bindings by **~1.1x to 2.8x**.
633
- * **Case Insensitivity (C++ wins):** Case-folded literal matching currently skips the prefilter and requires full DFA state-machine evaluation, giving C++ a slight ~1.15x edge due to raw memory scanning speeds.
634
- * **The Fallback Engines (C++ wins):** Pure DFA engines mathematically cannot track look-behind context like Word Boundaries (`\b`). When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA fallback is slower than the C++ NFA. **For maximum performance in RE2JS, avoid `\b` when doing bulk boolean `.test()` matching.**
631
+ * **Pure JS Strengths:** For complex state tracking (nested groups, wildcards) and literal string scanning, `re2js` actually beats the native C++ bindings. V8's TurboFan JIT compiler is able to heavily optimize the pure JS DFA loop, and staying in JavaScript avoids the overhead of crossing the JS-to-C++ boundary entirely.
632
+ * **C++ Strengths:** For character class evaluations (Case Insensitivity, Bounded Repetitions, Alternations), `re2-node` has a slight edge thanks to highly optimized, hardware-level memory tables.
633
+ * **The NFA Fallback:** Pure DFA engines mathematically cannot track look-behind context like Word Boundaries (`\b`). When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA is significantly slower than the C++ NFA (roughly 2.5x in the table above).
634
+ * **Optimization Tip:** For maximum absolute performance in RE2JS, avoid `\b` or capture groups when doing bulk boolean `.test()` matching to ensure execution stays on the DFA fast-path.
635
635
 
636
636
  ### RE2JS vs JavaScript's native RegExp
637
637
 
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.3.1
5
+ * @version v2.3.2
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -629,6 +629,31 @@ class Unicode {
629
629
  }
630
630
  }
631
631
 
632
+ /**
633
+ * Size of the precomputed single-byte lookup table.
634
+ * Covers standard ASCII and Latin-1 characters for fast-path execution.
635
+ */
636
+ const FAST_PATH_TABLE_SIZE = 256;
637
+ /**
638
+ * Precomputed lookup table for Word Boundary (\b, \B) assertions.
639
+ * By precomputing the boolean results for standard ASCII word ranges
640
+ * ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
641
+ * branches from the NFA's hot execution loop. This prevents costly
642
+ * CPU branch mispredictions when scanning large strings.
643
+ */
644
+ const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
645
+ for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
646
+ WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
647
+ // 'a' - 'z'
648
+ 65 <= i && i <= 90 ||
649
+ // 'A' - 'Z'
650
+ 48 <= i && i <= 57 ||
651
+ // '0' - '9'
652
+ i === 95 // '_'
653
+ ? 1 : 0;
654
+ }
655
+ let cachedNativeEncoder = null;
656
+ let cachedNativeDecoder = null;
632
657
  /**
633
658
  * Various constants and helper utilities.
634
659
  */
@@ -727,12 +752,21 @@ class Utils {
727
752
  return out;
728
753
  }
729
754
 
730
- // Returns the array of runes in the specified Java UTF-16 string.
755
+ // Returns the array of runes in the specified JS UTF-16 string.
731
756
  static stringToRunes(str) {
732
- return Array.from(String(str)).map(s => s.codePointAt(0));
757
+ const string = String(str);
758
+ const runes = [];
759
+ let i = 0;
760
+ while (i < string.length) {
761
+ const cp = string.codePointAt(i);
762
+ runes.push(cp);
763
+ // Code points above 0xFFFF (emoji, etc.) occupy a surrogate pair, i.e. two UTF-16 code units
764
+ i += cp > Unicode.MAX_BMP ? 2 : 1;
765
+ }
766
+ return runes;
733
767
  }
734
768
 
735
- // Returns the Java UTF-16 string containing the single rune |r|.
769
+ // Returns the JS UTF-16 string containing the single rune |r|.
736
770
  static runeToString(r) {
737
771
  return String.fromCodePoint(r);
738
772
  }
@@ -741,7 +775,7 @@ class Utils {
741
775
  // during the evaluation of the \b and \B zero-width assertions.
742
776
  // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
743
777
  static isWordRune(r) {
744
- return Codepoint.CODES.get('a') <= r && r <= Codepoint.CODES.get('z') || Codepoint.CODES.get('A') <= r && r <= Codepoint.CODES.get('Z') || Codepoint.CODES.get('0') <= r && r <= Codepoint.CODES.get('9') || r === Codepoint.CODES.get('_');
778
+ return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
745
779
  }
746
780
 
747
781
  // emptyOpContext returns the zero-width assertions satisfied at the position
@@ -754,21 +788,24 @@ class Utils {
754
788
  static emptyOpContext(r1, r2) {
755
789
  let op = 0;
756
790
  if (r1 < 0) {
757
- op |= this.EMPTY_BEGIN_TEXT | this.EMPTY_BEGIN_LINE;
791
+ op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
758
792
  }
759
- if (r1 === Codepoint.CODES.get('\n')) {
760
- op |= this.EMPTY_BEGIN_LINE;
793
+ // Hardcode 10 for '\n'
794
+ if (r1 === 10) {
795
+ op |= Utils.EMPTY_BEGIN_LINE;
761
796
  }
762
797
  if (r2 < 0) {
763
- op |= this.EMPTY_END_TEXT | this.EMPTY_END_LINE;
798
+ op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
764
799
  }
765
- if (r2 === Codepoint.CODES.get('\n')) {
766
- op |= this.EMPTY_END_LINE;
800
+
801
+ // Hardcode 10 for '\n'
802
+ if (r2 === 10) {
803
+ op |= Utils.EMPTY_END_LINE;
767
804
  }
768
- if (this.isWordRune(r1) !== this.isWordRune(r2)) {
769
- op |= this.EMPTY_WORD_BOUNDARY;
805
+ if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
806
+ op |= Utils.EMPTY_WORD_BOUNDARY;
770
807
  } else {
771
- op |= this.EMPTY_NO_WORD_BOUNDARY;
808
+ op |= Utils.EMPTY_NO_WORD_BOUNDARY;
772
809
  }
773
810
  return op;
774
811
  }
@@ -792,9 +829,23 @@ class Utils {
792
829
  static charCount(codePoint) {
793
830
  return codePoint > Unicode.MAX_BMP ? 2 : 1;
794
831
  }
832
+
833
+ /**
834
+ * High-speed conversion from TypedArrays to standard JS Arrays.
835
+ * Bypasses the expensive Symbol.iterator overhead of Array.from()
836
+ */
837
+ static toArray(typedArray) {
838
+ const len = typedArray.length;
839
+ const res = new Array(len);
840
+ for (let i = 0; i < len; i++) {
841
+ res[i] = typedArray[i];
842
+ }
843
+ return res;
844
+ }
795
845
  static stringToUtf8ByteArray(str) {
796
846
  if (globalThis.TextEncoder) {
797
- return Array.from(new TextEncoder().encode(str));
847
+ if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
848
+ return Utils.toArray(cachedNativeEncoder.encode(str));
798
849
  } else {
799
850
  // fallback, if no TextEncoder
800
851
  let out = [],
@@ -824,7 +875,9 @@ class Utils {
824
875
  }
825
876
  static utf8ByteArrayToString(bytes) {
826
877
  if (globalThis.TextDecoder) {
827
- return new TextDecoder('utf-8').decode(new Uint8Array(bytes));
878
+ if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
879
+ const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
880
+ return cachedNativeDecoder.decode(view);
828
881
  } else {
829
882
  // fallback, if no TextDecoder
830
883
  let out = [],
@@ -1121,15 +1174,34 @@ class MachineUTF8Input extends MachineInputBase {
1121
1174
  if (targetLength === 0) {
1122
1175
  return fromIndex <= this.end ? fromIndex : -1;
1123
1176
  }
1177
+ const firstByte = target[0];
1124
1178
  let limit = this.end - targetLength;
1125
- for (let i = fromIndex; i <= limit; i++) {
1126
- for (let j = 0; j < targetLength; j++) {
1179
+ // Feature detection: Native TypedArray indexOf (ES2015)
1180
+ const hasNativeIndexOf = typeof source.indexOf === 'function';
1181
+ let i = fromIndex;
1182
+ while (i <= limit) {
1183
+ // Fast-forward to the first matching byte using the runtime's built-in indexOf if available
1184
+ if (hasNativeIndexOf) {
1185
+ i = source.indexOf(firstByte, i);
1186
+ if (i === -1 || i > limit) return -1;
1187
+ } else {
1188
+ // Fallback: Manual loop
1189
+ while (i <= limit && source[i] !== firstByte) i++;
1190
+ if (i > limit) return -1;
1191
+ }
1192
+
1193
+ // First byte matches, verify the rest of the target sequence
1194
+ let match = true;
1195
+ for (let j = 1; j < targetLength; j++) {
1127
1196
  if (source[i + j] !== target[j]) {
1197
+ match = false;
1128
1198
  break;
1129
- } else if (j === targetLength - 1) {
1130
- return i;
1131
1199
  }
1132
1200
  }
1201
+ if (match) {
1202
+ return i;
1203
+ }
1204
+ i++;
1133
1205
  }
1134
1206
  return -1;
1135
1207
  }
@@ -1201,8 +1273,10 @@ class MachineUTF16Input extends MachineInputBase {
1201
1273
  // Returns a bitmask of EMPTY_* flags.
1202
1274
  context(pos) {
1203
1275
  pos += this.start;
1204
- const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1205
- const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1276
+
1277
+ // We only care about ASCII word runes and newlines for context boundaries
1278
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
1279
+ const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
1206
1280
  return Utils.emptyOpContext(r1, r2);
1207
1281
  }
1208
1282
  prefixLength(re2) {
@@ -2209,7 +2283,7 @@ class Machine {
2209
2283
  return Utils.emptyInts();
2210
2284
  }
2211
2285
  // Use subarray() to create a zero-allocation view before converting
2212
- return Array.from(this.matchcap.subarray(0, this.ncap));
2286
+ return Utils.toArray(this.matchcap.subarray(0, this.ncap));
2213
2287
  }
2214
2288
 
2215
2289
  // alloc() allocates a new thread with the given instruction.
@@ -3171,7 +3245,7 @@ class Backtracker {
3171
3245
  }
3172
3246
 
3173
3247
  // Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
3174
- const result = ncap === 0 ? [] : Array.from(b.matchcap.subarray(0, ncap));
3248
+ const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
3175
3249
  bitStatePool.push(b);
3176
3250
  return result;
3177
3251
  }
@@ -3547,7 +3621,7 @@ class OnePass {
3547
3621
  matchcap[0] = 0;
3548
3622
  matchcap[1] = pos;
3549
3623
  }
3550
- return ncap === 0 ? [] : Array.from(matchcap);
3624
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3551
3625
  }
3552
3626
  case Inst.RUNE:
3553
3627
  if (!inst.matchRune(rune)) return null;
@@ -3596,7 +3670,7 @@ class OnePass {
3596
3670
  }
3597
3671
  }
3598
3672
  if (!matched) return null;
3599
- return ncap === 0 ? [] : Array.from(matchcap);
3673
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3600
3674
  }
3601
3675
  }
3602
3676
 
@@ -5350,7 +5424,7 @@ class StringIterator {
5350
5424
  return r;
5351
5425
  }
5352
5426
  lookingAt(s) {
5353
- return this.rest().startsWith(s);
5427
+ return this.str.startsWith(s, this.position);
5354
5428
  }
5355
5429
 
5356
5430
  // Returns the rest of the pattern as a JS UTF-16 string.