re2js 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -562,8 +562,7 @@ console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
562
562
 
563
563
  ### Translating Regular Expressions
564
564
 
565
- The `translateRegExp()` method preprocesses a given regular expression string to ensure compatibility with RE2JS.
566
- It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, and converting named capture groups
565
+ The `translateRegExp()` method preprocesses a given regular expression string or native RegExp object to ensure compatibility with RE2JS. It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, converting named capture groups, and mapping native execution flags.
567
566
 
568
567
  ```js
569
568
  import { RE2JS } from 're2js'
@@ -579,7 +578,11 @@ RE2JS.matches(unicodeRegexp, '😀') // true
579
578
  RE2JS.matches(unicodeRegexp, '😃') // false
580
579
 
581
580
  // also supports native RegExp objects
582
- RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
581
+ const translatedNative = RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
582
+
583
+ const re = RE2JS.compile(translatedNative)
584
+ re.test('FOO') // true
585
+
583
586
  RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
584
587
  ```
585
588
 
@@ -612,23 +615,23 @@ Because RE2JS's Lazy DFA, Prefilter, and OnePass engines operate efficiently wit
612
615
 
613
616
  Here is a benchmark running 30,000 items through both engines using their respective `.test()` fast-paths (averages of multiple runs):
614
617
 
615
- | Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
616
- |:--------------------------|:---------------------------|:----------------|:---------------|:-----------------------------|
617
- | **Simple Literal** | `/damage/` | **~5.82 ms** | ~14.08 ms | `re2js` is **~2.42x faster** |
618
- | **Greedy Wildcard** | `/enters.*battlefield/` | **~8.44 ms** | ~13.32 ms | `re2js` is **~1.58x faster** |
619
- | **Lazy Wildcard** | `/enters.*?battlefield/` | **~8.43 ms** | ~13.33 ms | `re2js` is **~1.58x faster** |
620
- | **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **~7.71 ms** | ~16.08 ms | `re2js` is **~2.09x faster** |
621
- | **Massive Alternation** | `/White\|Blue\|Black.../` | **~11.62 ms** | ~14.99 ms | `re2js` is **~1.29x faster** |
622
- | **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | **~12.20 ms** | ~13.77 ms | `re2js` is **~1.13x faster** |
623
- | **ReDoS Attempt** | `/(a+)+!/` | **~5.68 ms** | ~16.25 ms | `re2js` is **~2.86x faster** |
624
- | **Case Insensitive** | `/(?i)swamp/` | ~18.71 ms | **~16.22 ms** | `re2-node` is ~1.15x faster |
625
- | **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | ~57.24 ms | **~15.66 ms** | `re2-node` is ~3.66x faster |
618
+ | Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
619
+ |:--------------------------|:---------------------------|:----------------|:---------------|:----------------------------|
620
+ | **ReDoS Attempt** | `/(a+)+!/` | **7.28 ms** | 12.74 ms | `re2js` is **1.75x** faster |
621
+ | **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **8.78 ms** | 12.56 ms | `re2js` is **1.43x** faster |
622
+ | **Simple Literal** | `/damage/` | **7.04 ms** | 9.59 ms | `re2js` is **1.36x** faster |
623
+ | **Lazy Wildcard** | `/enters.*?battlefield/` | **9.36 ms** | 10.27 ms | `re2js` is **1.10x** faster |
624
+ | **Greedy Wildcard** | `/enters.*battlefield/` | **9.47 ms** | 10.03 ms | `re2js` is **1.06x** faster |
625
+ | **Massive Alternation** | `/White\|Blue\|Black.../` | 11.69 ms | **11.28 ms** | `re2-node` is 1.04x faster |
626
+ | **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | 12.68 ms | **10.64 ms** | `re2-node` is 1.19x faster |
627
+ | **Case Insensitive** | `/(?i)swamp/` | 18.58 ms | **12.64 ms** | `re2-node` is 1.47x faster |
628
+ | **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | 30.45 ms | **12.22 ms** | `re2-node` is 2.49x faster |
626
629
 
627
630
  **Takeaways:**
628
- * **The Literal & Prefilter Advantage (JS wins):** For simple text searches like literals and wildcards, RE2JS's Literal Fast-Path and Prefilter Engine leverage highly optimized native JavaScript `indexOf` string scanning. By bypassing the regex state machines completely, pure JavaScript now outperforms native C++ bindings by **~1.5x to 2.4x**.
629
- * **State-Heavy Tasks (JS wins):** For complex state machines, massive alternations, and catastrophic backtracking (ReDoS) attempts, RE2JS operates entirely within V8's highly optimized JIT. Avoiding the JS-to-C++ N-API bridge overhead allows pure JavaScript to beat native bindings by **~1.1x to 2.8x**.
630
- * **Case Insensitivity (C++ wins):** Case-folded literal matching currently skips the prefilter and requires full DFA state-machine evaluation, giving C++ a slight ~1.15x edge due to raw memory scanning speeds.
631
- * **The Fallback Engines (C++ wins):** Pure DFA engines mathematically cannot track look-behind context like Word Boundaries (`\b`). When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA fallback is slower than the C++ NFA. **For maximum performance in RE2JS, avoid `\b` when doing bulk boolean `.test()` matching.**
631
+ * **Pure JS Strengths:** For complex state tracking (nested groups, wildcards) and literal string scanning, `re2js` actually beats the native C++ bindings. V8's TurboFan JIT compiler is able to heavily optimize the Pure JS DFA loop, bypassing the C++ boundary entirely.
632
+ * **C++ Strengths:** For character class evaluations (Case Insensitivity, Bounded Repetitions, Alternations), `re2-node` has a slight edge thanks to highly optimized, hardware-level memory tables.
633
+ * **The NFA Fallback:** Pure DFA engines mathematically cannot track look-behind context like Word Boundaries (`\b`). When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA is significantly slower than the C++ NFA.
634
+ * **Optimization Tip:** For maximum absolute performance in RE2JS, avoid `\b` or capture groups when doing bulk boolean `.test()` matching to ensure execution stays on the DFA fast-path.
632
635
 
633
636
  ### RE2JS vs JavaScript's native RegExp
634
637
 
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.3.0
5
+ * @version v2.3.2
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -629,6 +629,31 @@ class Unicode {
629
629
  }
630
630
  }
631
631
 
632
+ /**
633
+ * Size of the precomputed single-byte lookup table.
634
+ * Covers standard ASCII and Latin-1 characters for fast-path execution.
635
+ */
636
+ const FAST_PATH_TABLE_SIZE = 256;
637
+ /**
638
+ * Precomputed lookup table for Word Boundary (\b, \B) assertions.
639
+ * By precomputing the boolean results for standard ASCII word ranges
640
+ * ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
641
+ * branches from the NFA's hot execution loop. This prevents costly
642
+ * CPU branch mispredictions when scanning large strings.
643
+ */
644
+ const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
645
+ for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
646
+ WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
647
+ // 'a' - 'z'
648
+ 65 <= i && i <= 90 ||
649
+ // 'A' - 'Z'
650
+ 48 <= i && i <= 57 ||
651
+ // '0' - '9'
652
+ i === 95 // '_'
653
+ ? 1 : 0;
654
+ }
655
+ let cachedNativeEncoder = null;
656
+ let cachedNativeDecoder = null;
632
657
  /**
633
658
  * Various constants and helper utilities.
634
659
  */
@@ -727,12 +752,21 @@ class Utils {
727
752
  return out;
728
753
  }
729
754
 
730
- // Returns the array of runes in the specified Java UTF-16 string.
755
+ // Returns the array of runes in the specified JS UTF-16 string.
731
756
  static stringToRunes(str) {
732
- return Array.from(String(str)).map(s => s.codePointAt(0));
757
+ const string = String(str);
758
+ const runes = [];
759
+ let i = 0;
760
+ while (i < string.length) {
761
+ const cp = string.codePointAt(i);
762
+ runes.push(cp);
763
+ // Surrogate pairs (Emojis, etc.) are > 0xFFFF
764
+ i += cp > Unicode.MAX_BMP ? 2 : 1;
765
+ }
766
+ return runes;
733
767
  }
734
768
 
735
- // Returns the Java UTF-16 string containing the single rune |r|.
769
+ // Returns the JS UTF-16 string containing the single rune |r|.
736
770
  static runeToString(r) {
737
771
  return String.fromCodePoint(r);
738
772
  }
@@ -741,7 +775,7 @@ class Utils {
741
775
  // during the evaluation of the \b and \B zero-width assertions.
742
776
  // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
743
777
  static isWordRune(r) {
744
- return Codepoint.CODES.get('a') <= r && r <= Codepoint.CODES.get('z') || Codepoint.CODES.get('A') <= r && r <= Codepoint.CODES.get('Z') || Codepoint.CODES.get('0') <= r && r <= Codepoint.CODES.get('9') || r === Codepoint.CODES.get('_');
778
+ return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
745
779
  }
746
780
 
747
781
  // emptyOpContext returns the zero-width assertions satisfied at the position
@@ -754,21 +788,24 @@ class Utils {
754
788
  static emptyOpContext(r1, r2) {
755
789
  let op = 0;
756
790
  if (r1 < 0) {
757
- op |= this.EMPTY_BEGIN_TEXT | this.EMPTY_BEGIN_LINE;
791
+ op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
758
792
  }
759
- if (r1 === Codepoint.CODES.get('\n')) {
760
- op |= this.EMPTY_BEGIN_LINE;
793
+ // Hardcode 10 for '\n'
794
+ if (r1 === 10) {
795
+ op |= Utils.EMPTY_BEGIN_LINE;
761
796
  }
762
797
  if (r2 < 0) {
763
- op |= this.EMPTY_END_TEXT | this.EMPTY_END_LINE;
798
+ op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
764
799
  }
765
- if (r2 === Codepoint.CODES.get('\n')) {
766
- op |= this.EMPTY_END_LINE;
800
+
801
+ // Hardcode 10 for '\n'
802
+ if (r2 === 10) {
803
+ op |= Utils.EMPTY_END_LINE;
767
804
  }
768
- if (this.isWordRune(r1) !== this.isWordRune(r2)) {
769
- op |= this.EMPTY_WORD_BOUNDARY;
805
+ if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
806
+ op |= Utils.EMPTY_WORD_BOUNDARY;
770
807
  } else {
771
- op |= this.EMPTY_NO_WORD_BOUNDARY;
808
+ op |= Utils.EMPTY_NO_WORD_BOUNDARY;
772
809
  }
773
810
  return op;
774
811
  }
@@ -792,9 +829,23 @@ class Utils {
792
829
  static charCount(codePoint) {
793
830
  return codePoint > Unicode.MAX_BMP ? 2 : 1;
794
831
  }
832
+
833
+ /**
834
+ * High-speed conversion from TypedArrays to standard JS Arrays.
835
+ * Bypasses the expensive Symbol.iterator overhead of Array.from()
836
+ */
837
+ static toArray(typedArray) {
838
+ const len = typedArray.length;
839
+ const res = new Array(len);
840
+ for (let i = 0; i < len; i++) {
841
+ res[i] = typedArray[i];
842
+ }
843
+ return res;
844
+ }
795
845
  static stringToUtf8ByteArray(str) {
796
846
  if (globalThis.TextEncoder) {
797
- return Array.from(new TextEncoder().encode(str));
847
+ if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
848
+ return Utils.toArray(cachedNativeEncoder.encode(str));
798
849
  } else {
799
850
  // fallback, if no TextEncoder
800
851
  let out = [],
@@ -824,7 +875,9 @@ class Utils {
824
875
  }
825
876
  static utf8ByteArrayToString(bytes) {
826
877
  if (globalThis.TextDecoder) {
827
- return new TextDecoder('utf-8').decode(new Uint8Array(bytes));
878
+ if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
879
+ const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
880
+ return cachedNativeDecoder.decode(view);
828
881
  } else {
829
882
  // fallback, if no TextDecoder
830
883
  let out = [],
@@ -1121,15 +1174,34 @@ class MachineUTF8Input extends MachineInputBase {
1121
1174
  if (targetLength === 0) {
1122
1175
  return fromIndex <= this.end ? fromIndex : -1;
1123
1176
  }
1177
+ const firstByte = target[0];
1124
1178
  let limit = this.end - targetLength;
1125
- for (let i = fromIndex; i <= limit; i++) {
1126
- for (let j = 0; j < targetLength; j++) {
1179
+ // Feature detection: Native TypedArray indexOf (ES2015)
1180
+ const hasNativeIndexOf = typeof source.indexOf === 'function';
1181
+ let i = fromIndex;
1182
+ while (i <= limit) {
1183
+ // Fast-forward to the first matching byte using C++ bindings if available
1184
+ if (hasNativeIndexOf) {
1185
+ i = source.indexOf(firstByte, i);
1186
+ if (i === -1 || i > limit) return -1;
1187
+ } else {
1188
+ // Fallback: Manual loop
1189
+ while (i <= limit && source[i] !== firstByte) i++;
1190
+ if (i > limit) return -1;
1191
+ }
1192
+
1193
+ // First byte matches, verify the rest of the target sequence
1194
+ let match = true;
1195
+ for (let j = 1; j < targetLength; j++) {
1127
1196
  if (source[i + j] !== target[j]) {
1197
+ match = false;
1128
1198
  break;
1129
- } else if (j === targetLength - 1) {
1130
- return i;
1131
1199
  }
1132
1200
  }
1201
+ if (match) {
1202
+ return i;
1203
+ }
1204
+ i++;
1133
1205
  }
1134
1206
  return -1;
1135
1207
  }
@@ -1201,8 +1273,10 @@ class MachineUTF16Input extends MachineInputBase {
1201
1273
  // Returns a bitmask of EMPTY_* flags.
1202
1274
  context(pos) {
1203
1275
  pos += this.start;
1204
- const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1205
- const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1276
+
1277
+ // We only care about ASCII word runes and newlines for context boundaries
1278
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
1279
+ const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
1206
1280
  return Utils.emptyOpContext(r1, r2);
1207
1281
  }
1208
1282
  prefixLength(re2) {
@@ -1332,10 +1406,6 @@ class RE2JSInternalException extends RE2JSException {
1332
1406
  * @author rsc@google.com (Russ Cox)
1333
1407
  */
1334
1408
 
1335
- /**
1336
- * @typedef {import('./index').RE2JS} RE2JS_Pattern
1337
- */
1338
-
1339
1409
  class Matcher {
1340
1410
  /**
1341
1411
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1373,8 +1443,8 @@ class Matcher {
1373
1443
  }
1374
1444
  /**
1375
1445
  *
1376
- * @param {RE2JS_Pattern} pattern
1377
- * @param {Uint8Array|number[]|string} input
1446
+ * @param {RE2JS} pattern
1447
+ * @param {string|number[]|Uint8Array} input
1378
1448
  */
1379
1449
  constructor(pattern, input) {
1380
1450
  if (pattern === null) {
@@ -1382,7 +1452,7 @@ class Matcher {
1382
1452
  }
1383
1453
  /**
1384
1454
  * The pattern being matched.
1385
- * @type {RE2JS_Pattern}
1455
+ * @type {RE2JS}
1386
1456
  */
1387
1457
  this.patternInput = pattern;
1388
1458
  const re2 = this.patternInput.re2();
@@ -1407,7 +1477,7 @@ class Matcher {
1407
1477
 
1408
1478
  /**
1409
1479
  * Returns the {@code RE2JS} associated with this {@code Matcher}.
1410
- * @returns {RE2JS_Pattern}
1480
+ * @returns {RE2JS}
1411
1481
  */
1412
1482
  pattern() {
1413
1483
  return this.patternInput;
@@ -1437,7 +1507,7 @@ class Matcher {
1437
1507
 
1438
1508
  /**
1439
1509
  * Resets the {@code Matcher} and changes the input.
1440
- * @param {import('./MatcherInput').MatcherInputBase} input
1510
+ * @param {MatcherInputBase} input
1441
1511
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1442
1512
  */
1443
1513
  resetMatcherInput(input) {
@@ -2213,7 +2283,7 @@ class Machine {
2213
2283
  return Utils.emptyInts();
2214
2284
  }
2215
2285
  // Use subarray() to create a zero-allocation view before converting
2216
- return Array.from(this.matchcap.subarray(0, this.ncap));
2286
+ return Utils.toArray(this.matchcap.subarray(0, this.ncap));
2217
2287
  }
2218
2288
 
2219
2289
  // alloc() allocates a new thread with the given instruction.
@@ -3175,7 +3245,7 @@ class Backtracker {
3175
3245
  }
3176
3246
 
3177
3247
  // Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
3178
- const result = ncap === 0 ? [] : Array.from(b.matchcap.subarray(0, ncap));
3248
+ const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
3179
3249
  bitStatePool.push(b);
3180
3250
  return result;
3181
3251
  }
@@ -3551,7 +3621,7 @@ class OnePass {
3551
3621
  matchcap[0] = 0;
3552
3622
  matchcap[1] = pos;
3553
3623
  }
3554
- return ncap === 0 ? [] : Array.from(matchcap);
3624
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3555
3625
  }
3556
3626
  case Inst.RUNE:
3557
3627
  if (!inst.matchRune(rune)) return null;
@@ -3600,7 +3670,7 @@ class OnePass {
3600
3670
  }
3601
3671
  }
3602
3672
  if (!matched) return null;
3603
- return ncap === 0 ? [] : Array.from(matchcap);
3673
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3604
3674
  }
3605
3675
  }
3606
3676
 
@@ -5354,7 +5424,7 @@ class StringIterator {
5354
5424
  return r;
5355
5425
  }
5356
5426
  lookingAt(s) {
5357
- return this.rest().startsWith(s);
5427
+ return this.str.startsWith(s, this.position);
5358
5428
  }
5359
5429
 
5360
5430
  // Returns the rest of the pattern as a JS UTF-16 string.