re2js 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -15
- package/build/index.cjs.cjs +101 -27
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +101 -27
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +101 -27
- package/build/index.umd.js.map +1 -1
- package/package.json +5 -4
package/README.md
CHANGED
|
@@ -615,23 +615,23 @@ Because RE2JS's Lazy DFA, Prefilter, and OnePass engines operate efficiently wit
|
|
|
615
615
|
|
|
616
616
|
Here is a benchmark running 30,000 items through both engines using their respective `.test()` fast-paths (averages of multiple runs):
|
|
617
617
|
|
|
618
|
-
| Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result
|
|
619
|
-
|
|
620
|
-
| **
|
|
621
|
-
| **
|
|
622
|
-
| **
|
|
623
|
-
| **
|
|
624
|
-
| **
|
|
625
|
-
| **
|
|
626
|
-
| **
|
|
627
|
-
| **Case Insensitive** | `/(?i)swamp/` |
|
|
628
|
-
| **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` |
|
|
618
|
+
| Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
|
|
619
|
+
|:--------------------------|:---------------------------|:----------------|:---------------|:----------------------------|
|
|
620
|
+
| **ReDoS Attempt** | `/(a+)+!/` | **7.28 ms** | 12.74 ms | `re2js` is **1.75x** faster |
|
|
621
|
+
| **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **8.78 ms** | 12.56 ms | `re2js` is **1.43x** faster |
|
|
622
|
+
| **Simple Literal** | `/damage/` | **7.04 ms** | 9.59 ms | `re2js` is **1.36x** faster |
|
|
623
|
+
| **Lazy Wildcard** | `/enters.*?battlefield/` | **9.36 ms** | 10.27 ms | `re2js` is **1.10x** faster |
|
|
624
|
+
| **Greedy Wildcard** | `/enters.*battlefield/` | **9.47 ms** | 10.03 ms | `re2js` is **1.06x** faster |
|
|
625
|
+
| **Massive Alternation** | `/White\|Blue\|Black.../` | 11.69 ms | **11.28 ms** | `re2-node` is 1.04x faster |
|
|
626
|
+
| **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | 12.68 ms | **10.64 ms** | `re2-node` is 1.19x faster |
|
|
627
|
+
| **Case Insensitive** | `/(?i)swamp/` | 18.58 ms | **12.64 ms** | `re2-node` is 1.47x faster |
|
|
628
|
+
| **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | 30.45 ms | **12.22 ms** | `re2-node` is 2.49x faster |
|
|
629
629
|
|
|
630
630
|
**Takeaways:**
|
|
631
|
-
* **
|
|
632
|
-
* **
|
|
633
|
-
* **
|
|
634
|
-
* **
|
|
631
|
+
* **Pure JS Strengths:** For complex state tracking (nested groups, wildcards) and literal string scanning, `re2js` actually beats the native C++ bindings. V8's TurboFan JIT compiler can heavily optimize the pure JS DFA loop, and no calls have to cross the JS/C++ boundary.
|
|
632
|
+
* **C++ Strengths:** For character class evaluations (Case Insensitivity, Bounded Repetitions, Alternations), `re2-node` has a slight edge thanks to highly optimized, hardware-level memory tables.
|
|
633
|
+
* **The NFA Fallback:** RE2JS's DFA engine does not track the look-behind context that Word Boundaries (`\b`) require. When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA is significantly slower than the C++ NFA.
|
|
634
|
+
* **Optimization Tip:** For maximum absolute performance in RE2JS, avoid `\b` or capture groups when doing bulk boolean `.test()` matching to ensure execution stays on the DFA fast-path.
|
|
635
635
|
|
|
636
636
|
### RE2JS vs JavaScript's native RegExp
|
|
637
637
|
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.3.1
|
|
5
|
+
* @version v2.3.2
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -629,6 +629,31 @@ class Unicode {
|
|
|
629
629
|
}
|
|
630
630
|
}
|
|
631
631
|
|
|
632
|
+
/**
|
|
633
|
+
* Size of the precomputed single-byte lookup table.
|
|
634
|
+
* Covers standard ASCII and Latin-1 characters for fast-path execution.
|
|
635
|
+
*/
|
|
636
|
+
const FAST_PATH_TABLE_SIZE = 256;
|
|
637
|
+
/**
|
|
638
|
+
* Precomputed lookup table for Word Boundary (\b, \B) assertions.
|
|
639
|
+
* By precomputing the boolean results for standard ASCII word ranges
|
|
640
|
+
* ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
|
|
641
|
+
* branches from the NFA's hot execution loop. This prevents costly
|
|
642
|
+
* CPU branch mispredictions when scanning large strings.
|
|
643
|
+
*/
|
|
644
|
+
const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
|
|
645
|
+
for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
|
|
646
|
+
WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
|
|
647
|
+
// 'a' - 'z'
|
|
648
|
+
65 <= i && i <= 90 ||
|
|
649
|
+
// 'A' - 'Z'
|
|
650
|
+
48 <= i && i <= 57 ||
|
|
651
|
+
// '0' - '9'
|
|
652
|
+
i === 95 // '_'
|
|
653
|
+
? 1 : 0;
|
|
654
|
+
}
|
|
655
|
+
let cachedNativeEncoder = null;
|
|
656
|
+
let cachedNativeDecoder = null;
|
|
632
657
|
/**
|
|
633
658
|
* Various constants and helper utilities.
|
|
634
659
|
*/
|
|
@@ -727,12 +752,21 @@ class Utils {
|
|
|
727
752
|
return out;
|
|
728
753
|
}
|
|
729
754
|
|
|
730
|
-
// Returns the array of runes in the specified
|
|
755
|
+
// Returns the array of runes in the specified JS UTF-16 string.
|
|
731
756
|
static stringToRunes(str) {
|
|
732
|
-
|
|
757
|
+
const string = String(str);
|
|
758
|
+
const runes = [];
|
|
759
|
+
let i = 0;
|
|
760
|
+
while (i < string.length) {
|
|
761
|
+
const cp = string.codePointAt(i);
|
|
762
|
+
runes.push(cp);
|
|
763
|
+
// Surrogate pairs (Emojis, etc.) are > 0xFFFF
|
|
764
|
+
i += cp > Unicode.MAX_BMP ? 2 : 1;
|
|
765
|
+
}
|
|
766
|
+
return runes;
|
|
733
767
|
}
|
|
734
768
|
|
|
735
|
-
// Returns the
|
|
769
|
+
// Returns the JS UTF-16 string containing the single rune |r|.
|
|
736
770
|
static runeToString(r) {
|
|
737
771
|
return String.fromCodePoint(r);
|
|
738
772
|
}
|
|
@@ -741,7 +775,7 @@ class Utils {
|
|
|
741
775
|
// during the evaluation of the \b and \B zero-width assertions.
|
|
742
776
|
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
|
|
743
777
|
static isWordRune(r) {
|
|
744
|
-
return
|
|
778
|
+
return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
|
|
745
779
|
}
|
|
746
780
|
|
|
747
781
|
// emptyOpContext returns the zero-width assertions satisfied at the position
|
|
@@ -754,21 +788,24 @@ class Utils {
|
|
|
754
788
|
static emptyOpContext(r1, r2) {
|
|
755
789
|
let op = 0;
|
|
756
790
|
if (r1 < 0) {
|
|
757
|
-
op |=
|
|
791
|
+
op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
|
|
758
792
|
}
|
|
759
|
-
|
|
760
|
-
|
|
793
|
+
// Hardcode 10 for '\n'
|
|
794
|
+
if (r1 === 10) {
|
|
795
|
+
op |= Utils.EMPTY_BEGIN_LINE;
|
|
761
796
|
}
|
|
762
797
|
if (r2 < 0) {
|
|
763
|
-
op |=
|
|
798
|
+
op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
|
|
764
799
|
}
|
|
765
|
-
|
|
766
|
-
|
|
800
|
+
|
|
801
|
+
// Hardcode 10 for '\n'
|
|
802
|
+
if (r2 === 10) {
|
|
803
|
+
op |= Utils.EMPTY_END_LINE;
|
|
767
804
|
}
|
|
768
|
-
if (
|
|
769
|
-
op |=
|
|
805
|
+
if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
|
|
806
|
+
op |= Utils.EMPTY_WORD_BOUNDARY;
|
|
770
807
|
} else {
|
|
771
|
-
op |=
|
|
808
|
+
op |= Utils.EMPTY_NO_WORD_BOUNDARY;
|
|
772
809
|
}
|
|
773
810
|
return op;
|
|
774
811
|
}
|
|
@@ -792,9 +829,23 @@ class Utils {
|
|
|
792
829
|
static charCount(codePoint) {
|
|
793
830
|
return codePoint > Unicode.MAX_BMP ? 2 : 1;
|
|
794
831
|
}
|
|
832
|
+
|
|
833
|
+
/**
|
|
834
|
+
* High-speed conversion from TypedArrays to standard JS Arrays.
|
|
835
|
+
* Bypasses the expensive Symbol.iterator overhead of Array.from()
|
|
836
|
+
*/
|
|
837
|
+
static toArray(typedArray) {
|
|
838
|
+
const len = typedArray.length;
|
|
839
|
+
const res = new Array(len);
|
|
840
|
+
for (let i = 0; i < len; i++) {
|
|
841
|
+
res[i] = typedArray[i];
|
|
842
|
+
}
|
|
843
|
+
return res;
|
|
844
|
+
}
|
|
795
845
|
static stringToUtf8ByteArray(str) {
|
|
796
846
|
if (globalThis.TextEncoder) {
|
|
797
|
-
|
|
847
|
+
if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
|
|
848
|
+
return Utils.toArray(cachedNativeEncoder.encode(str));
|
|
798
849
|
} else {
|
|
799
850
|
// fallback, if no TextEncoder
|
|
800
851
|
let out = [],
|
|
@@ -824,7 +875,9 @@ class Utils {
|
|
|
824
875
|
}
|
|
825
876
|
static utf8ByteArrayToString(bytes) {
|
|
826
877
|
if (globalThis.TextDecoder) {
|
|
827
|
-
|
|
878
|
+
if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
|
|
879
|
+
const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
|
|
880
|
+
return cachedNativeDecoder.decode(view);
|
|
828
881
|
} else {
|
|
829
882
|
// fallback, if no TextDecoder
|
|
830
883
|
let out = [],
|
|
@@ -1121,15 +1174,34 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1121
1174
|
if (targetLength === 0) {
|
|
1122
1175
|
return fromIndex <= this.end ? fromIndex : -1;
|
|
1123
1176
|
}
|
|
1177
|
+
const firstByte = target[0];
|
|
1124
1178
|
let limit = this.end - targetLength;
|
|
1125
|
-
|
|
1126
|
-
|
|
1179
|
+
// Feature detection: Native TypedArray indexOf (ES2015)
|
|
1180
|
+
const hasNativeIndexOf = typeof source.indexOf === 'function';
|
|
1181
|
+
let i = fromIndex;
|
|
1182
|
+
while (i <= limit) {
|
|
1183
|
+
// Fast-forward to the first matching byte using C++ bindings if available
|
|
1184
|
+
if (hasNativeIndexOf) {
|
|
1185
|
+
i = source.indexOf(firstByte, i);
|
|
1186
|
+
if (i === -1 || i > limit) return -1;
|
|
1187
|
+
} else {
|
|
1188
|
+
// Fallback: Manual loop
|
|
1189
|
+
while (i <= limit && source[i] !== firstByte) i++;
|
|
1190
|
+
if (i > limit) return -1;
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1193
|
+
// First byte matches, verify the rest of the target sequence
|
|
1194
|
+
let match = true;
|
|
1195
|
+
for (let j = 1; j < targetLength; j++) {
|
|
1127
1196
|
if (source[i + j] !== target[j]) {
|
|
1197
|
+
match = false;
|
|
1128
1198
|
break;
|
|
1129
|
-
} else if (j === targetLength - 1) {
|
|
1130
|
-
return i;
|
|
1131
1199
|
}
|
|
1132
1200
|
}
|
|
1201
|
+
if (match) {
|
|
1202
|
+
return i;
|
|
1203
|
+
}
|
|
1204
|
+
i++;
|
|
1133
1205
|
}
|
|
1134
1206
|
return -1;
|
|
1135
1207
|
}
|
|
@@ -1201,8 +1273,10 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1201
1273
|
// Returns a bitmask of EMPTY_* flags.
|
|
1202
1274
|
context(pos) {
|
|
1203
1275
|
pos += this.start;
|
|
1204
|
-
|
|
1205
|
-
|
|
1276
|
+
|
|
1277
|
+
// We only care about ASCII word runes and newlines for context boundaries
|
|
1278
|
+
const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
|
|
1279
|
+
const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
|
|
1206
1280
|
return Utils.emptyOpContext(r1, r2);
|
|
1207
1281
|
}
|
|
1208
1282
|
prefixLength(re2) {
|
|
@@ -2209,7 +2283,7 @@ class Machine {
|
|
|
2209
2283
|
return Utils.emptyInts();
|
|
2210
2284
|
}
|
|
2211
2285
|
// Use subarray() to create a zero-allocation view before converting
|
|
2212
|
-
return
|
|
2286
|
+
return Utils.toArray(this.matchcap.subarray(0, this.ncap));
|
|
2213
2287
|
}
|
|
2214
2288
|
|
|
2215
2289
|
// alloc() allocates a new thread with the given instruction.
|
|
@@ -3171,7 +3245,7 @@ class Backtracker {
|
|
|
3171
3245
|
}
|
|
3172
3246
|
|
|
3173
3247
|
// Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
|
|
3174
|
-
const result = ncap === 0 ? [] :
|
|
3248
|
+
const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
|
|
3175
3249
|
bitStatePool.push(b);
|
|
3176
3250
|
return result;
|
|
3177
3251
|
}
|
|
@@ -3547,7 +3621,7 @@ class OnePass {
|
|
|
3547
3621
|
matchcap[0] = 0;
|
|
3548
3622
|
matchcap[1] = pos;
|
|
3549
3623
|
}
|
|
3550
|
-
return ncap === 0 ? [] :
|
|
3624
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3551
3625
|
}
|
|
3552
3626
|
case Inst.RUNE:
|
|
3553
3627
|
if (!inst.matchRune(rune)) return null;
|
|
@@ -3596,7 +3670,7 @@ class OnePass {
|
|
|
3596
3670
|
}
|
|
3597
3671
|
}
|
|
3598
3672
|
if (!matched) return null;
|
|
3599
|
-
return ncap === 0 ? [] :
|
|
3673
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3600
3674
|
}
|
|
3601
3675
|
}
|
|
3602
3676
|
|
|
@@ -5350,7 +5424,7 @@ class StringIterator {
|
|
|
5350
5424
|
return r;
|
|
5351
5425
|
}
|
|
5352
5426
|
lookingAt(s) {
|
|
5353
|
-
return this.
|
|
5427
|
+
return this.str.startsWith(s, this.position);
|
|
5354
5428
|
}
|
|
5355
5429
|
|
|
5356
5430
|
// Returns the rest of the pattern as a JS UTF-16 string.
|