re2js 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -18
- package/build/index.cjs.cjs +106 -36
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +27 -14
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +106 -36
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +106 -36
- package/build/index.umd.js.map +1 -1
- package/package.json +5 -4
package/README.md
CHANGED
|
@@ -562,8 +562,7 @@ console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
|
|
|
562
562
|
|
|
563
563
|
### Translating Regular Expressions
|
|
564
564
|
|
|
565
|
-
The `translateRegExp()` method preprocesses a given regular expression string to ensure compatibility with RE2JS.
|
|
566
|
-
It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, and converting named capture groups
|
|
565
|
+
The `translateRegExp()` method preprocesses a given regular expression string or native RegExp object to ensure compatibility with RE2JS. It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, converting named capture groups, and mapping native execution flags.
|
|
567
566
|
|
|
568
567
|
```js
|
|
569
568
|
import { RE2JS } from 're2js'
|
|
@@ -579,7 +578,11 @@ RE2JS.matches(unicodeRegexp, '😀') // true
|
|
|
579
578
|
RE2JS.matches(unicodeRegexp, '😃') // false
|
|
580
579
|
|
|
581
580
|
// also supports native RegExp objects
|
|
582
|
-
RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
|
|
581
|
+
const translatedNative = RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
|
|
582
|
+
|
|
583
|
+
const re = RE2JS.compile(translatedNative)
|
|
584
|
+
re.test('FOO') // true
|
|
585
|
+
|
|
583
586
|
RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
|
|
584
587
|
```
|
|
585
588
|
|
|
@@ -612,23 +615,23 @@ Because RE2JS's Lazy DFA, Prefilter, and OnePass engines operate efficiently wit
|
|
|
612
615
|
|
|
613
616
|
Here is a benchmark running 30,000 items through both engines using their respective `.test()` fast-paths (averages of multiple runs):
|
|
614
617
|
|
|
615
|
-
| Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result
|
|
616
|
-
|
|
617
|
-
| **
|
|
618
|
-
| **
|
|
619
|
-
| **
|
|
620
|
-
| **
|
|
621
|
-
| **
|
|
622
|
-
| **
|
|
623
|
-
| **
|
|
624
|
-
| **Case Insensitive** | `/(?i)swamp/` |
|
|
625
|
-
| **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` |
|
|
618
|
+
| Benchmark Scenario | Pattern Example | RE2JS (Pure JS) | RE2-Node (C++) | Result |
|
|
619
|
+
|:--------------------------|:---------------------------|:----------------|:---------------|:----------------------------|
|
|
620
|
+
| **ReDoS Attempt** | `/(a+)+!/` | **7.28 ms** | 12.74 ms | `re2js` is **1.75x** faster |
|
|
621
|
+
| **Deep State Machine** | `/([0-9]+(/[0-9]+)+)/` | **8.78 ms** | 12.56 ms | `re2js` is **1.43x** faster |
|
|
622
|
+
| **Simple Literal** | `/damage/` | **7.04 ms** | 9.59 ms | `re2js` is **1.36x** faster |
|
|
623
|
+
| **Lazy Wildcard** | `/enters.*?battlefield/` | **9.36 ms** | 10.27 ms | `re2js` is **1.10x** faster |
|
|
624
|
+
| **Greedy Wildcard** | `/enters.*battlefield/` | **9.47 ms** | 10.03 ms | `re2js` is **1.06x** faster |
|
|
625
|
+
| **Massive Alternation** | `/White\|Blue\|Black.../` | 11.69 ms | **11.28 ms** | `re2-node` is 1.04x faster |
|
|
626
|
+
| **Bounded Repetition** | `/[A-Z][a-z]{5,15}/` | 12.68 ms | **10.64 ms** | `re2-node` is 1.19x faster |
|
|
627
|
+
| **Case Insensitive** | `/(?i)swamp/` | 18.58 ms | **12.64 ms** | `re2-node` is 1.47x faster |
|
|
628
|
+
| **Word Boundaries (NFA)** | `/\b(Flying\|First...)\b/` | 30.45 ms | **12.22 ms** | `re2-node` is 2.49x faster |
|
|
626
629
|
|
|
627
630
|
**Takeaways:**
|
|
628
|
-
* **
|
|
629
|
-
* **
|
|
630
|
-
* **
|
|
631
|
-
* **
|
|
631
|
+
* **Pure JS Strengths:** For complex state tracking (nested groups, wildcards) and literal string scanning, `re2js` actually beats the native C++ bindings. V8's Turbofan JIT compiler is able to heavily optimize the Pure JS DFA loop, bypassing the C++ boundary entirely.
|
|
632
|
+
* **C++ Strengths:** For character class evaluations (Case Insensitivity, Bounded Repetitions, Alternations), `re2-node` has a slight edge thanks to highly optimized, hardware-level memory tables.
|
|
633
|
+
* **The NFA Fallback:** Pure DFA engines mathematically cannot track look-behind context like Word Boundaries (`\b`). When RE2JS encounters these, it safely bails out to its NFA engine. As shown in the benchmarks, the pure JS NFA is significantly slower than the C++ NFA.
|
|
634
|
+
* **Optimization Tip:** For maximum absolute performance in RE2JS, avoid `\b` or capture groups when doing bulk boolean `.test()` matching to ensure execution stays on the DFA fast-path.
|
|
632
635
|
|
|
633
636
|
### RE2JS vs JavaScript's native RegExp
|
|
634
637
|
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.3.
|
|
5
|
+
* @version v2.3.2
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -629,6 +629,31 @@ class Unicode {
|
|
|
629
629
|
}
|
|
630
630
|
}
|
|
631
631
|
|
|
632
|
+
/**
|
|
633
|
+
* Size of the precomputed single-byte lookup table.
|
|
634
|
+
* Covers standard ASCII and Latin-1 characters for fast-path execution.
|
|
635
|
+
*/
|
|
636
|
+
const FAST_PATH_TABLE_SIZE = 256;
|
|
637
|
+
/**
|
|
638
|
+
* Precomputed lookup table for Word Boundary (\b, \B) assertions.
|
|
639
|
+
* By precomputing the boolean results for standard ASCII word ranges
|
|
640
|
+
* ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
|
|
641
|
+
* branches from the NFA's hot execution loop. This prevents costly
|
|
642
|
+
* CPU branch mispredictions when scanning large strings.
|
|
643
|
+
*/
|
|
644
|
+
const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
|
|
645
|
+
for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
|
|
646
|
+
WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
|
|
647
|
+
// 'a' - 'z'
|
|
648
|
+
65 <= i && i <= 90 ||
|
|
649
|
+
// 'A' - 'Z'
|
|
650
|
+
48 <= i && i <= 57 ||
|
|
651
|
+
// '0' - '9'
|
|
652
|
+
i === 95 // '_'
|
|
653
|
+
? 1 : 0;
|
|
654
|
+
}
|
|
655
|
+
let cachedNativeEncoder = null;
|
|
656
|
+
let cachedNativeDecoder = null;
|
|
632
657
|
/**
|
|
633
658
|
* Various constants and helper utilities.
|
|
634
659
|
*/
|
|
@@ -727,12 +752,21 @@ class Utils {
|
|
|
727
752
|
return out;
|
|
728
753
|
}
|
|
729
754
|
|
|
730
|
-
// Returns the array of runes in the specified
|
|
755
|
+
// Returns the array of runes in the specified JS UTF-16 string.
|
|
731
756
|
static stringToRunes(str) {
|
|
732
|
-
|
|
757
|
+
const string = String(str);
|
|
758
|
+
const runes = [];
|
|
759
|
+
let i = 0;
|
|
760
|
+
while (i < string.length) {
|
|
761
|
+
const cp = string.codePointAt(i);
|
|
762
|
+
runes.push(cp);
|
|
763
|
+
// Surrogate pairs (Emojis, etc.) are > 0xFFFF
|
|
764
|
+
i += cp > Unicode.MAX_BMP ? 2 : 1;
|
|
765
|
+
}
|
|
766
|
+
return runes;
|
|
733
767
|
}
|
|
734
768
|
|
|
735
|
-
// Returns the
|
|
769
|
+
// Returns the JS UTF-16 string containing the single rune |r|.
|
|
736
770
|
static runeToString(r) {
|
|
737
771
|
return String.fromCodePoint(r);
|
|
738
772
|
}
|
|
@@ -741,7 +775,7 @@ class Utils {
|
|
|
741
775
|
// during the evaluation of the \b and \B zero-width assertions.
|
|
742
776
|
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
|
|
743
777
|
static isWordRune(r) {
|
|
744
|
-
return
|
|
778
|
+
return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
|
|
745
779
|
}
|
|
746
780
|
|
|
747
781
|
// emptyOpContext returns the zero-width assertions satisfied at the position
|
|
@@ -754,21 +788,24 @@ class Utils {
|
|
|
754
788
|
static emptyOpContext(r1, r2) {
|
|
755
789
|
let op = 0;
|
|
756
790
|
if (r1 < 0) {
|
|
757
|
-
op |=
|
|
791
|
+
op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
|
|
758
792
|
}
|
|
759
|
-
|
|
760
|
-
|
|
793
|
+
// Hardcode 10 for '\n'
|
|
794
|
+
if (r1 === 10) {
|
|
795
|
+
op |= Utils.EMPTY_BEGIN_LINE;
|
|
761
796
|
}
|
|
762
797
|
if (r2 < 0) {
|
|
763
|
-
op |=
|
|
798
|
+
op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
|
|
764
799
|
}
|
|
765
|
-
|
|
766
|
-
|
|
800
|
+
|
|
801
|
+
// Hardcode 10 for '\n'
|
|
802
|
+
if (r2 === 10) {
|
|
803
|
+
op |= Utils.EMPTY_END_LINE;
|
|
767
804
|
}
|
|
768
|
-
if (
|
|
769
|
-
op |=
|
|
805
|
+
if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
|
|
806
|
+
op |= Utils.EMPTY_WORD_BOUNDARY;
|
|
770
807
|
} else {
|
|
771
|
-
op |=
|
|
808
|
+
op |= Utils.EMPTY_NO_WORD_BOUNDARY;
|
|
772
809
|
}
|
|
773
810
|
return op;
|
|
774
811
|
}
|
|
@@ -792,9 +829,23 @@ class Utils {
|
|
|
792
829
|
static charCount(codePoint) {
|
|
793
830
|
return codePoint > Unicode.MAX_BMP ? 2 : 1;
|
|
794
831
|
}
|
|
832
|
+
|
|
833
|
+
/**
|
|
834
|
+
* High-speed conversion from TypedArrays to standard JS Arrays.
|
|
835
|
+
* Bypasses the expensive Symbol.iterator overhead of Array.from()
|
|
836
|
+
*/
|
|
837
|
+
static toArray(typedArray) {
|
|
838
|
+
const len = typedArray.length;
|
|
839
|
+
const res = new Array(len);
|
|
840
|
+
for (let i = 0; i < len; i++) {
|
|
841
|
+
res[i] = typedArray[i];
|
|
842
|
+
}
|
|
843
|
+
return res;
|
|
844
|
+
}
|
|
795
845
|
static stringToUtf8ByteArray(str) {
|
|
796
846
|
if (globalThis.TextEncoder) {
|
|
797
|
-
|
|
847
|
+
if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
|
|
848
|
+
return Utils.toArray(cachedNativeEncoder.encode(str));
|
|
798
849
|
} else {
|
|
799
850
|
// fallback, if no TextEncoder
|
|
800
851
|
let out = [],
|
|
@@ -824,7 +875,9 @@ class Utils {
|
|
|
824
875
|
}
|
|
825
876
|
static utf8ByteArrayToString(bytes) {
|
|
826
877
|
if (globalThis.TextDecoder) {
|
|
827
|
-
|
|
878
|
+
if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
|
|
879
|
+
const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
|
|
880
|
+
return cachedNativeDecoder.decode(view);
|
|
828
881
|
} else {
|
|
829
882
|
// fallback, if no TextDecoder
|
|
830
883
|
let out = [],
|
|
@@ -1121,15 +1174,34 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1121
1174
|
if (targetLength === 0) {
|
|
1122
1175
|
return fromIndex <= this.end ? fromIndex : -1;
|
|
1123
1176
|
}
|
|
1177
|
+
const firstByte = target[0];
|
|
1124
1178
|
let limit = this.end - targetLength;
|
|
1125
|
-
|
|
1126
|
-
|
|
1179
|
+
// Feature detection: Native TypedArray indexOf (ES2015)
|
|
1180
|
+
const hasNativeIndexOf = typeof source.indexOf === 'function';
|
|
1181
|
+
let i = fromIndex;
|
|
1182
|
+
while (i <= limit) {
|
|
1183
|
+
// Fast-forward to the first matching byte using C++ bindings if available
|
|
1184
|
+
if (hasNativeIndexOf) {
|
|
1185
|
+
i = source.indexOf(firstByte, i);
|
|
1186
|
+
if (i === -1 || i > limit) return -1;
|
|
1187
|
+
} else {
|
|
1188
|
+
// Fallback: Manual loop
|
|
1189
|
+
while (i <= limit && source[i] !== firstByte) i++;
|
|
1190
|
+
if (i > limit) return -1;
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1193
|
+
// First byte matches, verify the rest of the target sequence
|
|
1194
|
+
let match = true;
|
|
1195
|
+
for (let j = 1; j < targetLength; j++) {
|
|
1127
1196
|
if (source[i + j] !== target[j]) {
|
|
1197
|
+
match = false;
|
|
1128
1198
|
break;
|
|
1129
|
-
} else if (j === targetLength - 1) {
|
|
1130
|
-
return i;
|
|
1131
1199
|
}
|
|
1132
1200
|
}
|
|
1201
|
+
if (match) {
|
|
1202
|
+
return i;
|
|
1203
|
+
}
|
|
1204
|
+
i++;
|
|
1133
1205
|
}
|
|
1134
1206
|
return -1;
|
|
1135
1207
|
}
|
|
@@ -1201,8 +1273,10 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1201
1273
|
// Returns a bitmask of EMPTY_* flags.
|
|
1202
1274
|
context(pos) {
|
|
1203
1275
|
pos += this.start;
|
|
1204
|
-
|
|
1205
|
-
|
|
1276
|
+
|
|
1277
|
+
// We only care about ASCII word runes and newlines for context boundaries
|
|
1278
|
+
const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
|
|
1279
|
+
const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
|
|
1206
1280
|
return Utils.emptyOpContext(r1, r2);
|
|
1207
1281
|
}
|
|
1208
1282
|
prefixLength(re2) {
|
|
@@ -1332,10 +1406,6 @@ class RE2JSInternalException extends RE2JSException {
|
|
|
1332
1406
|
* @author rsc@google.com (Russ Cox)
|
|
1333
1407
|
*/
|
|
1334
1408
|
|
|
1335
|
-
/**
|
|
1336
|
-
* @typedef {import('./index').RE2JS} RE2JS_Pattern
|
|
1337
|
-
*/
|
|
1338
|
-
|
|
1339
1409
|
class Matcher {
|
|
1340
1410
|
/**
|
|
1341
1411
|
* Quotes '\' and '$' in {@code s}, so that the returned string could be used in
|
|
@@ -1373,8 +1443,8 @@ class Matcher {
|
|
|
1373
1443
|
}
|
|
1374
1444
|
/**
|
|
1375
1445
|
*
|
|
1376
|
-
* @param {
|
|
1377
|
-
* @param {
|
|
1446
|
+
* @param {RE2JS} pattern
|
|
1447
|
+
* @param {string|number[]|Uint8Array} input
|
|
1378
1448
|
*/
|
|
1379
1449
|
constructor(pattern, input) {
|
|
1380
1450
|
if (pattern === null) {
|
|
@@ -1382,7 +1452,7 @@ class Matcher {
|
|
|
1382
1452
|
}
|
|
1383
1453
|
/**
|
|
1384
1454
|
* The pattern being matched.
|
|
1385
|
-
* @type {
|
|
1455
|
+
* @type {RE2JS}
|
|
1386
1456
|
*/
|
|
1387
1457
|
this.patternInput = pattern;
|
|
1388
1458
|
const re2 = this.patternInput.re2();
|
|
@@ -1407,7 +1477,7 @@ class Matcher {
|
|
|
1407
1477
|
|
|
1408
1478
|
/**
|
|
1409
1479
|
* Returns the {@code RE2JS} associated with this {@code Matcher}.
|
|
1410
|
-
* @returns {
|
|
1480
|
+
* @returns {RE2JS}
|
|
1411
1481
|
*/
|
|
1412
1482
|
pattern() {
|
|
1413
1483
|
return this.patternInput;
|
|
@@ -1437,7 +1507,7 @@ class Matcher {
|
|
|
1437
1507
|
|
|
1438
1508
|
/**
|
|
1439
1509
|
* Resets the {@code Matcher} and changes the input.
|
|
1440
|
-
* @param {
|
|
1510
|
+
* @param {MatcherInputBase} input
|
|
1441
1511
|
* @returns {Matcher} the {@code Matcher} itself, for chained method calls
|
|
1442
1512
|
*/
|
|
1443
1513
|
resetMatcherInput(input) {
|
|
@@ -2213,7 +2283,7 @@ class Machine {
|
|
|
2213
2283
|
return Utils.emptyInts();
|
|
2214
2284
|
}
|
|
2215
2285
|
// Use subarray() to create a zero-copy view before converting
|
|
2216
|
-
return
|
|
2286
|
+
return Utils.toArray(this.matchcap.subarray(0, this.ncap));
|
|
2217
2287
|
}
|
|
2218
2288
|
|
|
2219
2289
|
// alloc() allocates a new thread with the given instruction.
|
|
@@ -3175,7 +3245,7 @@ class Backtracker {
|
|
|
3175
3245
|
}
|
|
3176
3246
|
|
|
3177
3247
|
// Must copy out only the first ncap entries so we don't accidentally leak stale trailing values from a previously recycled typed array
|
|
3178
|
-
const result = ncap === 0 ? [] :
|
|
3248
|
+
const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
|
|
3179
3249
|
bitStatePool.push(b);
|
|
3180
3250
|
return result;
|
|
3181
3251
|
}
|
|
@@ -3551,7 +3621,7 @@ class OnePass {
|
|
|
3551
3621
|
matchcap[0] = 0;
|
|
3552
3622
|
matchcap[1] = pos;
|
|
3553
3623
|
}
|
|
3554
|
-
return ncap === 0 ? [] :
|
|
3624
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3555
3625
|
}
|
|
3556
3626
|
case Inst.RUNE:
|
|
3557
3627
|
if (!inst.matchRune(rune)) return null;
|
|
@@ -3600,7 +3670,7 @@ class OnePass {
|
|
|
3600
3670
|
}
|
|
3601
3671
|
}
|
|
3602
3672
|
if (!matched) return null;
|
|
3603
|
-
return ncap === 0 ? [] :
|
|
3673
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3604
3674
|
}
|
|
3605
3675
|
}
|
|
3606
3676
|
|
|
@@ -5354,7 +5424,7 @@ class StringIterator {
|
|
|
5354
5424
|
return r;
|
|
5355
5425
|
}
|
|
5356
5426
|
lookingAt(s) {
|
|
5357
|
-
return this.
|
|
5427
|
+
return this.str.startsWith(s, this.position);
|
|
5358
5428
|
}
|
|
5359
5429
|
|
|
5360
5430
|
// Returns the rest of the pattern as a JS UTF-16 string.
|