re2js 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -15
- package/build/index.cjs.cjs +101 -27
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +101 -27
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +101 -27
- package/build/index.umd.js.map +1 -1
- package/package.json +5 -4
package/build/index.umd.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.3.1
|
|
5
|
+
* @version v2.3.2
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -633,6 +633,31 @@
|
|
|
633
633
|
}
|
|
634
634
|
}
|
|
635
635
|
|
|
636
|
+
/**
|
|
637
|
+
* Size of the precomputed single-byte lookup table.
|
|
638
|
+
* Covers standard ASCII and Latin-1 characters for fast-path execution.
|
|
639
|
+
*/
|
|
640
|
+
const FAST_PATH_TABLE_SIZE = 256;
|
|
641
|
+
/**
|
|
642
|
+
* Precomputed lookup table for Word Boundary (\b, \B) assertions.
|
|
643
|
+
* * By precomputing the boolean results for standard ASCII word ranges
|
|
644
|
+
* ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
|
|
645
|
+
* branches from the NFA's hot execution loop. This prevents costly
|
|
646
|
+
* CPU branch mispredictions when scanning large strings.
|
|
647
|
+
*/
|
|
648
|
+
const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
|
|
649
|
+
for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
|
|
650
|
+
WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
|
|
651
|
+
// 'a' - 'z'
|
|
652
|
+
65 <= i && i <= 90 ||
|
|
653
|
+
// 'A' - 'Z'
|
|
654
|
+
48 <= i && i <= 57 ||
|
|
655
|
+
// '0' - '9'
|
|
656
|
+
i === 95 // '_'
|
|
657
|
+
? 1 : 0;
|
|
658
|
+
}
|
|
659
|
+
let cachedNativeEncoder = null;
|
|
660
|
+
let cachedNativeDecoder = null;
|
|
636
661
|
/**
|
|
637
662
|
* Various constants and helper utilities.
|
|
638
663
|
*/
|
|
@@ -731,12 +756,21 @@
|
|
|
731
756
|
return out;
|
|
732
757
|
}
|
|
733
758
|
|
|
734
|
-
// Returns the array of runes in the specified
|
|
759
|
+
// Returns the array of runes in the specified JS UTF-16 string.
|
|
735
760
|
static stringToRunes(str) {
|
|
736
|
-
|
|
761
|
+
const string = String(str);
|
|
762
|
+
const runes = [];
|
|
763
|
+
let i = 0;
|
|
764
|
+
while (i < string.length) {
|
|
765
|
+
const cp = string.codePointAt(i);
|
|
766
|
+
runes.push(cp);
|
|
767
|
+
// Surrogate pairs (Emojis, etc.) are > 0xFFFF
|
|
768
|
+
i += cp > Unicode.MAX_BMP ? 2 : 1;
|
|
769
|
+
}
|
|
770
|
+
return runes;
|
|
737
771
|
}
|
|
738
772
|
|
|
739
|
-
// Returns the
|
|
773
|
+
// Returns the JS UTF-16 string containing the single rune |r|.
|
|
740
774
|
static runeToString(r) {
|
|
741
775
|
return String.fromCodePoint(r);
|
|
742
776
|
}
|
|
@@ -745,7 +779,7 @@
|
|
|
745
779
|
// during the evaluation of the \b and \B zero-width assertions.
|
|
746
780
|
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
|
|
747
781
|
static isWordRune(r) {
|
|
748
|
-
return
|
|
782
|
+
return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
|
|
749
783
|
}
|
|
750
784
|
|
|
751
785
|
// emptyOpContext returns the zero-width assertions satisfied at the position
|
|
@@ -758,21 +792,24 @@
|
|
|
758
792
|
static emptyOpContext(r1, r2) {
|
|
759
793
|
let op = 0;
|
|
760
794
|
if (r1 < 0) {
|
|
761
|
-
op |=
|
|
795
|
+
op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
|
|
762
796
|
}
|
|
763
|
-
|
|
764
|
-
|
|
797
|
+
// Hardcode 10 for '\n'
|
|
798
|
+
if (r1 === 10) {
|
|
799
|
+
op |= Utils.EMPTY_BEGIN_LINE;
|
|
765
800
|
}
|
|
766
801
|
if (r2 < 0) {
|
|
767
|
-
op |=
|
|
802
|
+
op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
|
|
768
803
|
}
|
|
769
|
-
|
|
770
|
-
|
|
804
|
+
|
|
805
|
+
// Hardcode 10 for '\n'
|
|
806
|
+
if (r2 === 10) {
|
|
807
|
+
op |= Utils.EMPTY_END_LINE;
|
|
771
808
|
}
|
|
772
|
-
if (
|
|
773
|
-
op |=
|
|
809
|
+
if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
|
|
810
|
+
op |= Utils.EMPTY_WORD_BOUNDARY;
|
|
774
811
|
} else {
|
|
775
|
-
op |=
|
|
812
|
+
op |= Utils.EMPTY_NO_WORD_BOUNDARY;
|
|
776
813
|
}
|
|
777
814
|
return op;
|
|
778
815
|
}
|
|
@@ -796,9 +833,23 @@
|
|
|
796
833
|
static charCount(codePoint) {
|
|
797
834
|
return codePoint > Unicode.MAX_BMP ? 2 : 1;
|
|
798
835
|
}
|
|
836
|
+
|
|
837
|
+
/**
|
|
838
|
+
* High-speed conversion from TypedArrays to standard JS Arrays.
|
|
839
|
+
* Bypasses the expensive Symbol.iterator overhead of Array.from()
|
|
840
|
+
*/
|
|
841
|
+
static toArray(typedArray) {
|
|
842
|
+
const len = typedArray.length;
|
|
843
|
+
const res = new Array(len);
|
|
844
|
+
for (let i = 0; i < len; i++) {
|
|
845
|
+
res[i] = typedArray[i];
|
|
846
|
+
}
|
|
847
|
+
return res;
|
|
848
|
+
}
|
|
799
849
|
static stringToUtf8ByteArray(str) {
|
|
800
850
|
if (globalThis.TextEncoder) {
|
|
801
|
-
|
|
851
|
+
if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
|
|
852
|
+
return Utils.toArray(cachedNativeEncoder.encode(str));
|
|
802
853
|
} else {
|
|
803
854
|
// fallback, if no TextEncoder
|
|
804
855
|
let out = [],
|
|
@@ -828,7 +879,9 @@
|
|
|
828
879
|
}
|
|
829
880
|
static utf8ByteArrayToString(bytes) {
|
|
830
881
|
if (globalThis.TextDecoder) {
|
|
831
|
-
|
|
882
|
+
if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
|
|
883
|
+
const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
|
|
884
|
+
return cachedNativeDecoder.decode(view);
|
|
832
885
|
} else {
|
|
833
886
|
// fallback, if no TextDecoder
|
|
834
887
|
let out = [],
|
|
@@ -1125,15 +1178,34 @@
|
|
|
1125
1178
|
if (targetLength === 0) {
|
|
1126
1179
|
return fromIndex <= this.end ? fromIndex : -1;
|
|
1127
1180
|
}
|
|
1181
|
+
const firstByte = target[0];
|
|
1128
1182
|
let limit = this.end - targetLength;
|
|
1129
|
-
|
|
1130
|
-
|
|
1183
|
+
// Feature detection: Native TypedArray indexOf (ES2015)
|
|
1184
|
+
const hasNativeIndexOf = typeof source.indexOf === 'function';
|
|
1185
|
+
let i = fromIndex;
|
|
1186
|
+
while (i <= limit) {
|
|
1187
|
+
// Fast-forward to the first matching byte using C++ bindings if available
|
|
1188
|
+
if (hasNativeIndexOf) {
|
|
1189
|
+
i = source.indexOf(firstByte, i);
|
|
1190
|
+
if (i === -1 || i > limit) return -1;
|
|
1191
|
+
} else {
|
|
1192
|
+
// Fallback: Manual loop
|
|
1193
|
+
while (i <= limit && source[i] !== firstByte) i++;
|
|
1194
|
+
if (i > limit) return -1;
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
// First byte matches, verify the rest of the target sequence
|
|
1198
|
+
let match = true;
|
|
1199
|
+
for (let j = 1; j < targetLength; j++) {
|
|
1131
1200
|
if (source[i + j] !== target[j]) {
|
|
1201
|
+
match = false;
|
|
1132
1202
|
break;
|
|
1133
|
-
} else if (j === targetLength - 1) {
|
|
1134
|
-
return i;
|
|
1135
1203
|
}
|
|
1136
1204
|
}
|
|
1205
|
+
if (match) {
|
|
1206
|
+
return i;
|
|
1207
|
+
}
|
|
1208
|
+
i++;
|
|
1137
1209
|
}
|
|
1138
1210
|
return -1;
|
|
1139
1211
|
}
|
|
@@ -1205,8 +1277,10 @@
|
|
|
1205
1277
|
// Returns a bitmask of EMPTY_* flags.
|
|
1206
1278
|
context(pos) {
|
|
1207
1279
|
pos += this.start;
|
|
1208
|
-
|
|
1209
|
-
|
|
1280
|
+
|
|
1281
|
+
// We only care about ASCII word runes and newlines for context boundaries
|
|
1282
|
+
const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
|
|
1283
|
+
const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
|
|
1210
1284
|
return Utils.emptyOpContext(r1, r2);
|
|
1211
1285
|
}
|
|
1212
1286
|
prefixLength(re2) {
|
|
@@ -2213,7 +2287,7 @@
|
|
|
2213
2287
|
return Utils.emptyInts();
|
|
2214
2288
|
}
|
|
2215
2289
|
// Use subarray() to create a zero-allocation view before converting
|
|
2216
|
-
return
|
|
2290
|
+
return Utils.toArray(this.matchcap.subarray(0, this.ncap));
|
|
2217
2291
|
}
|
|
2218
2292
|
|
|
2219
2293
|
// alloc() allocates a new thread with the given instruction.
|
|
@@ -3175,7 +3249,7 @@
|
|
|
3175
3249
|
}
|
|
3176
3250
|
|
|
3177
3251
|
// Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
|
|
3178
|
-
const result = ncap === 0 ? [] :
|
|
3252
|
+
const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
|
|
3179
3253
|
bitStatePool.push(b);
|
|
3180
3254
|
return result;
|
|
3181
3255
|
}
|
|
@@ -3551,7 +3625,7 @@
|
|
|
3551
3625
|
matchcap[0] = 0;
|
|
3552
3626
|
matchcap[1] = pos;
|
|
3553
3627
|
}
|
|
3554
|
-
return ncap === 0 ? [] :
|
|
3628
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3555
3629
|
}
|
|
3556
3630
|
case Inst.RUNE:
|
|
3557
3631
|
if (!inst.matchRune(rune)) return null;
|
|
@@ -3600,7 +3674,7 @@
|
|
|
3600
3674
|
}
|
|
3601
3675
|
}
|
|
3602
3676
|
if (!matched) return null;
|
|
3603
|
-
return ncap === 0 ? [] :
|
|
3677
|
+
return ncap === 0 ? [] : Utils.toArray(matchcap);
|
|
3604
3678
|
}
|
|
3605
3679
|
}
|
|
3606
3680
|
|
|
@@ -5354,7 +5428,7 @@
|
|
|
5354
5428
|
return r;
|
|
5355
5429
|
}
|
|
5356
5430
|
lookingAt(s) {
|
|
5357
|
-
return this.
|
|
5431
|
+
return this.str.startsWith(s, this.position);
|
|
5358
5432
|
}
|
|
5359
5433
|
|
|
5360
5434
|
// Returns the rest of the pattern as a Java UTF-16 string.
|