re2js 2.2.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -54
- package/build/index.cjs.cjs +425 -156
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +3 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +425 -156
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +425 -156
- package/build/index.umd.js.map +1 -1
- package/package.json +6 -6
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.0
|
|
5
|
+
* @version v2.2.1
|
|
6
6
|
* @author Alexey Vasiliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -78,6 +78,8 @@ const PublicFlags = {
|
|
|
78
78
|
const ASCII_SIZE = 128;
|
|
79
79
|
const ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
|
|
80
80
|
const ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
|
|
81
|
+
// The highest legal Basic Multilingual Plane (BMP) value.
|
|
82
|
+
const MAX_BMP = 0xffff;
|
|
81
83
|
for (let i = 0; i < ASCII_SIZE; i++) {
|
|
82
84
|
if (i >= 97 && i <= 122) {
|
|
83
85
|
// a-z
|
|
@@ -101,11 +103,13 @@ class Codepoint {
|
|
|
101
103
|
static toUpperCase(codepoint) {
|
|
102
104
|
if (codepoint < ASCII_SIZE) return ASCII_TO_UPPER[codepoint];
|
|
103
105
|
const s = String.fromCodePoint(codepoint).toUpperCase();
|
|
104
|
-
|
|
106
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
107
|
+
if (s.length > expectedLen) {
|
|
105
108
|
return codepoint;
|
|
106
109
|
}
|
|
107
110
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toLowerCase();
|
|
108
|
-
|
|
111
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
112
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
109
113
|
return codepoint;
|
|
110
114
|
}
|
|
111
115
|
return s.codePointAt(0);
|
|
@@ -116,11 +120,13 @@ class Codepoint {
|
|
|
116
120
|
static toLowerCase(codepoint) {
|
|
117
121
|
if (codepoint < ASCII_SIZE) return ASCII_TO_LOWER[codepoint];
|
|
118
122
|
const s = String.fromCodePoint(codepoint).toLowerCase();
|
|
119
|
-
|
|
123
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
124
|
+
if (s.length > expectedLen) {
|
|
120
125
|
return codepoint;
|
|
121
126
|
}
|
|
122
127
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toUpperCase();
|
|
123
|
-
|
|
128
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
129
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
124
130
|
return codepoint;
|
|
125
131
|
}
|
|
126
132
|
return s.codePointAt(0);
|
|
@@ -222,7 +228,7 @@ class UnicodeTables {
|
|
|
222
228
|
static _CASE_ORBIT = null;
|
|
223
229
|
static get CASE_ORBIT() {
|
|
224
230
|
if (!this._CASE_ORBIT) {
|
|
225
|
-
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+
|
|
231
|
+
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+B');
|
|
226
232
|
}
|
|
227
233
|
return this._CASE_ORBIT;
|
|
228
234
|
}
|
|
@@ -594,11 +600,16 @@ class Unicode {
|
|
|
594
600
|
// to compare it to |r2|.
|
|
595
601
|
// -1 is interpreted as the end-of-file mark.
|
|
596
602
|
static equalsIgnoreCase(r1, r2) {
|
|
597
|
-
// Runes already match
|
|
598
|
-
if (r1
|
|
603
|
+
// Runes already match
|
|
604
|
+
if (r1 === r2) {
|
|
599
605
|
return true;
|
|
600
606
|
}
|
|
601
607
|
|
|
608
|
+
// Safely fail if either is EOF (and they didn't explicitly match above)
|
|
609
|
+
if (r1 < 0 || r2 < 0) {
|
|
610
|
+
return false;
|
|
611
|
+
}
|
|
612
|
+
|
|
602
613
|
// Fast path for the common case where both runes are ASCII characters.
|
|
603
614
|
// Coerces both runes to lowercase if applicable.
|
|
604
615
|
if (r1 <= this.MAX_ASCII && r2 <= this.MAX_ASCII) {
|
|
@@ -851,7 +862,7 @@ class Utils {
|
|
|
851
862
|
// Encoding[(Encoding['UTF_16'] = 0)] = 'UTF_16'
|
|
852
863
|
// Encoding[(Encoding['UTF_8'] = 1)] = 'UTF_8'
|
|
853
864
|
const createEnum = (values = [], initNum = 0) => {
|
|
854
|
-
const enumObject =
|
|
865
|
+
const enumObject = Object.create(null);
|
|
855
866
|
for (let i = 0; i < values.length; i++) {
|
|
856
867
|
const val = values[i];
|
|
857
868
|
const keyVal = initNum + i;
|
|
@@ -993,6 +1004,9 @@ class MachineInputBase {
|
|
|
993
1004
|
hasString() {
|
|
994
1005
|
return false;
|
|
995
1006
|
}
|
|
1007
|
+
hasAnyString() {
|
|
1008
|
+
return false;
|
|
1009
|
+
}
|
|
996
1010
|
|
|
997
1011
|
// Helper for the exact-literal fast-path execution router
|
|
998
1012
|
prefixLength() {
|
|
@@ -1018,6 +1032,13 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1018
1032
|
return idx !== -1 && idx <= this.end - target.length;
|
|
1019
1033
|
}
|
|
1020
1034
|
|
|
1035
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1036
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1037
|
+
hasAnyString(prefilter, pos) {
|
|
1038
|
+
if (!prefilter.ac8) return false;
|
|
1039
|
+
return prefilter.ac8.searchUTF8(this.bytes, this.start + pos, this.end);
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1021
1042
|
// Returns the rune at the specified index; the units are
|
|
1022
1043
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1023
1044
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1036,17 +1057,23 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1036
1057
|
return c << 3 | 1;
|
|
1037
1058
|
} else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
|
|
1038
1059
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1060
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1039
1061
|
const rune = (c & 0x1f) << 6 | c1 & 0x3f;
|
|
1040
1062
|
return rune << 3 | 2;
|
|
1041
1063
|
} else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
|
|
1042
1064
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1065
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1043
1066
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1067
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1044
1068
|
const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
|
|
1045
1069
|
return rune << 3 | 3;
|
|
1046
1070
|
} else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
|
|
1047
1071
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1072
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1048
1073
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1074
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1049
1075
|
const c3 = this.bytes[pos + 3] & 0xff;
|
|
1076
|
+
if ((c3 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1050
1077
|
const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
|
|
1051
1078
|
return rune << 3 | 4;
|
|
1052
1079
|
} else {
|
|
@@ -1125,6 +1152,13 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1125
1152
|
return idx !== -1 && idx <= this.end - prefilter.str.length;
|
|
1126
1153
|
}
|
|
1127
1154
|
|
|
1155
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1156
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1157
|
+
hasAnyString(prefilter, pos) {
|
|
1158
|
+
if (!prefilter.ac16) return false;
|
|
1159
|
+
return prefilter.ac16.searchUTF16(this.charSequence, this.start + pos, this.end);
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1128
1162
|
// Returns the rune at the specified index; the units are
|
|
1129
1163
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1130
1164
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1570,7 +1604,15 @@ class Matcher {
|
|
|
1570
1604
|
if (this.hasMatch) {
|
|
1571
1605
|
start = this.groups[1];
|
|
1572
1606
|
if (this.groups[0] === this.groups[1]) {
|
|
1573
|
-
|
|
1607
|
+
// Safely calculate structural encoding width to avoid sequence corruption
|
|
1608
|
+
const machineInput = this.matcherInput.isUTF16Encoding() ? MachineInput.fromUTF16(this.matcherInput.asCharSequence(), 0, this.matcherInputLength) : MachineInput.fromUTF8(this.matcherInput.asBytes(), 0, this.matcherInputLength);
|
|
1609
|
+
const r = machineInput.step(start);
|
|
1610
|
+
if (r < 0) {
|
|
1611
|
+
// EOF
|
|
1612
|
+
start++; // Advance past length to force loop exit
|
|
1613
|
+
} else {
|
|
1614
|
+
start += r & 7; // Advance by safely decoded width
|
|
1615
|
+
}
|
|
1574
1616
|
}
|
|
1575
1617
|
}
|
|
1576
1618
|
return this.genMatch(start, RE2Flags.UNANCHORED);
|
|
@@ -1709,6 +1751,8 @@ class Matcher {
|
|
|
1709
1751
|
const groupName = replacement.substring(i + 1, j);
|
|
1710
1752
|
res += this.group(groupName);
|
|
1711
1753
|
last = j + 1;
|
|
1754
|
+
i = j;
|
|
1755
|
+
continue;
|
|
1712
1756
|
}
|
|
1713
1757
|
}
|
|
1714
1758
|
}
|
|
@@ -1788,6 +1832,7 @@ class Matcher {
|
|
|
1788
1832
|
if (j === replacement.length || replacement.codePointAt(j) !== Codepoint.CODES.get('>')) {
|
|
1789
1833
|
res += replacement.substring(i - 1, j + 1);
|
|
1790
1834
|
last = j + 1;
|
|
1835
|
+
i = j;
|
|
1791
1836
|
continue;
|
|
1792
1837
|
}
|
|
1793
1838
|
const groupName = replacement.substring(i + 1, j);
|
|
@@ -1797,6 +1842,8 @@ class Matcher {
|
|
|
1797
1842
|
res += `$<${groupName}>`;
|
|
1798
1843
|
}
|
|
1799
1844
|
last = j + 1;
|
|
1845
|
+
i = j;
|
|
1846
|
+
continue;
|
|
1800
1847
|
}
|
|
1801
1848
|
}
|
|
1802
1849
|
}
|
|
@@ -1921,6 +1968,8 @@ class Inst {
|
|
|
1921
1968
|
return r === r0;
|
|
1922
1969
|
}
|
|
1923
1970
|
const len = this.runes.length;
|
|
1971
|
+
if (len === 0) return false;
|
|
1972
|
+
|
|
1924
1973
|
// If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search
|
|
1925
1974
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1926
1975
|
for (let j = 0; j < len; j += 2) {
|
|
@@ -1934,22 +1983,19 @@ class Inst {
|
|
|
1934
1983
|
return false; // Stop here
|
|
1935
1984
|
}
|
|
1936
1985
|
|
|
1937
|
-
//
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
}
|
|
1947
|
-
lo = m + 1;
|
|
1948
|
-
} else {
|
|
1949
|
-
hi = m;
|
|
1950
|
-
}
|
|
1986
|
+
// Branchless Binary Search (Lower Bound)
|
|
1987
|
+
// Compiles to optimal conditional move (cmov) machine code, preventing
|
|
1988
|
+
// branch mispredictions on large, chaotic Unicode arrays
|
|
1989
|
+
let base = 0;
|
|
1990
|
+
let n = len >> 1;
|
|
1991
|
+
while (n > 1) {
|
|
1992
|
+
const half = n >> 1;
|
|
1993
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
1994
|
+
n -= half;
|
|
1951
1995
|
}
|
|
1952
|
-
|
|
1996
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
1997
|
+
const m = base - 1;
|
|
1998
|
+
return m >= 0 && r <= this.runes[m << 1 | 1];
|
|
1953
1999
|
}
|
|
1954
2000
|
|
|
1955
2001
|
// matchRunePos checks whether the instruction matches (and consumes) r.
|
|
@@ -1964,6 +2010,7 @@ class Inst {
|
|
|
1964
2010
|
return r === r0 ? 0 : -1;
|
|
1965
2011
|
}
|
|
1966
2012
|
const len = this.runes.length;
|
|
2013
|
+
if (len === 0) return -1;
|
|
1967
2014
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1968
2015
|
for (let j = 0; j < len; j += 2) {
|
|
1969
2016
|
if (r < this.runes[j]) return -1;
|
|
@@ -1971,19 +2018,18 @@ class Inst {
|
|
|
1971
2018
|
}
|
|
1972
2019
|
return -1;
|
|
1973
2020
|
}
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
} else {
|
|
1983
|
-
hi = m;
|
|
1984
|
-
}
|
|
2021
|
+
|
|
2022
|
+
// Branchless Binary Search (Lower Bound)
|
|
2023
|
+
let base = 0;
|
|
2024
|
+
let n = len >> 1;
|
|
2025
|
+
while (n > 1) {
|
|
2026
|
+
const half = n >> 1;
|
|
2027
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
2028
|
+
n -= half;
|
|
1985
2029
|
}
|
|
1986
|
-
|
|
2030
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2031
|
+
const m = base - 1;
|
|
2032
|
+
return m >= 0 && r <= this.runes[m << 1 | 1] ? m : -1;
|
|
1987
2033
|
}
|
|
1988
2034
|
/**
|
|
1989
2035
|
*
|
|
@@ -2082,6 +2128,7 @@ class Queue {
|
|
|
2082
2128
|
//
|
|
2083
2129
|
// Called by RE2.doExecute.
|
|
2084
2130
|
class Machine {
|
|
2131
|
+
static THREADS_CHUNK_SIZE = 128;
|
|
2085
2132
|
static fromRE2(re2) {
|
|
2086
2133
|
const m = new Machine();
|
|
2087
2134
|
m.prog = re2.prog;
|
|
@@ -2122,15 +2169,15 @@ class Machine {
|
|
|
2122
2169
|
resetCap() {
|
|
2123
2170
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2124
2171
|
const t = this.pool[i];
|
|
2125
|
-
t.cap.fill(
|
|
2172
|
+
t.cap.fill(-1);
|
|
2126
2173
|
}
|
|
2127
2174
|
}
|
|
2128
2175
|
initNewCap(ncap) {
|
|
2129
2176
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2130
2177
|
const t = this.pool[i];
|
|
2131
|
-
t.cap = new Int32Array(ncap);
|
|
2178
|
+
t.cap = new Int32Array(ncap).fill(-1);
|
|
2132
2179
|
}
|
|
2133
|
-
this.matchcap = new Int32Array(ncap);
|
|
2180
|
+
this.matchcap = new Int32Array(ncap).fill(-1);
|
|
2134
2181
|
}
|
|
2135
2182
|
submatches() {
|
|
2136
2183
|
if (this.ncap === 0) {
|
|
@@ -2143,14 +2190,21 @@ class Machine {
|
|
|
2143
2190
|
// alloc() allocates a new thread with the given instruction.
|
|
2144
2191
|
// It uses the free pool if possible.
|
|
2145
2192
|
alloc(inst) {
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2193
|
+
if (this.poolSize === 0) {
|
|
2194
|
+
const capLen = this.matchcap.length;
|
|
2195
|
+
|
|
2196
|
+
// Bulk allocate threads in a tight loop so the V8 engine
|
|
2197
|
+
// places them adjacently in the young generation heap
|
|
2198
|
+
for (let i = 0; i < Machine.THREADS_CHUNK_SIZE; i++) {
|
|
2199
|
+
const t = new Thread();
|
|
2200
|
+
t.cap = new Int32Array(capLen);
|
|
2201
|
+
this.pool[this.poolSize++] = t;
|
|
2202
|
+
}
|
|
2153
2203
|
}
|
|
2204
|
+
|
|
2205
|
+
// Pop a thread from the top of the pool stack
|
|
2206
|
+
this.poolSize--;
|
|
2207
|
+
const t = this.pool[this.poolSize];
|
|
2154
2208
|
t.inst = inst;
|
|
2155
2209
|
return t;
|
|
2156
2210
|
}
|
|
@@ -2203,6 +2257,9 @@ class Machine {
|
|
|
2203
2257
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
|
|
2204
2258
|
break;
|
|
2205
2259
|
}
|
|
2260
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2261
|
+
break;
|
|
2262
|
+
}
|
|
2206
2263
|
if (this.matched) {
|
|
2207
2264
|
break;
|
|
2208
2265
|
}
|
|
@@ -2280,6 +2337,9 @@ class Machine {
|
|
|
2280
2337
|
while (true) {
|
|
2281
2338
|
if (runq.isEmpty()) {
|
|
2282
2339
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
|
|
2340
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2341
|
+
break;
|
|
2342
|
+
}
|
|
2283
2343
|
}
|
|
2284
2344
|
if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2285
2345
|
// Spawn Lookbehind threads BEFORE the main pattern
|
|
@@ -2395,78 +2455,83 @@ class Machine {
|
|
|
2395
2455
|
runq.clear();
|
|
2396
2456
|
}
|
|
2397
2457
|
add(q, pc, pos, cap, cond, t) {
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
t = this.add(q, inst.arg, pos, cap, cond, t);
|
|
2413
|
-
break;
|
|
2414
|
-
case Inst.EMPTY_WIDTH:
|
|
2415
|
-
if ((inst.arg & ~cond) === 0) {
|
|
2416
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2417
|
-
}
|
|
2418
|
-
break;
|
|
2419
|
-
case Inst.NOP:
|
|
2420
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2421
|
-
break;
|
|
2422
|
-
case Inst.CAPTURE:
|
|
2423
|
-
if (inst.arg < this.ncap) {
|
|
2424
|
-
const opos = cap[inst.arg];
|
|
2425
|
-
cap[inst.arg] = pos;
|
|
2426
|
-
this.add(q, inst.out, pos, cap, cond, null);
|
|
2427
|
-
cap[inst.arg] = opos;
|
|
2428
|
-
} else {
|
|
2458
|
+
while (true) {
|
|
2459
|
+
if (pc === 0) {
|
|
2460
|
+
return t;
|
|
2461
|
+
}
|
|
2462
|
+
if (q.contains(pc)) {
|
|
2463
|
+
return t;
|
|
2464
|
+
}
|
|
2465
|
+
const d = q.add(pc);
|
|
2466
|
+
const inst = this.prog.inst[pc];
|
|
2467
|
+
switch (inst.op) {
|
|
2468
|
+
case Inst.FAIL:
|
|
2469
|
+
return t;
|
|
2470
|
+
case Inst.ALT:
|
|
2471
|
+
case Inst.ALT_MATCH:
|
|
2429
2472
|
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
case Inst.LB_CHECK:
|
|
2437
|
-
if (inst.lb > 0) {
|
|
2438
|
-
// Positive Lookbehind
|
|
2439
|
-
if (this.lbTable[inst.lb] === pos) {
|
|
2440
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2473
|
+
pc = inst.arg; // Flattened tail recursion
|
|
2474
|
+
continue;
|
|
2475
|
+
case Inst.EMPTY_WIDTH:
|
|
2476
|
+
if ((inst.arg & ~cond) === 0) {
|
|
2477
|
+
pc = inst.out; // Flattened tail recursion
|
|
2478
|
+
continue;
|
|
2441
2479
|
}
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
}
|
|
2457
|
-
if (this.ncap > 0 && t.cap !== cap) {
|
|
2458
|
-
// Direct assignment utilizing Typed Array performance
|
|
2459
|
-
for (let c = 0; c < this.ncap; c++) {
|
|
2460
|
-
t.cap[c] = cap[c];
|
|
2480
|
+
return t;
|
|
2481
|
+
case Inst.NOP:
|
|
2482
|
+
pc = inst.out; // Flattened tail recursion
|
|
2483
|
+
continue;
|
|
2484
|
+
case Inst.CAPTURE:
|
|
2485
|
+
if (inst.arg < this.ncap) {
|
|
2486
|
+
const opos = cap[inst.arg];
|
|
2487
|
+
cap[inst.arg] = pos;
|
|
2488
|
+
this.add(q, inst.out, pos, cap, cond, null);
|
|
2489
|
+
cap[inst.arg] = opos;
|
|
2490
|
+
return t;
|
|
2491
|
+
} else {
|
|
2492
|
+
pc = inst.out; // Flattened tail recursion
|
|
2493
|
+
continue;
|
|
2461
2494
|
}
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2495
|
+
case Inst.LB_WRITE:
|
|
2496
|
+
this.lbTable[Math.abs(inst.lb)] = pos;
|
|
2497
|
+
pc = inst.out;
|
|
2498
|
+
continue;
|
|
2499
|
+
case Inst.LB_CHECK:
|
|
2500
|
+
if (inst.lb > 0) {
|
|
2501
|
+
// Positive Lookbehind
|
|
2502
|
+
if (this.lbTable[inst.lb] === pos) {
|
|
2503
|
+
pc = inst.out; // Flattened tail recursion
|
|
2504
|
+
continue;
|
|
2505
|
+
}
|
|
2506
|
+
} else if (this.lbTable[-inst.lb] !== pos) {
|
|
2507
|
+
// Negative Lookbehind
|
|
2508
|
+
pc = inst.out; // Flattened tail recursion
|
|
2509
|
+
continue;
|
|
2510
|
+
}
|
|
2511
|
+
return t;
|
|
2512
|
+
case Inst.MATCH:
|
|
2513
|
+
case Inst.RUNE:
|
|
2514
|
+
case Inst.RUNE1:
|
|
2515
|
+
case Inst.RUNE_ANY:
|
|
2516
|
+
case Inst.RUNE_ANY_NOT_NL:
|
|
2517
|
+
if (t === null) {
|
|
2518
|
+
t = this.alloc(inst);
|
|
2519
|
+
} else {
|
|
2520
|
+
t.inst = inst;
|
|
2521
|
+
}
|
|
2522
|
+
if (this.ncap > 0 && t.cap !== cap) {
|
|
2523
|
+
// Direct assignment utilizing Typed Array performance
|
|
2524
|
+
for (let c = 0; c < this.ncap; c++) {
|
|
2525
|
+
t.cap[c] = cap[c];
|
|
2526
|
+
}
|
|
2527
|
+
}
|
|
2528
|
+
q.denseThreads[d] = t;
|
|
2529
|
+
t = null;
|
|
2530
|
+
return t;
|
|
2531
|
+
default:
|
|
2532
|
+
throw new Error('unhandled');
|
|
2533
|
+
}
|
|
2468
2534
|
}
|
|
2469
|
-
return t;
|
|
2470
2535
|
}
|
|
2471
2536
|
}
|
|
2472
2537
|
|
|
@@ -2494,8 +2559,15 @@ class DFAState {
|
|
|
2494
2559
|
this.nfaStates = nfaStates; // Int32Array of Instruction PCs
|
|
2495
2560
|
this.isMatch = isMatch; // Boolean
|
|
2496
2561
|
this.matchIDs = matchIDs; // Array of integers indicating which Set patterns matched
|
|
2497
|
-
|
|
2498
|
-
|
|
2562
|
+
|
|
2563
|
+
// Latin-1 (Unicode.MAX_LATIN1 + 1) flat arrays for blisteringly fast O(1) lookups
|
|
2564
|
+
// completely covering standard English, European languages, and 1-byte encodings.
|
|
2565
|
+
this.nextLatin1 = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2566
|
+
this.nextLatin1Anchored = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2567
|
+
// 2 arrays used as hash map for V8 optimization (N is small number, so O(n) faster than Map O(1))
|
|
2568
|
+
this.transKeys = [];
|
|
2569
|
+
this.transVals = [];
|
|
2570
|
+
this.lastSeen = 0; // Track when this state was last used for LRU eviction
|
|
2499
2571
|
}
|
|
2500
2572
|
}
|
|
2501
2573
|
class DFA {
|
|
@@ -2508,6 +2580,7 @@ class DFA {
|
|
|
2508
2580
|
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2509
2581
|
this.cacheClears = 0; // Track thrashing
|
|
2510
2582
|
this.failed = false; // mark if DFA cannot work with provided prog
|
|
2583
|
+
this.clock = 0; // Global clock for LRU eviction
|
|
2511
2584
|
}
|
|
2512
2585
|
|
|
2513
2586
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2567,6 +2640,7 @@ class DFA {
|
|
|
2567
2640
|
for (let i = 0; i < bucket.length; i++) {
|
|
2568
2641
|
const state = bucket[i];
|
|
2569
2642
|
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2643
|
+
state.lastSeen = ++this.clock;
|
|
2570
2644
|
return state;
|
|
2571
2645
|
}
|
|
2572
2646
|
}
|
|
@@ -2579,40 +2653,99 @@ class DFA {
|
|
|
2579
2653
|
if (this.failed) return null;
|
|
2580
2654
|
|
|
2581
2655
|
// Safety: prevent memory exhaustion from state explosion
|
|
2582
|
-
// We
|
|
2656
|
+
// We prune the cache to keep the newest 50%
|
|
2583
2657
|
if (this.stateCount >= this.stateLimit) {
|
|
2584
|
-
this.stateCache.clear();
|
|
2585
|
-
this.stateCount = 0;
|
|
2586
|
-
this.startState = null;
|
|
2587
2658
|
this.cacheClears++;
|
|
2588
2659
|
|
|
2589
2660
|
// If this regex causes continuous cache thrashing, permanently fall back to NFA
|
|
2590
2661
|
// to avoid spending CPU cycles constantly rebuilding the DFA tree.
|
|
2591
2662
|
if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
|
|
2592
2663
|
this.failed = true;
|
|
2664
|
+
this.stateCache.clear();
|
|
2665
|
+
this.stateCount = 0;
|
|
2666
|
+
this.startState = null;
|
|
2667
|
+
return null;
|
|
2668
|
+
}
|
|
2669
|
+
this.evictCache();
|
|
2670
|
+
|
|
2671
|
+
// After eviction, the bucket reference might be stale or empty.
|
|
2672
|
+
// We must re-fetch or re-create the bucket.
|
|
2673
|
+
bucket = this.stateCache.get(hash);
|
|
2674
|
+
if (!bucket) {
|
|
2675
|
+
bucket = [];
|
|
2676
|
+
this.stateCache.set(hash, bucket);
|
|
2593
2677
|
}
|
|
2594
|
-
return null;
|
|
2595
2678
|
}
|
|
2596
2679
|
|
|
2597
2680
|
// State not found, create it and add to bucket
|
|
2598
2681
|
const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.matchIDs);
|
|
2682
|
+
state.lastSeen = ++this.clock;
|
|
2599
2683
|
bucket.push(state);
|
|
2600
2684
|
this.stateCount++;
|
|
2601
2685
|
return state;
|
|
2602
2686
|
}
|
|
2687
|
+
evictCache() {
|
|
2688
|
+
const allStates = [];
|
|
2689
|
+
for (const bucket of this.stateCache.values()) {
|
|
2690
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2691
|
+
allStates.push(bucket[i]);
|
|
2692
|
+
}
|
|
2693
|
+
}
|
|
2694
|
+
|
|
2695
|
+
// Sort ascending by lastSeen (oldest first)
|
|
2696
|
+
allStates.sort((a, b) => a.lastSeen - b.lastSeen);
|
|
2697
|
+
|
|
2698
|
+
// Keep the newest 50%
|
|
2699
|
+
const keepCount = Math.max(1, Math.floor(this.stateLimit / 2));
|
|
2700
|
+
const startIndex = allStates.length - keepCount;
|
|
2701
|
+
const survivorsArray = allStates.slice(startIndex);
|
|
2702
|
+
const survivors = new Set(survivorsArray);
|
|
2703
|
+
this.stateCache.clear();
|
|
2704
|
+
this.stateCount = 0;
|
|
2705
|
+
for (let i = 0; i < survivorsArray.length; i++) {
|
|
2706
|
+
const state = survivorsArray[i];
|
|
2707
|
+
|
|
2708
|
+
// Sever ties to all states to prevent memory leaks and dangling pointers
|
|
2709
|
+
state.nextLatin1.fill(null);
|
|
2710
|
+
state.nextLatin1Anchored.fill(null);
|
|
2711
|
+
// zero-allocation cleanup
|
|
2712
|
+
state.transKeys.length = 0;
|
|
2713
|
+
state.transVals.length = 0;
|
|
2714
|
+
const hash = hashPCs(state.nfaStates);
|
|
2715
|
+
let bucket = this.stateCache.get(hash);
|
|
2716
|
+
if (!bucket) {
|
|
2717
|
+
bucket = [];
|
|
2718
|
+
this.stateCache.set(hash, bucket);
|
|
2719
|
+
}
|
|
2720
|
+
bucket.push(state);
|
|
2721
|
+
this.stateCount++;
|
|
2722
|
+
}
|
|
2723
|
+
|
|
2724
|
+
// Start state must either be preserved or nullified so it gets re-created
|
|
2725
|
+
if (this.startState && !survivors.has(this.startState)) {
|
|
2726
|
+
this.startState = null;
|
|
2727
|
+
}
|
|
2728
|
+
}
|
|
2603
2729
|
|
|
2604
2730
|
// Compute the next DFA state given a current state and a character
|
|
2605
2731
|
step(state, charCode, anchor) {
|
|
2606
|
-
// OPTIMIZATION:
|
|
2607
|
-
if (
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
return next;
|
|
2732
|
+
// OPTIMIZATION: Latin-1 Array Fast-Path
|
|
2733
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2734
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2735
|
+
const next = state.nextLatin1[charCode];
|
|
2736
|
+
if (next !== null) return next;
|
|
2737
|
+
} else {
|
|
2738
|
+
const next = state.nextLatin1Anchored[charCode];
|
|
2739
|
+
if (next !== null) return next;
|
|
2611
2740
|
}
|
|
2612
2741
|
} else {
|
|
2742
|
+
// Dense Array Linear Search fallback for Runes > 255
|
|
2613
2743
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2614
|
-
|
|
2615
|
-
|
|
2744
|
+
// get [key] -> nextState
|
|
2745
|
+
const keys = state.transKeys;
|
|
2746
|
+
const len = keys.length;
|
|
2747
|
+
for (let i = 0; i < len; i++) {
|
|
2748
|
+
if (keys[i] === key) return state.transVals[i];
|
|
2616
2749
|
}
|
|
2617
2750
|
}
|
|
2618
2751
|
const nextPCs = [];
|
|
@@ -2629,11 +2762,17 @@ class DFA {
|
|
|
2629
2762
|
const nextState = this.getState(nextPCs);
|
|
2630
2763
|
|
|
2631
2764
|
// Cache the result
|
|
2632
|
-
if (
|
|
2633
|
-
|
|
2765
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2766
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2767
|
+
state.nextLatin1[charCode] = nextState;
|
|
2768
|
+
} else {
|
|
2769
|
+
state.nextLatin1Anchored[charCode] = nextState;
|
|
2770
|
+
}
|
|
2634
2771
|
} else {
|
|
2635
2772
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2636
|
-
|
|
2773
|
+
// store key -> nextState
|
|
2774
|
+
state.transKeys.push(key);
|
|
2775
|
+
state.transVals.push(nextState);
|
|
2637
2776
|
}
|
|
2638
2777
|
return nextState;
|
|
2639
2778
|
}
|
|
@@ -2666,10 +2805,11 @@ class DFA {
|
|
|
2666
2805
|
if (width === 0) {
|
|
2667
2806
|
break;
|
|
2668
2807
|
}
|
|
2669
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2808
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2670
2809
|
|
|
2671
2810
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
2672
2811
|
if (currentState === null) return null;
|
|
2812
|
+
currentState.lastSeen = ++this.clock;
|
|
2673
2813
|
if (currentState.isMatch) {
|
|
2674
2814
|
if (anchor === RE2Flags.ANCHOR_BOTH) {
|
|
2675
2815
|
if (i + width === endPos) return true;
|
|
@@ -2717,9 +2857,10 @@ class DFA {
|
|
|
2717
2857
|
const rune = r >> 3;
|
|
2718
2858
|
const width = r & 7;
|
|
2719
2859
|
if (width === 0) break;
|
|
2720
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2860
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2721
2861
|
if (currentState === null) return null; // Bailout to NFA
|
|
2722
2862
|
|
|
2863
|
+
currentState.lastSeen = ++this.clock;
|
|
2723
2864
|
i += width;
|
|
2724
2865
|
checkMatch(currentState, i);
|
|
2725
2866
|
if (currentState.nfaStates.length === 0) {
|
|
@@ -2757,7 +2898,7 @@ class BitState {
|
|
|
2757
2898
|
// Bitwise shift (>>> 5) instead of Math.floor( / 32)
|
|
2758
2899
|
const visitedSize = prog.numInst() * (end + 1) + VISITED_BITS - 1 >>> 5;
|
|
2759
2900
|
if (this.visited.length < visitedSize) {
|
|
2760
|
-
this.visited = new Uint32Array(
|
|
2901
|
+
this.visited = new Uint32Array(visitedSize);
|
|
2761
2902
|
} else {
|
|
2762
2903
|
this.visited.fill(0, 0, visitedSize);
|
|
2763
2904
|
}
|
|
@@ -2843,11 +2984,12 @@ class BitState {
|
|
|
2843
2984
|
const outInst = re2.prog.getInst(inst.out);
|
|
2844
2985
|
if (Inst.isRuneOp(outInst.op)) {
|
|
2845
2986
|
this.push(re2, inst.arg, currentPos, false);
|
|
2846
|
-
currentPc = inst.
|
|
2987
|
+
currentPc = inst.arg;
|
|
2988
|
+
currentPos = this.end;
|
|
2847
2989
|
continue;
|
|
2848
2990
|
}
|
|
2849
2991
|
this.push(re2, inst.out, this.end, false);
|
|
2850
|
-
currentPc = inst.
|
|
2992
|
+
currentPc = inst.out;
|
|
2851
2993
|
continue;
|
|
2852
2994
|
}
|
|
2853
2995
|
case Inst.RUNE:
|
|
@@ -2928,6 +3070,11 @@ class BitState {
|
|
|
2928
3070
|
if (currentPos === this.end) return true;
|
|
2929
3071
|
break;
|
|
2930
3072
|
}
|
|
3073
|
+
case Inst.LB_WRITE:
|
|
3074
|
+
case Inst.LB_CHECK:
|
|
3075
|
+
{
|
|
3076
|
+
throw new RE2JSInternalException('Backtracker cannot evaluate Lookbehind instructions');
|
|
3077
|
+
}
|
|
2931
3078
|
default:
|
|
2932
3079
|
{
|
|
2933
3080
|
throw new RE2JSInternalException('bad inst');
|
|
@@ -3364,6 +3511,10 @@ class OnePass {
|
|
|
3364
3511
|
switch (inst.op) {
|
|
3365
3512
|
case Inst.MATCH:
|
|
3366
3513
|
{
|
|
3514
|
+
// Verify ANCHOR_BOTH constraint before accepting the match
|
|
3515
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) {
|
|
3516
|
+
return null;
|
|
3517
|
+
}
|
|
3367
3518
|
matched = true;
|
|
3368
3519
|
if (matchcap.length > 0) {
|
|
3369
3520
|
matchcap[0] = 0;
|
|
@@ -3794,6 +3945,88 @@ class Regexp {
|
|
|
3794
3945
|
}
|
|
3795
3946
|
}
|
|
3796
3947
|
|
|
3948
|
+
// Single-pass Aho-Corasick multi-pattern matcher.
// Builds a trie over the given patterns, adds failure links, and then answers
// "does this window contain ANY of the patterns?" in one left-to-right scan.
class AhoCorasick {
  /**
   * Builds the automaton.
   *
   * State layout (parallel arrays indexed by node id, node 0 is the root):
   *   - next[node]  : null-prototype object mapping a code unit / byte to a child node id
   *   - fail[node]  : node id to fall back to when no transition exists
   *   - match[node] : true when some pattern ends at this node or at a node
   *                   reachable through its failure chain
   *
   * @param {Iterable<ArrayLike<number>>} wordArrays patterns, each a sequence of
   *   numeric units (UTF-16 code units or bytes; the caller decides which).
   */
  constructor(wordArrays) {
    this.next = [Object.create(null)];
    this.fail = [0];
    this.match = [false];

    // Build the trie: one node per distinct pattern prefix.
    for (const word of wordArrays) {
      let node = 0;
      for (let i = 0; i < word.length; i++) {
        const unit = word[i];
        let child = this.next[node][unit];
        if (child === undefined) {
          child = this.next.length;
          this.next.push(Object.create(null));
          this.fail.push(0);
          this.match.push(false);
          this.next[node][unit] = child;
        }
        node = child;
      }
      // An empty pattern marks the root itself; the search methods treat an
      // accepting root as "every window matches".
      this.match[node] = true;
    }

    // Build failure links breadth-first. The queue is consumed through a
    // moving head index instead of Array.prototype.shift(), which would
    // re-index the array on every dequeue.
    const queue = [];
    const root = this.next[0];
    for (const key in root) {
      const child = root[key];
      this.fail[child] = 0;
      queue.push(child);
    }
    for (let head = 0; head < queue.length; head++) {
      const curr = queue[head];
      const edges = this.next[curr];
      for (const key in edges) {
        const child = edges[key];
        // Walk the parent's failure chain until a node with the same outgoing
        // edge is found, or the root is reached.
        let fallback = this.fail[curr];
        while (fallback !== 0 && !(key in this.next[fallback])) {
          fallback = this.fail[fallback];
        }
        const target = this.next[fallback][key];
        this.fail[child] = target === undefined ? 0 : target;
        // Inherit acceptance from the failure target so a pattern that is a
        // suffix of the current prefix is reported without extra walking.
        this.match[child] = this.match[child] || this.match[this.fail[child]];
        queue.push(child);
      }
    }
  }

  /**
   * Advances the automaton by one input unit.
   *
   * @param {number} node current node id
   * @param {number} unit next input unit (code unit or byte)
   * @returns {number} the node reached after following failure links as
   *   needed; 0 (root) when no pattern prefix ends with this unit
   */
  step(node, unit) {
    let state = node;
    let target = this.next[state][unit];
    while (target === undefined && state !== 0) {
      state = this.fail[state];
      target = this.next[state][unit];
    }
    return target === undefined ? 0 : target;
  }

  /**
   * Scans UTF-16 code units of `charSeq` in the half-open range [start, end).
   *
   * @param {string} charSeq text to scan (read via charCodeAt)
   * @param {number} start first index to read
   * @param {number} end index one past the last unit to read
   * @returns {boolean} true as soon as any pattern is found in the window
   */
  searchUTF16(charSeq, start, end) {
    // An empty pattern is contained in every window, including an empty one.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      node = this.step(node, charSeq.charCodeAt(i));
      if (this.match[node]) return true;
    }
    return false;
  }

  /**
   * Scans elements of `bytes` in the half-open range [start, end).
   *
   * @param {ArrayLike<number>} bytes indexable sequence of numeric units
   * @param {number} start first index to read
   * @param {number} end index one past the last element to read
   * @returns {boolean} true as soon as any pattern is found in the window
   */
  searchUTF8(bytes, start, end) {
    // An empty pattern is contained in every window, including an empty one.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      node = this.step(node, bytes[i]);
      if (this.match[node]) return true;
    }
    return false;
  }
}
|
|
3797
4030
|
class Prefilter {
|
|
3798
4031
|
static Type = {
|
|
3799
4032
|
NONE: 0,
|
|
@@ -3806,6 +4039,8 @@ class Prefilter {
|
|
|
3806
4039
|
this.subs = [];
|
|
3807
4040
|
this.str = '';
|
|
3808
4041
|
this.bytes = null;
|
|
4042
|
+
this.ac16 = null;
|
|
4043
|
+
this.ac8 = null;
|
|
3809
4044
|
}
|
|
3810
4045
|
eval(input, pos) {
|
|
3811
4046
|
switch (this.type) {
|
|
@@ -3819,6 +4054,10 @@ class Prefilter {
|
|
|
3819
4054
|
}
|
|
3820
4055
|
return true;
|
|
3821
4056
|
case Prefilter.Type.OR:
|
|
4057
|
+
// Exploit Aho-Corasick if it was successfully built
|
|
4058
|
+
if (this.ac16 && this.ac8) {
|
|
4059
|
+
return input.hasAnyString(this, pos);
|
|
4060
|
+
}
|
|
3822
4061
|
for (let i = 0; i < this.subs.length; i++) {
|
|
3823
4062
|
if (this.subs[i].eval(input, pos)) return true;
|
|
3824
4063
|
}
|
|
@@ -3909,7 +4148,9 @@ class PrefilterTree {
|
|
|
3909
4148
|
const s = PrefilterTree.simplify(sub);
|
|
3910
4149
|
if (s.type !== Prefilter.Type.NONE) {
|
|
3911
4150
|
if (s.type === Prefilter.Type.AND) {
|
|
3912
|
-
|
|
4151
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4152
|
+
newSubs.push(s.subs[j]);
|
|
4153
|
+
}
|
|
3913
4154
|
} else {
|
|
3914
4155
|
newSubs.push(s);
|
|
3915
4156
|
}
|
|
@@ -3951,6 +4192,27 @@ class PrefilterTree {
|
|
|
3951
4192
|
}
|
|
3952
4193
|
}
|
|
3953
4194
|
pf.subs = uniqueSubs;
|
|
4195
|
+
|
|
4196
|
+
// Build an Aho-Corasick automaton if all children are exact matches
|
|
4197
|
+
let allExact = true;
|
|
4198
|
+
for (const sub of uniqueSubs) {
|
|
4199
|
+
if (sub.type !== Prefilter.Type.EXACT) {
|
|
4200
|
+
allExact = false;
|
|
4201
|
+
break;
|
|
4202
|
+
}
|
|
4203
|
+
}
|
|
4204
|
+
if (allExact && uniqueSubs.length > 1) {
|
|
4205
|
+
const words16 = uniqueSubs.map(s => {
|
|
4206
|
+
const arr = [];
|
|
4207
|
+
for (let i = 0; i < s.str.length; i++) {
|
|
4208
|
+
arr.push(s.str.charCodeAt(i));
|
|
4209
|
+
}
|
|
4210
|
+
return arr;
|
|
4211
|
+
});
|
|
4212
|
+
pf.ac16 = new AhoCorasick(words16);
|
|
4213
|
+
const words8 = uniqueSubs.map(s => s.bytes);
|
|
4214
|
+
pf.ac8 = new AhoCorasick(words8);
|
|
4215
|
+
}
|
|
3954
4216
|
return pf;
|
|
3955
4217
|
}
|
|
3956
4218
|
return pf;
|
|
@@ -4477,7 +4739,9 @@ class Simplify {
|
|
|
4477
4739
|
// Flatten nested concatenations
|
|
4478
4740
|
if (nsub.op === Regexp.Op.CONCAT) {
|
|
4479
4741
|
changed = true;
|
|
4480
|
-
|
|
4742
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4743
|
+
newSubs.push(nsub.subs[j]);
|
|
4744
|
+
}
|
|
4481
4745
|
continue;
|
|
4482
4746
|
}
|
|
4483
4747
|
} else if (re.op === Regexp.Op.ALTERNATE) {
|
|
@@ -4489,7 +4753,9 @@ class Simplify {
|
|
|
4489
4753
|
// Flatten nested alternations
|
|
4490
4754
|
if (nsub.op === Regexp.Op.ALTERNATE) {
|
|
4491
4755
|
changed = true;
|
|
4492
|
-
|
|
4756
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4757
|
+
newSubs.push(nsub.subs[j]);
|
|
4758
|
+
}
|
|
4493
4759
|
continue;
|
|
4494
4760
|
}
|
|
4495
4761
|
}
|
|
@@ -5499,7 +5765,10 @@ class Parser {
|
|
|
5499
5765
|
return t.pop();
|
|
5500
5766
|
}
|
|
5501
5767
|
static concatRunes(x, y) {
|
|
5502
|
-
|
|
5768
|
+
for (let i = 0; i < y.length; i++) {
|
|
5769
|
+
x.push(y[i]);
|
|
5770
|
+
}
|
|
5771
|
+
return x;
|
|
5503
5772
|
}
|
|
5504
5773
|
constructor(wholeRegexp, flags = 0) {
|
|
5505
5774
|
this.wholeRegexp = wholeRegexp;
|
|
@@ -5535,8 +5804,8 @@ class Parser {
|
|
|
5535
5804
|
return re;
|
|
5536
5805
|
}
|
|
5537
5806
|
reuse(re) {
|
|
5538
|
-
if (this.height !== null &&
|
|
5539
|
-
|
|
5807
|
+
if (this.height !== null && this.height.has(re)) {
|
|
5808
|
+
this.height.delete(re);
|
|
5540
5809
|
}
|
|
5541
5810
|
if (re.subs !== null && re.subs.length > 0) {
|
|
5542
5811
|
re.subs[0] = this.free;
|
|
@@ -5568,20 +5837,20 @@ class Parser {
|
|
|
5568
5837
|
if (n <= 0) {
|
|
5569
5838
|
n = 1;
|
|
5570
5839
|
}
|
|
5571
|
-
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
5840
|
+
if (n > Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5572
5841
|
this.repeats = Parser.MAX_SIZE;
|
|
5573
5842
|
} else {
|
|
5574
5843
|
this.repeats *= n;
|
|
5575
5844
|
}
|
|
5576
5845
|
}
|
|
5577
|
-
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
5846
|
+
if (this.numRegexp < Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5578
5847
|
return;
|
|
5579
5848
|
}
|
|
5580
5849
|
|
|
5581
5850
|
// We need to start tracking size.
|
|
5582
5851
|
// Make the map and belatedly populate it
|
|
5583
5852
|
// with info about everything we've constructed so far.
|
|
5584
|
-
this.size =
|
|
5853
|
+
this.size = new Map();
|
|
5585
5854
|
for (let reEx of this.stack) {
|
|
5586
5855
|
this.checkSize(reEx);
|
|
5587
5856
|
}
|
|
@@ -5592,8 +5861,8 @@ class Parser {
|
|
|
5592
5861
|
}
|
|
5593
5862
|
calcSize(re, force = false) {
|
|
5594
5863
|
if (!force && this.size !== null) {
|
|
5595
|
-
if (
|
|
5596
|
-
return this.size
|
|
5864
|
+
if (this.size.has(re)) {
|
|
5865
|
+
return this.size.get(re);
|
|
5597
5866
|
}
|
|
5598
5867
|
}
|
|
5599
5868
|
let size = 0;
|
|
@@ -5653,9 +5922,9 @@ class Parser {
|
|
|
5653
5922
|
}
|
|
5654
5923
|
size = Math.max(1, size);
|
|
5655
5924
|
if (this.size === null) {
|
|
5656
|
-
this.size =
|
|
5925
|
+
this.size = new Map();
|
|
5657
5926
|
}
|
|
5658
|
-
this.size
|
|
5927
|
+
this.size.set(re, size);
|
|
5659
5928
|
return size;
|
|
5660
5929
|
}
|
|
5661
5930
|
checkHeight(re) {
|
|
@@ -5663,7 +5932,7 @@ class Parser {
|
|
|
5663
5932
|
return;
|
|
5664
5933
|
}
|
|
5665
5934
|
if (this.height === null) {
|
|
5666
|
-
this.height =
|
|
5935
|
+
this.height = new Map();
|
|
5667
5936
|
for (let reEx of this.stack) {
|
|
5668
5937
|
this.checkHeight(reEx);
|
|
5669
5938
|
}
|
|
@@ -5674,8 +5943,8 @@ class Parser {
|
|
|
5674
5943
|
}
|
|
5675
5944
|
calcHeight(re, force = false) {
|
|
5676
5945
|
if (!force && this.height !== null) {
|
|
5677
|
-
if (
|
|
5678
|
-
return this.height
|
|
5946
|
+
if (this.height.has(re)) {
|
|
5947
|
+
return this.height.get(re);
|
|
5679
5948
|
}
|
|
5680
5949
|
}
|
|
5681
5950
|
let h = 1;
|
|
@@ -5686,9 +5955,9 @@ class Parser {
|
|
|
5686
5955
|
}
|
|
5687
5956
|
}
|
|
5688
5957
|
if (this.height === null) {
|
|
5689
|
-
this.height =
|
|
5958
|
+
this.height = new Map();
|
|
5690
5959
|
}
|
|
5691
|
-
this.height
|
|
5960
|
+
this.height.set(re, h);
|
|
5692
5961
|
return h;
|
|
5693
5962
|
}
|
|
5694
5963
|
|
|
@@ -7810,7 +8079,7 @@ class TranslateRegExpString {
|
|
|
7810
8079
|
changed = true;
|
|
7811
8080
|
continue;
|
|
7812
8081
|
} else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
7813
|
-
if (i + 3
|
|
8082
|
+
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
7814
8083
|
result += '(?P<';
|
|
7815
8084
|
i += 3;
|
|
7816
8085
|
changed = true;
|