re2js 2.2.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -54
- package/build/index.cjs.cjs +425 -156
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +3 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +425 -156
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +425 -156
- package/build/index.umd.js.map +1 -1
- package/package.json +6 -6
package/build/index.esm.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.0
|
|
5
|
+
* @version v2.2.1
|
|
6
6
|
* @author Alexey Vasiliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -76,6 +76,8 @@ const PublicFlags = {
|
|
|
76
76
|
const ASCII_SIZE = 128;
|
|
77
77
|
const ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
|
|
78
78
|
const ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
|
|
79
|
+
// The highest legal Basic Multilingual Plane (BMP) value.
|
|
80
|
+
const MAX_BMP = 0xffff;
|
|
79
81
|
for (let i = 0; i < ASCII_SIZE; i++) {
|
|
80
82
|
if (i >= 97 && i <= 122) {
|
|
81
83
|
// a-z
|
|
@@ -99,11 +101,13 @@ class Codepoint {
|
|
|
99
101
|
static toUpperCase(codepoint) {
|
|
100
102
|
if (codepoint < ASCII_SIZE) return ASCII_TO_UPPER[codepoint];
|
|
101
103
|
const s = String.fromCodePoint(codepoint).toUpperCase();
|
|
102
|
-
|
|
104
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
105
|
+
if (s.length > expectedLen) {
|
|
103
106
|
return codepoint;
|
|
104
107
|
}
|
|
105
108
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toLowerCase();
|
|
106
|
-
|
|
109
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
110
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
107
111
|
return codepoint;
|
|
108
112
|
}
|
|
109
113
|
return s.codePointAt(0);
|
|
@@ -114,11 +118,13 @@ class Codepoint {
|
|
|
114
118
|
static toLowerCase(codepoint) {
|
|
115
119
|
if (codepoint < ASCII_SIZE) return ASCII_TO_LOWER[codepoint];
|
|
116
120
|
const s = String.fromCodePoint(codepoint).toLowerCase();
|
|
117
|
-
|
|
121
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
122
|
+
if (s.length > expectedLen) {
|
|
118
123
|
return codepoint;
|
|
119
124
|
}
|
|
120
125
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toUpperCase();
|
|
121
|
-
|
|
126
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
127
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
122
128
|
return codepoint;
|
|
123
129
|
}
|
|
124
130
|
return s.codePointAt(0);
|
|
@@ -220,7 +226,7 @@ class UnicodeTables {
|
|
|
220
226
|
static _CASE_ORBIT = null;
|
|
221
227
|
static get CASE_ORBIT() {
|
|
222
228
|
if (!this._CASE_ORBIT) {
|
|
223
|
-
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+
|
|
229
|
+
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+B');
|
|
224
230
|
}
|
|
225
231
|
return this._CASE_ORBIT;
|
|
226
232
|
}
|
|
@@ -592,11 +598,16 @@ class Unicode {
|
|
|
592
598
|
// to compare it to |r2|.
|
|
593
599
|
// -1 is interpreted as the end-of-file mark.
|
|
594
600
|
static equalsIgnoreCase(r1, r2) {
|
|
595
|
-
// Runes already match
|
|
596
|
-
if (r1
|
|
601
|
+
// Runes already match
|
|
602
|
+
if (r1 === r2) {
|
|
597
603
|
return true;
|
|
598
604
|
}
|
|
599
605
|
|
|
606
|
+
// Safely fail if either is EOF (and they didn't explicitly match above)
|
|
607
|
+
if (r1 < 0 || r2 < 0) {
|
|
608
|
+
return false;
|
|
609
|
+
}
|
|
610
|
+
|
|
600
611
|
// Fast path for the common case where both runes are ASCII characters.
|
|
601
612
|
// Coerces both runes to lowercase if applicable.
|
|
602
613
|
if (r1 <= this.MAX_ASCII && r2 <= this.MAX_ASCII) {
|
|
@@ -849,7 +860,7 @@ class Utils {
|
|
|
849
860
|
// Encoding[(Encoding['UTF_16'] = 0)] = 'UTF_16'
|
|
850
861
|
// Encoding[(Encoding['UTF_8'] = 1)] = 'UTF_8'
|
|
851
862
|
const createEnum = (values = [], initNum = 0) => {
|
|
852
|
-
const enumObject =
|
|
863
|
+
const enumObject = Object.create(null);
|
|
853
864
|
for (let i = 0; i < values.length; i++) {
|
|
854
865
|
const val = values[i];
|
|
855
866
|
const keyVal = initNum + i;
|
|
@@ -991,6 +1002,9 @@ class MachineInputBase {
|
|
|
991
1002
|
hasString() {
|
|
992
1003
|
return false;
|
|
993
1004
|
}
|
|
1005
|
+
hasAnyString() {
|
|
1006
|
+
return false;
|
|
1007
|
+
}
|
|
994
1008
|
|
|
995
1009
|
// Helper for the exact-literal fast-path execution router
|
|
996
1010
|
prefixLength() {
|
|
@@ -1016,6 +1030,13 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1016
1030
|
return idx !== -1 && idx <= this.end - target.length;
|
|
1017
1031
|
}
|
|
1018
1032
|
|
|
1033
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1034
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1035
|
+
hasAnyString(prefilter, pos) {
|
|
1036
|
+
if (!prefilter.ac8) return false;
|
|
1037
|
+
return prefilter.ac8.searchUTF8(this.bytes, this.start + pos, this.end);
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1019
1040
|
// Returns the rune at the specified index; the units are
|
|
1020
1041
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1021
1042
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1034,17 +1055,23 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1034
1055
|
return c << 3 | 1;
|
|
1035
1056
|
} else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
|
|
1036
1057
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1058
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1037
1059
|
const rune = (c & 0x1f) << 6 | c1 & 0x3f;
|
|
1038
1060
|
return rune << 3 | 2;
|
|
1039
1061
|
} else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
|
|
1040
1062
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1063
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1041
1064
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1065
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1042
1066
|
const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
|
|
1043
1067
|
return rune << 3 | 3;
|
|
1044
1068
|
} else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
|
|
1045
1069
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1070
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1046
1071
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1072
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1047
1073
|
const c3 = this.bytes[pos + 3] & 0xff;
|
|
1074
|
+
if ((c3 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1048
1075
|
const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
|
|
1049
1076
|
return rune << 3 | 4;
|
|
1050
1077
|
} else {
|
|
@@ -1123,6 +1150,13 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1123
1150
|
return idx !== -1 && idx <= this.end - prefilter.str.length;
|
|
1124
1151
|
}
|
|
1125
1152
|
|
|
1153
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1154
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1155
|
+
hasAnyString(prefilter, pos) {
|
|
1156
|
+
if (!prefilter.ac16) return false;
|
|
1157
|
+
return prefilter.ac16.searchUTF16(this.charSequence, this.start + pos, this.end);
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1126
1160
|
// Returns the rune at the specified index; the units are
|
|
1127
1161
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1128
1162
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1568,7 +1602,15 @@ class Matcher {
|
|
|
1568
1602
|
if (this.hasMatch) {
|
|
1569
1603
|
start = this.groups[1];
|
|
1570
1604
|
if (this.groups[0] === this.groups[1]) {
|
|
1571
|
-
|
|
1605
|
+
// Safely calculate structural encoding width to avoid sequence corruption
|
|
1606
|
+
const machineInput = this.matcherInput.isUTF16Encoding() ? MachineInput.fromUTF16(this.matcherInput.asCharSequence(), 0, this.matcherInputLength) : MachineInput.fromUTF8(this.matcherInput.asBytes(), 0, this.matcherInputLength);
|
|
1607
|
+
const r = machineInput.step(start);
|
|
1608
|
+
if (r < 0) {
|
|
1609
|
+
// EOF
|
|
1610
|
+
start++; // Advance past length to force loop exit
|
|
1611
|
+
} else {
|
|
1612
|
+
start += r & 7; // Advance by safely decoded width
|
|
1613
|
+
}
|
|
1572
1614
|
}
|
|
1573
1615
|
}
|
|
1574
1616
|
return this.genMatch(start, RE2Flags.UNANCHORED);
|
|
@@ -1707,6 +1749,8 @@ class Matcher {
|
|
|
1707
1749
|
const groupName = replacement.substring(i + 1, j);
|
|
1708
1750
|
res += this.group(groupName);
|
|
1709
1751
|
last = j + 1;
|
|
1752
|
+
i = j;
|
|
1753
|
+
continue;
|
|
1710
1754
|
}
|
|
1711
1755
|
}
|
|
1712
1756
|
}
|
|
@@ -1786,6 +1830,7 @@ class Matcher {
|
|
|
1786
1830
|
if (j === replacement.length || replacement.codePointAt(j) !== Codepoint.CODES.get('>')) {
|
|
1787
1831
|
res += replacement.substring(i - 1, j + 1);
|
|
1788
1832
|
last = j + 1;
|
|
1833
|
+
i = j;
|
|
1789
1834
|
continue;
|
|
1790
1835
|
}
|
|
1791
1836
|
const groupName = replacement.substring(i + 1, j);
|
|
@@ -1795,6 +1840,8 @@ class Matcher {
|
|
|
1795
1840
|
res += `$<${groupName}>`;
|
|
1796
1841
|
}
|
|
1797
1842
|
last = j + 1;
|
|
1843
|
+
i = j;
|
|
1844
|
+
continue;
|
|
1798
1845
|
}
|
|
1799
1846
|
}
|
|
1800
1847
|
}
|
|
@@ -1919,6 +1966,8 @@ class Inst {
|
|
|
1919
1966
|
return r === r0;
|
|
1920
1967
|
}
|
|
1921
1968
|
const len = this.runes.length;
|
|
1969
|
+
if (len === 0) return false;
|
|
1970
|
+
|
|
1922
1971
|
// If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search
|
|
1923
1972
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1924
1973
|
for (let j = 0; j < len; j += 2) {
|
|
@@ -1932,22 +1981,19 @@ class Inst {
|
|
|
1932
1981
|
return false; // Stop here
|
|
1933
1982
|
}
|
|
1934
1983
|
|
|
1935
|
-
//
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
}
|
|
1945
|
-
lo = m + 1;
|
|
1946
|
-
} else {
|
|
1947
|
-
hi = m;
|
|
1948
|
-
}
|
|
1984
|
+
// Branchless Binary Search (Lower Bound)
|
|
1985
|
+
// Compiles to optimal conditional move (cmov) machine code, preventing
|
|
1986
|
+
// branch mispredictions on large, chaotic Unicode arrays
|
|
1987
|
+
let base = 0;
|
|
1988
|
+
let n = len >> 1;
|
|
1989
|
+
while (n > 1) {
|
|
1990
|
+
const half = n >> 1;
|
|
1991
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
1992
|
+
n -= half;
|
|
1949
1993
|
}
|
|
1950
|
-
|
|
1994
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
1995
|
+
const m = base - 1;
|
|
1996
|
+
return m >= 0 && r <= this.runes[m << 1 | 1];
|
|
1951
1997
|
}
|
|
1952
1998
|
|
|
1953
1999
|
// matchRunePos checks whether the instruction matches (and consumes) r.
|
|
@@ -1962,6 +2008,7 @@ class Inst {
|
|
|
1962
2008
|
return r === r0 ? 0 : -1;
|
|
1963
2009
|
}
|
|
1964
2010
|
const len = this.runes.length;
|
|
2011
|
+
if (len === 0) return -1;
|
|
1965
2012
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1966
2013
|
for (let j = 0; j < len; j += 2) {
|
|
1967
2014
|
if (r < this.runes[j]) return -1;
|
|
@@ -1969,19 +2016,18 @@ class Inst {
|
|
|
1969
2016
|
}
|
|
1970
2017
|
return -1;
|
|
1971
2018
|
}
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
} else {
|
|
1981
|
-
hi = m;
|
|
1982
|
-
}
|
|
2019
|
+
|
|
2020
|
+
// Branchless Binary Search (Lower Bound)
|
|
2021
|
+
let base = 0;
|
|
2022
|
+
let n = len >> 1;
|
|
2023
|
+
while (n > 1) {
|
|
2024
|
+
const half = n >> 1;
|
|
2025
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
2026
|
+
n -= half;
|
|
1983
2027
|
}
|
|
1984
|
-
|
|
2028
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2029
|
+
const m = base - 1;
|
|
2030
|
+
return m >= 0 && r <= this.runes[m << 1 | 1] ? m : -1;
|
|
1985
2031
|
}
|
|
1986
2032
|
/**
|
|
1987
2033
|
*
|
|
@@ -2080,6 +2126,7 @@ class Queue {
|
|
|
2080
2126
|
//
|
|
2081
2127
|
// Called by RE2.doExecute.
|
|
2082
2128
|
class Machine {
|
|
2129
|
+
static THREADS_CHUNK_SIZE = 128;
|
|
2083
2130
|
static fromRE2(re2) {
|
|
2084
2131
|
const m = new Machine();
|
|
2085
2132
|
m.prog = re2.prog;
|
|
@@ -2120,15 +2167,15 @@ class Machine {
|
|
|
2120
2167
|
resetCap() {
|
|
2121
2168
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2122
2169
|
const t = this.pool[i];
|
|
2123
|
-
t.cap.fill(
|
|
2170
|
+
t.cap.fill(-1);
|
|
2124
2171
|
}
|
|
2125
2172
|
}
|
|
2126
2173
|
initNewCap(ncap) {
|
|
2127
2174
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2128
2175
|
const t = this.pool[i];
|
|
2129
|
-
t.cap = new Int32Array(ncap);
|
|
2176
|
+
t.cap = new Int32Array(ncap).fill(-1);
|
|
2130
2177
|
}
|
|
2131
|
-
this.matchcap = new Int32Array(ncap);
|
|
2178
|
+
this.matchcap = new Int32Array(ncap).fill(-1);
|
|
2132
2179
|
}
|
|
2133
2180
|
submatches() {
|
|
2134
2181
|
if (this.ncap === 0) {
|
|
@@ -2141,14 +2188,21 @@ class Machine {
|
|
|
2141
2188
|
// alloc() allocates a new thread with the given instruction.
|
|
2142
2189
|
// It uses the free pool if possible.
|
|
2143
2190
|
alloc(inst) {
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2191
|
+
if (this.poolSize === 0) {
|
|
2192
|
+
const capLen = this.matchcap.length;
|
|
2193
|
+
|
|
2194
|
+
// Bulk allocate threads in a tight loop so the V8 engine
|
|
2195
|
+
// places them adjacently in the young generation heap
|
|
2196
|
+
for (let i = 0; i < Machine.THREADS_CHUNK_SIZE; i++) {
|
|
2197
|
+
const t = new Thread();
|
|
2198
|
+
t.cap = new Int32Array(capLen);
|
|
2199
|
+
this.pool[this.poolSize++] = t;
|
|
2200
|
+
}
|
|
2151
2201
|
}
|
|
2202
|
+
|
|
2203
|
+
// Pop a thread from the top of the pool stack
|
|
2204
|
+
this.poolSize--;
|
|
2205
|
+
const t = this.pool[this.poolSize];
|
|
2152
2206
|
t.inst = inst;
|
|
2153
2207
|
return t;
|
|
2154
2208
|
}
|
|
@@ -2201,6 +2255,9 @@ class Machine {
|
|
|
2201
2255
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
|
|
2202
2256
|
break;
|
|
2203
2257
|
}
|
|
2258
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2259
|
+
break;
|
|
2260
|
+
}
|
|
2204
2261
|
if (this.matched) {
|
|
2205
2262
|
break;
|
|
2206
2263
|
}
|
|
@@ -2278,6 +2335,9 @@ class Machine {
|
|
|
2278
2335
|
while (true) {
|
|
2279
2336
|
if (runq.isEmpty()) {
|
|
2280
2337
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
|
|
2338
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2339
|
+
break;
|
|
2340
|
+
}
|
|
2281
2341
|
}
|
|
2282
2342
|
if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2283
2343
|
// Spawn Lookbehind threads BEFORE the main pattern
|
|
@@ -2393,78 +2453,83 @@ class Machine {
|
|
|
2393
2453
|
runq.clear();
|
|
2394
2454
|
}
|
|
2395
2455
|
add(q, pc, pos, cap, cond, t) {
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
t = this.add(q, inst.arg, pos, cap, cond, t);
|
|
2411
|
-
break;
|
|
2412
|
-
case Inst.EMPTY_WIDTH:
|
|
2413
|
-
if ((inst.arg & ~cond) === 0) {
|
|
2414
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2415
|
-
}
|
|
2416
|
-
break;
|
|
2417
|
-
case Inst.NOP:
|
|
2418
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2419
|
-
break;
|
|
2420
|
-
case Inst.CAPTURE:
|
|
2421
|
-
if (inst.arg < this.ncap) {
|
|
2422
|
-
const opos = cap[inst.arg];
|
|
2423
|
-
cap[inst.arg] = pos;
|
|
2424
|
-
this.add(q, inst.out, pos, cap, cond, null);
|
|
2425
|
-
cap[inst.arg] = opos;
|
|
2426
|
-
} else {
|
|
2456
|
+
while (true) {
|
|
2457
|
+
if (pc === 0) {
|
|
2458
|
+
return t;
|
|
2459
|
+
}
|
|
2460
|
+
if (q.contains(pc)) {
|
|
2461
|
+
return t;
|
|
2462
|
+
}
|
|
2463
|
+
const d = q.add(pc);
|
|
2464
|
+
const inst = this.prog.inst[pc];
|
|
2465
|
+
switch (inst.op) {
|
|
2466
|
+
case Inst.FAIL:
|
|
2467
|
+
return t;
|
|
2468
|
+
case Inst.ALT:
|
|
2469
|
+
case Inst.ALT_MATCH:
|
|
2427
2470
|
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
case Inst.LB_CHECK:
|
|
2435
|
-
if (inst.lb > 0) {
|
|
2436
|
-
// Positive Lookbehind
|
|
2437
|
-
if (this.lbTable[inst.lb] === pos) {
|
|
2438
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2471
|
+
pc = inst.arg; // Flattened tail recursion
|
|
2472
|
+
continue;
|
|
2473
|
+
case Inst.EMPTY_WIDTH:
|
|
2474
|
+
if ((inst.arg & ~cond) === 0) {
|
|
2475
|
+
pc = inst.out; // Flattened tail recursion
|
|
2476
|
+
continue;
|
|
2439
2477
|
}
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
}
|
|
2455
|
-
if (this.ncap > 0 && t.cap !== cap) {
|
|
2456
|
-
// Direct assignment utilizing Typed Array performance
|
|
2457
|
-
for (let c = 0; c < this.ncap; c++) {
|
|
2458
|
-
t.cap[c] = cap[c];
|
|
2478
|
+
return t;
|
|
2479
|
+
case Inst.NOP:
|
|
2480
|
+
pc = inst.out; // Flattened tail recursion
|
|
2481
|
+
continue;
|
|
2482
|
+
case Inst.CAPTURE:
|
|
2483
|
+
if (inst.arg < this.ncap) {
|
|
2484
|
+
const opos = cap[inst.arg];
|
|
2485
|
+
cap[inst.arg] = pos;
|
|
2486
|
+
this.add(q, inst.out, pos, cap, cond, null);
|
|
2487
|
+
cap[inst.arg] = opos;
|
|
2488
|
+
return t;
|
|
2489
|
+
} else {
|
|
2490
|
+
pc = inst.out; // Flattened tail recursion
|
|
2491
|
+
continue;
|
|
2459
2492
|
}
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2493
|
+
case Inst.LB_WRITE:
|
|
2494
|
+
this.lbTable[Math.abs(inst.lb)] = pos;
|
|
2495
|
+
pc = inst.out;
|
|
2496
|
+
continue;
|
|
2497
|
+
case Inst.LB_CHECK:
|
|
2498
|
+
if (inst.lb > 0) {
|
|
2499
|
+
// Positive Lookbehind
|
|
2500
|
+
if (this.lbTable[inst.lb] === pos) {
|
|
2501
|
+
pc = inst.out; // Flattened tail recursion
|
|
2502
|
+
continue;
|
|
2503
|
+
}
|
|
2504
|
+
} else if (this.lbTable[-inst.lb] !== pos) {
|
|
2505
|
+
// Negative Lookbehind
|
|
2506
|
+
pc = inst.out; // Flattened tail recursion
|
|
2507
|
+
continue;
|
|
2508
|
+
}
|
|
2509
|
+
return t;
|
|
2510
|
+
case Inst.MATCH:
|
|
2511
|
+
case Inst.RUNE:
|
|
2512
|
+
case Inst.RUNE1:
|
|
2513
|
+
case Inst.RUNE_ANY:
|
|
2514
|
+
case Inst.RUNE_ANY_NOT_NL:
|
|
2515
|
+
if (t === null) {
|
|
2516
|
+
t = this.alloc(inst);
|
|
2517
|
+
} else {
|
|
2518
|
+
t.inst = inst;
|
|
2519
|
+
}
|
|
2520
|
+
if (this.ncap > 0 && t.cap !== cap) {
|
|
2521
|
+
// Direct assignment utilizing Typed Array performance
|
|
2522
|
+
for (let c = 0; c < this.ncap; c++) {
|
|
2523
|
+
t.cap[c] = cap[c];
|
|
2524
|
+
}
|
|
2525
|
+
}
|
|
2526
|
+
q.denseThreads[d] = t;
|
|
2527
|
+
t = null;
|
|
2528
|
+
return t;
|
|
2529
|
+
default:
|
|
2530
|
+
throw new Error('unhandled');
|
|
2531
|
+
}
|
|
2466
2532
|
}
|
|
2467
|
-
return t;
|
|
2468
2533
|
}
|
|
2469
2534
|
}
|
|
2470
2535
|
|
|
@@ -2492,8 +2557,15 @@ class DFAState {
|
|
|
2492
2557
|
this.nfaStates = nfaStates; // Int32Array of Instruction PCs
|
|
2493
2558
|
this.isMatch = isMatch; // Boolean
|
|
2494
2559
|
this.matchIDs = matchIDs; // Array of integers indicating which Set patterns matched
|
|
2495
|
-
|
|
2496
|
-
|
|
2560
|
+
|
|
2561
|
+
// Latin-1 (Unicode.MAX_LATIN1 + 1) flat arrays for blisteringly fast O(1) lookups
|
|
2562
|
+
// completely covering standard English, European languages, and 1-byte encodings.
|
|
2563
|
+
this.nextLatin1 = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2564
|
+
this.nextLatin1Anchored = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2565
|
+
// 2 arrays used as hash map for V8 optimization (N is small number, so O(n) faster than Map O(1))
|
|
2566
|
+
this.transKeys = [];
|
|
2567
|
+
this.transVals = [];
|
|
2568
|
+
this.lastSeen = 0; // Track when this state was last used for LRU eviction
|
|
2497
2569
|
}
|
|
2498
2570
|
}
|
|
2499
2571
|
class DFA {
|
|
@@ -2506,6 +2578,7 @@ class DFA {
|
|
|
2506
2578
|
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2507
2579
|
this.cacheClears = 0; // Track thrashing
|
|
2508
2580
|
this.failed = false; // mark if DFA cannot work with provided prog
|
|
2581
|
+
this.clock = 0; // Global clock for LRU eviction
|
|
2509
2582
|
}
|
|
2510
2583
|
|
|
2511
2584
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2565,6 +2638,7 @@ class DFA {
|
|
|
2565
2638
|
for (let i = 0; i < bucket.length; i++) {
|
|
2566
2639
|
const state = bucket[i];
|
|
2567
2640
|
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2641
|
+
state.lastSeen = ++this.clock;
|
|
2568
2642
|
return state;
|
|
2569
2643
|
}
|
|
2570
2644
|
}
|
|
@@ -2577,40 +2651,99 @@ class DFA {
|
|
|
2577
2651
|
if (this.failed) return null;
|
|
2578
2652
|
|
|
2579
2653
|
// Safety: prevent memory exhaustion from state explosion
|
|
2580
|
-
// We
|
|
2654
|
+
// We prune the cache to keep the newest 50%
|
|
2581
2655
|
if (this.stateCount >= this.stateLimit) {
|
|
2582
|
-
this.stateCache.clear();
|
|
2583
|
-
this.stateCount = 0;
|
|
2584
|
-
this.startState = null;
|
|
2585
2656
|
this.cacheClears++;
|
|
2586
2657
|
|
|
2587
2658
|
// If this regex causes continuous cache thrashing, permanently fall back to NFA
|
|
2588
2659
|
// to avoid spending CPU cycles constantly rebuilding the DFA tree.
|
|
2589
2660
|
if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
|
|
2590
2661
|
this.failed = true;
|
|
2662
|
+
this.stateCache.clear();
|
|
2663
|
+
this.stateCount = 0;
|
|
2664
|
+
this.startState = null;
|
|
2665
|
+
return null;
|
|
2666
|
+
}
|
|
2667
|
+
this.evictCache();
|
|
2668
|
+
|
|
2669
|
+
// After eviction, the bucket reference might be stale or empty.
|
|
2670
|
+
// We must re-fetch or re-create the bucket.
|
|
2671
|
+
bucket = this.stateCache.get(hash);
|
|
2672
|
+
if (!bucket) {
|
|
2673
|
+
bucket = [];
|
|
2674
|
+
this.stateCache.set(hash, bucket);
|
|
2591
2675
|
}
|
|
2592
|
-
return null;
|
|
2593
2676
|
}
|
|
2594
2677
|
|
|
2595
2678
|
// State not found, create it and add to bucket
|
|
2596
2679
|
const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.matchIDs);
|
|
2680
|
+
state.lastSeen = ++this.clock;
|
|
2597
2681
|
bucket.push(state);
|
|
2598
2682
|
this.stateCount++;
|
|
2599
2683
|
return state;
|
|
2600
2684
|
}
|
|
2685
|
+
evictCache() {
|
|
2686
|
+
const allStates = [];
|
|
2687
|
+
for (const bucket of this.stateCache.values()) {
|
|
2688
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2689
|
+
allStates.push(bucket[i]);
|
|
2690
|
+
}
|
|
2691
|
+
}
|
|
2692
|
+
|
|
2693
|
+
// Sort ascending by lastSeen (oldest first)
|
|
2694
|
+
allStates.sort((a, b) => a.lastSeen - b.lastSeen);
|
|
2695
|
+
|
|
2696
|
+
// Keep the newest 50%
|
|
2697
|
+
const keepCount = Math.max(1, Math.floor(this.stateLimit / 2));
|
|
2698
|
+
const startIndex = allStates.length - keepCount;
|
|
2699
|
+
const survivorsArray = allStates.slice(startIndex);
|
|
2700
|
+
const survivors = new Set(survivorsArray);
|
|
2701
|
+
this.stateCache.clear();
|
|
2702
|
+
this.stateCount = 0;
|
|
2703
|
+
for (let i = 0; i < survivorsArray.length; i++) {
|
|
2704
|
+
const state = survivorsArray[i];
|
|
2705
|
+
|
|
2706
|
+
// Sever ties to all states to prevent memory leaks and dangling pointers
|
|
2707
|
+
state.nextLatin1.fill(null);
|
|
2708
|
+
state.nextLatin1Anchored.fill(null);
|
|
2709
|
+
// zero-allocation cleanup
|
|
2710
|
+
state.transKeys.length = 0;
|
|
2711
|
+
state.transVals.length = 0;
|
|
2712
|
+
const hash = hashPCs(state.nfaStates);
|
|
2713
|
+
let bucket = this.stateCache.get(hash);
|
|
2714
|
+
if (!bucket) {
|
|
2715
|
+
bucket = [];
|
|
2716
|
+
this.stateCache.set(hash, bucket);
|
|
2717
|
+
}
|
|
2718
|
+
bucket.push(state);
|
|
2719
|
+
this.stateCount++;
|
|
2720
|
+
}
|
|
2721
|
+
|
|
2722
|
+
// Start state must either be preserved or nullified so it gets re-created
|
|
2723
|
+
if (this.startState && !survivors.has(this.startState)) {
|
|
2724
|
+
this.startState = null;
|
|
2725
|
+
}
|
|
2726
|
+
}
|
|
2601
2727
|
|
|
2602
2728
|
// Compute the next DFA state given a current state and a character
|
|
2603
2729
|
step(state, charCode, anchor) {
|
|
2604
|
-
// OPTIMIZATION:
|
|
2605
|
-
if (
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
return next;
|
|
2730
|
+
// OPTIMIZATION: Latin-1 Array Fast-Path
|
|
2731
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2732
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2733
|
+
const next = state.nextLatin1[charCode];
|
|
2734
|
+
if (next !== null) return next;
|
|
2735
|
+
} else {
|
|
2736
|
+
const next = state.nextLatin1Anchored[charCode];
|
|
2737
|
+
if (next !== null) return next;
|
|
2609
2738
|
}
|
|
2610
2739
|
} else {
|
|
2740
|
+
// Dense Array Linear Search fallback for Runes > 255
|
|
2611
2741
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2612
|
-
|
|
2613
|
-
|
|
2742
|
+
// get [key] -> nextState
|
|
2743
|
+
const keys = state.transKeys;
|
|
2744
|
+
const len = keys.length;
|
|
2745
|
+
for (let i = 0; i < len; i++) {
|
|
2746
|
+
if (keys[i] === key) return state.transVals[i];
|
|
2614
2747
|
}
|
|
2615
2748
|
}
|
|
2616
2749
|
const nextPCs = [];
|
|
@@ -2627,11 +2760,17 @@ class DFA {
|
|
|
2627
2760
|
const nextState = this.getState(nextPCs);
|
|
2628
2761
|
|
|
2629
2762
|
// Cache the result
|
|
2630
|
-
if (
|
|
2631
|
-
|
|
2763
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2764
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2765
|
+
state.nextLatin1[charCode] = nextState;
|
|
2766
|
+
} else {
|
|
2767
|
+
state.nextLatin1Anchored[charCode] = nextState;
|
|
2768
|
+
}
|
|
2632
2769
|
} else {
|
|
2633
2770
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2634
|
-
|
|
2771
|
+
// store key -> nextState
|
|
2772
|
+
state.transKeys.push(key);
|
|
2773
|
+
state.transVals.push(nextState);
|
|
2635
2774
|
}
|
|
2636
2775
|
return nextState;
|
|
2637
2776
|
}
|
|
@@ -2664,10 +2803,11 @@ class DFA {
|
|
|
2664
2803
|
if (width === 0) {
|
|
2665
2804
|
break;
|
|
2666
2805
|
}
|
|
2667
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2806
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2668
2807
|
|
|
2669
2808
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
2670
2809
|
if (currentState === null) return null;
|
|
2810
|
+
currentState.lastSeen = ++this.clock;
|
|
2671
2811
|
if (currentState.isMatch) {
|
|
2672
2812
|
if (anchor === RE2Flags.ANCHOR_BOTH) {
|
|
2673
2813
|
if (i + width === endPos) return true;
|
|
@@ -2715,9 +2855,10 @@ class DFA {
|
|
|
2715
2855
|
const rune = r >> 3;
|
|
2716
2856
|
const width = r & 7;
|
|
2717
2857
|
if (width === 0) break;
|
|
2718
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2858
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2719
2859
|
if (currentState === null) return null; // Bailout to NFA
|
|
2720
2860
|
|
|
2861
|
+
currentState.lastSeen = ++this.clock;
|
|
2721
2862
|
i += width;
|
|
2722
2863
|
checkMatch(currentState, i);
|
|
2723
2864
|
if (currentState.nfaStates.length === 0) {
|
|
@@ -2755,7 +2896,7 @@ class BitState {
|
|
|
2755
2896
|
// Bitwise shift (>>> 5) instead of Math.floor( / 32)
|
|
2756
2897
|
const visitedSize = prog.numInst() * (end + 1) + VISITED_BITS - 1 >>> 5;
|
|
2757
2898
|
if (this.visited.length < visitedSize) {
|
|
2758
|
-
this.visited = new Uint32Array(
|
|
2899
|
+
this.visited = new Uint32Array(visitedSize);
|
|
2759
2900
|
} else {
|
|
2760
2901
|
this.visited.fill(0, 0, visitedSize);
|
|
2761
2902
|
}
|
|
@@ -2841,11 +2982,12 @@ class BitState {
|
|
|
2841
2982
|
const outInst = re2.prog.getInst(inst.out);
|
|
2842
2983
|
if (Inst.isRuneOp(outInst.op)) {
|
|
2843
2984
|
this.push(re2, inst.arg, currentPos, false);
|
|
2844
|
-
currentPc = inst.
|
|
2985
|
+
currentPc = inst.arg;
|
|
2986
|
+
currentPos = this.end;
|
|
2845
2987
|
continue;
|
|
2846
2988
|
}
|
|
2847
2989
|
this.push(re2, inst.out, this.end, false);
|
|
2848
|
-
currentPc = inst.
|
|
2990
|
+
currentPc = inst.out;
|
|
2849
2991
|
continue;
|
|
2850
2992
|
}
|
|
2851
2993
|
case Inst.RUNE:
|
|
@@ -2926,6 +3068,11 @@ class BitState {
|
|
|
2926
3068
|
if (currentPos === this.end) return true;
|
|
2927
3069
|
break;
|
|
2928
3070
|
}
|
|
3071
|
+
case Inst.LB_WRITE:
|
|
3072
|
+
case Inst.LB_CHECK:
|
|
3073
|
+
{
|
|
3074
|
+
throw new RE2JSInternalException('Backtracker cannot evaluate Lookbehind instructions');
|
|
3075
|
+
}
|
|
2929
3076
|
default:
|
|
2930
3077
|
{
|
|
2931
3078
|
throw new RE2JSInternalException('bad inst');
|
|
@@ -3362,6 +3509,10 @@ class OnePass {
|
|
|
3362
3509
|
switch (inst.op) {
|
|
3363
3510
|
case Inst.MATCH:
|
|
3364
3511
|
{
|
|
3512
|
+
// Verify ANCHOR_BOTH constraint before accepting the match
|
|
3513
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) {
|
|
3514
|
+
return null;
|
|
3515
|
+
}
|
|
3365
3516
|
matched = true;
|
|
3366
3517
|
if (matchcap.length > 0) {
|
|
3367
3518
|
matchcap[0] = 0;
|
|
@@ -3792,6 +3943,88 @@ class Regexp {
|
|
|
3792
3943
|
}
|
|
3793
3944
|
}
|
|
3794
3945
|
|
|
3946
|
+
/**
 * Multi-pattern matcher based on the Aho-Corasick automaton.
 *
 * A trie is built over all words and then decorated with failure links, so a
 * text range can be scanned once, left to right, to decide whether ANY of the
 * words occurs inside it.
 *
 * State lives in three parallel arrays indexed by node id (0 is the root):
 *  - `next[node]`  prototype-less dictionary: symbol -> child node id
 *  - `fail[node]`  node to fall back to when `next[node]` has no edge for a symbol
 *  - `match[node]` true when a word ends at this node or at any node reachable
 *                  through its failure chain
 */
class AhoCorasick {
  /**
   * @param {Iterable<ArrayLike<number>>} wordArrays words to search for, each
   *   given as a sequence of numeric symbols (the search methods below feed
   *   UTF-16 code units or raw bytes, so words must use the same alphabet as
   *   the search method that will be called).
   */
  constructor(wordArrays) {
    this.next = [Object.create(null)];
    this.fail = [0];
    this.match = [false];

    // Build the trie: one node per distinct prefix of any word.
    for (const word of wordArrays) {
      let node = 0;
      for (let i = 0; i < word.length; i++) {
        const val = word[i];
        if (!(val in this.next[node])) {
          this.next.push(Object.create(null));
          this.fail.push(0);
          this.match.push(false);
          this.next[node][val] = this.next.length - 1;
        }
        node = this.next[node][val];
      }
      // An empty word ends at the root, i.e. sets match[0].
      this.match[node] = true;
    }

    // Build failure links breadth-first. The queue is consumed through a head
    // cursor rather than Array.prototype.shift(), which would re-index the
    // array on every dequeue.
    const queue = [];
    const root = this.next[0];
    for (const val of Object.keys(root)) {
      const child = root[val];
      this.fail[child] = 0; // depth-1 nodes always fall back to the root
      queue.push(child);
    }
    for (let head = 0; head < queue.length; head++) {
      const curr = queue[head];
      const edges = this.next[curr];
      for (const val of Object.keys(edges)) {
        const child = edges[val];
        // Walk the parent's failure chain until a node with an edge for `val`
        // is found, or the root is reached.
        let failNode = this.fail[curr];
        while (failNode !== 0 && !(val in this.next[failNode])) {
          failNode = this.fail[failNode];
        }
        this.fail[child] = val in this.next[failNode] ? this.next[failNode][val] : 0;
        // Inherit the match flag so a word that is a proper suffix of the
        // path to `child` is reported without walking the chain at search time.
        this.match[child] = this.match[child] || this.match[this.fail[child]];
        queue.push(child);
      }
    }
  }

  /**
   * Reports whether any word occurs in the UTF-16 code units of `charSeq`
   * within the half-open index range [start, end).
   *
   * @param {string} charSeq text to scan; read through `charCodeAt`
   * @param {number} start index of the first code unit to scan
   * @param {number} end index one past the last code unit to scan
   * @returns {boolean} true as soon as one word is found, false otherwise
   */
  searchUTF16(charSeq, start, end) {
    // An empty word occurs in every range, including an empty one.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      const val = charSeq.charCodeAt(i);
      // Follow failure links until some node can consume `val` (or the root).
      while (node !== 0 && !(val in this.next[node])) {
        node = this.fail[node];
      }
      if (val in this.next[node]) {
        node = this.next[node][val];
      }
      if (this.match[node]) return true;
    }
    return false;
  }

  /**
   * Reports whether any word occurs in `bytes` within the half-open index
   * range [start, end).
   *
   * @param {ArrayLike<number>} bytes symbols to scan; read through `bytes[i]`
   * @param {number} start index of the first element to scan
   * @param {number} end index one past the last element to scan
   * @returns {boolean} true as soon as one word is found, false otherwise
   */
  searchUTF8(bytes, start, end) {
    // An empty word occurs in every range, including an empty one.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      const val = bytes[i];
      // Follow failure links until some node can consume `val` (or the root).
      while (node !== 0 && !(val in this.next[node])) {
        node = this.fail[node];
      }
      if (val in this.next[node]) {
        node = this.next[node][val];
      }
      if (this.match[node]) return true;
    }
    return false;
  }
}
|
|
3795
4028
|
class Prefilter {
|
|
3796
4029
|
static Type = {
|
|
3797
4030
|
NONE: 0,
|
|
@@ -3804,6 +4037,8 @@ class Prefilter {
|
|
|
3804
4037
|
this.subs = [];
|
|
3805
4038
|
this.str = '';
|
|
3806
4039
|
this.bytes = null;
|
|
4040
|
+
this.ac16 = null;
|
|
4041
|
+
this.ac8 = null;
|
|
3807
4042
|
}
|
|
3808
4043
|
eval(input, pos) {
|
|
3809
4044
|
switch (this.type) {
|
|
@@ -3817,6 +4052,10 @@ class Prefilter {
|
|
|
3817
4052
|
}
|
|
3818
4053
|
return true;
|
|
3819
4054
|
case Prefilter.Type.OR:
|
|
4055
|
+
// Exploit Aho-Corasick if it was successfully built
|
|
4056
|
+
if (this.ac16 && this.ac8) {
|
|
4057
|
+
return input.hasAnyString(this, pos);
|
|
4058
|
+
}
|
|
3820
4059
|
for (let i = 0; i < this.subs.length; i++) {
|
|
3821
4060
|
if (this.subs[i].eval(input, pos)) return true;
|
|
3822
4061
|
}
|
|
@@ -3907,7 +4146,9 @@ class PrefilterTree {
|
|
|
3907
4146
|
const s = PrefilterTree.simplify(sub);
|
|
3908
4147
|
if (s.type !== Prefilter.Type.NONE) {
|
|
3909
4148
|
if (s.type === Prefilter.Type.AND) {
|
|
3910
|
-
|
|
4149
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4150
|
+
newSubs.push(s.subs[j]);
|
|
4151
|
+
}
|
|
3911
4152
|
} else {
|
|
3912
4153
|
newSubs.push(s);
|
|
3913
4154
|
}
|
|
@@ -3949,6 +4190,27 @@ class PrefilterTree {
|
|
|
3949
4190
|
}
|
|
3950
4191
|
}
|
|
3951
4192
|
pf.subs = uniqueSubs;
|
|
4193
|
+
|
|
4194
|
+
// Build an Aho-Corasick automaton if all children are exact matches
|
|
4195
|
+
let allExact = true;
|
|
4196
|
+
for (const sub of uniqueSubs) {
|
|
4197
|
+
if (sub.type !== Prefilter.Type.EXACT) {
|
|
4198
|
+
allExact = false;
|
|
4199
|
+
break;
|
|
4200
|
+
}
|
|
4201
|
+
}
|
|
4202
|
+
if (allExact && uniqueSubs.length > 1) {
|
|
4203
|
+
const words16 = uniqueSubs.map(s => {
|
|
4204
|
+
const arr = [];
|
|
4205
|
+
for (let i = 0; i < s.str.length; i++) {
|
|
4206
|
+
arr.push(s.str.charCodeAt(i));
|
|
4207
|
+
}
|
|
4208
|
+
return arr;
|
|
4209
|
+
});
|
|
4210
|
+
pf.ac16 = new AhoCorasick(words16);
|
|
4211
|
+
const words8 = uniqueSubs.map(s => s.bytes);
|
|
4212
|
+
pf.ac8 = new AhoCorasick(words8);
|
|
4213
|
+
}
|
|
3952
4214
|
return pf;
|
|
3953
4215
|
}
|
|
3954
4216
|
return pf;
|
|
@@ -4475,7 +4737,9 @@ class Simplify {
|
|
|
4475
4737
|
// Flatten nested concatenations
|
|
4476
4738
|
if (nsub.op === Regexp.Op.CONCAT) {
|
|
4477
4739
|
changed = true;
|
|
4478
|
-
|
|
4740
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4741
|
+
newSubs.push(nsub.subs[j]);
|
|
4742
|
+
}
|
|
4479
4743
|
continue;
|
|
4480
4744
|
}
|
|
4481
4745
|
} else if (re.op === Regexp.Op.ALTERNATE) {
|
|
@@ -4487,7 +4751,9 @@ class Simplify {
|
|
|
4487
4751
|
// Flatten nested alternations
|
|
4488
4752
|
if (nsub.op === Regexp.Op.ALTERNATE) {
|
|
4489
4753
|
changed = true;
|
|
4490
|
-
|
|
4754
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4755
|
+
newSubs.push(nsub.subs[j]);
|
|
4756
|
+
}
|
|
4491
4757
|
continue;
|
|
4492
4758
|
}
|
|
4493
4759
|
}
|
|
@@ -5497,7 +5763,10 @@ class Parser {
|
|
|
5497
5763
|
return t.pop();
|
|
5498
5764
|
}
|
|
5499
5765
|
static concatRunes(x, y) {
|
|
5500
|
-
|
|
5766
|
+
for (let i = 0; i < y.length; i++) {
|
|
5767
|
+
x.push(y[i]);
|
|
5768
|
+
}
|
|
5769
|
+
return x;
|
|
5501
5770
|
}
|
|
5502
5771
|
constructor(wholeRegexp, flags = 0) {
|
|
5503
5772
|
this.wholeRegexp = wholeRegexp;
|
|
@@ -5533,8 +5802,8 @@ class Parser {
|
|
|
5533
5802
|
return re;
|
|
5534
5803
|
}
|
|
5535
5804
|
reuse(re) {
|
|
5536
|
-
if (this.height !== null &&
|
|
5537
|
-
|
|
5805
|
+
if (this.height !== null && this.height.has(re)) {
|
|
5806
|
+
this.height.delete(re);
|
|
5538
5807
|
}
|
|
5539
5808
|
if (re.subs !== null && re.subs.length > 0) {
|
|
5540
5809
|
re.subs[0] = this.free;
|
|
@@ -5566,20 +5835,20 @@ class Parser {
|
|
|
5566
5835
|
if (n <= 0) {
|
|
5567
5836
|
n = 1;
|
|
5568
5837
|
}
|
|
5569
|
-
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
5838
|
+
if (n > Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5570
5839
|
this.repeats = Parser.MAX_SIZE;
|
|
5571
5840
|
} else {
|
|
5572
5841
|
this.repeats *= n;
|
|
5573
5842
|
}
|
|
5574
5843
|
}
|
|
5575
|
-
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
5844
|
+
if (this.numRegexp < Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5576
5845
|
return;
|
|
5577
5846
|
}
|
|
5578
5847
|
|
|
5579
5848
|
// We need to start tracking size.
|
|
5580
5849
|
// Make the map and belatedly populate it
|
|
5581
5850
|
// with info about everything we've constructed so far.
|
|
5582
|
-
this.size =
|
|
5851
|
+
this.size = new Map();
|
|
5583
5852
|
for (let reEx of this.stack) {
|
|
5584
5853
|
this.checkSize(reEx);
|
|
5585
5854
|
}
|
|
@@ -5590,8 +5859,8 @@ class Parser {
|
|
|
5590
5859
|
}
|
|
5591
5860
|
calcSize(re, force = false) {
|
|
5592
5861
|
if (!force && this.size !== null) {
|
|
5593
|
-
if (
|
|
5594
|
-
return this.size
|
|
5862
|
+
if (this.size.has(re)) {
|
|
5863
|
+
return this.size.get(re);
|
|
5595
5864
|
}
|
|
5596
5865
|
}
|
|
5597
5866
|
let size = 0;
|
|
@@ -5651,9 +5920,9 @@ class Parser {
|
|
|
5651
5920
|
}
|
|
5652
5921
|
size = Math.max(1, size);
|
|
5653
5922
|
if (this.size === null) {
|
|
5654
|
-
this.size =
|
|
5923
|
+
this.size = new Map();
|
|
5655
5924
|
}
|
|
5656
|
-
this.size
|
|
5925
|
+
this.size.set(re, size);
|
|
5657
5926
|
return size;
|
|
5658
5927
|
}
|
|
5659
5928
|
checkHeight(re) {
|
|
@@ -5661,7 +5930,7 @@ class Parser {
|
|
|
5661
5930
|
return;
|
|
5662
5931
|
}
|
|
5663
5932
|
if (this.height === null) {
|
|
5664
|
-
this.height =
|
|
5933
|
+
this.height = new Map();
|
|
5665
5934
|
for (let reEx of this.stack) {
|
|
5666
5935
|
this.checkHeight(reEx);
|
|
5667
5936
|
}
|
|
@@ -5672,8 +5941,8 @@ class Parser {
|
|
|
5672
5941
|
}
|
|
5673
5942
|
calcHeight(re, force = false) {
|
|
5674
5943
|
if (!force && this.height !== null) {
|
|
5675
|
-
if (
|
|
5676
|
-
return this.height
|
|
5944
|
+
if (this.height.has(re)) {
|
|
5945
|
+
return this.height.get(re);
|
|
5677
5946
|
}
|
|
5678
5947
|
}
|
|
5679
5948
|
let h = 1;
|
|
@@ -5684,9 +5953,9 @@ class Parser {
|
|
|
5684
5953
|
}
|
|
5685
5954
|
}
|
|
5686
5955
|
if (this.height === null) {
|
|
5687
|
-
this.height =
|
|
5956
|
+
this.height = new Map();
|
|
5688
5957
|
}
|
|
5689
|
-
this.height
|
|
5958
|
+
this.height.set(re, h);
|
|
5690
5959
|
return h;
|
|
5691
5960
|
}
|
|
5692
5961
|
|
|
@@ -7808,7 +8077,7 @@ class TranslateRegExpString {
|
|
|
7808
8077
|
changed = true;
|
|
7809
8078
|
continue;
|
|
7810
8079
|
} else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
7811
|
-
if (i + 3
|
|
8080
|
+
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
7812
8081
|
result += '(?P<';
|
|
7813
8082
|
i += 3;
|
|
7814
8083
|
changed = true;
|