re2js 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +54 -54
- package/build/index.cjs.cjs +432 -159
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +3 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +432 -159
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +432 -159
- package/build/index.umd.js.map +1 -1
- package/package.json +7 -7
package/build/index.umd.js
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.
|
|
6
|
-
* @author
|
|
5
|
+
* @version v2.2.2
|
|
6
|
+
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
9
9
|
* @license MIT
|
|
@@ -82,6 +82,8 @@
|
|
|
82
82
|
const ASCII_SIZE = 128;
|
|
83
83
|
const ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
|
|
84
84
|
const ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
|
|
85
|
+
// The highest legal Basic Multilingual Plane (BMP) value.
|
|
86
|
+
const MAX_BMP = 0xffff;
|
|
85
87
|
for (let i = 0; i < ASCII_SIZE; i++) {
|
|
86
88
|
if (i >= 97 && i <= 122) {
|
|
87
89
|
// a-z
|
|
@@ -105,11 +107,13 @@
|
|
|
105
107
|
static toUpperCase(codepoint) {
|
|
106
108
|
if (codepoint < ASCII_SIZE) return ASCII_TO_UPPER[codepoint];
|
|
107
109
|
const s = String.fromCodePoint(codepoint).toUpperCase();
|
|
108
|
-
|
|
110
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
111
|
+
if (s.length > expectedLen) {
|
|
109
112
|
return codepoint;
|
|
110
113
|
}
|
|
111
114
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toLowerCase();
|
|
112
|
-
|
|
115
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
116
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
113
117
|
return codepoint;
|
|
114
118
|
}
|
|
115
119
|
return s.codePointAt(0);
|
|
@@ -120,11 +124,13 @@
|
|
|
120
124
|
static toLowerCase(codepoint) {
|
|
121
125
|
if (codepoint < ASCII_SIZE) return ASCII_TO_LOWER[codepoint];
|
|
122
126
|
const s = String.fromCodePoint(codepoint).toLowerCase();
|
|
123
|
-
|
|
127
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
128
|
+
if (s.length > expectedLen) {
|
|
124
129
|
return codepoint;
|
|
125
130
|
}
|
|
126
131
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toUpperCase();
|
|
127
|
-
|
|
132
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
133
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
128
134
|
return codepoint;
|
|
129
135
|
}
|
|
130
136
|
return s.codePointAt(0);
|
|
@@ -226,7 +232,7 @@
|
|
|
226
232
|
static _CASE_ORBIT = null;
|
|
227
233
|
static get CASE_ORBIT() {
|
|
228
234
|
if (!this._CASE_ORBIT) {
|
|
229
|
-
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+
|
|
235
|
+
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+B');
|
|
230
236
|
}
|
|
231
237
|
return this._CASE_ORBIT;
|
|
232
238
|
}
|
|
@@ -598,11 +604,16 @@
|
|
|
598
604
|
// to compare it to |r2|.
|
|
599
605
|
// -1 is interpreted as the end-of-file mark.
|
|
600
606
|
static equalsIgnoreCase(r1, r2) {
|
|
601
|
-
// Runes already match
|
|
602
|
-
if (r1
|
|
607
|
+
// Runes already match
|
|
608
|
+
if (r1 === r2) {
|
|
603
609
|
return true;
|
|
604
610
|
}
|
|
605
611
|
|
|
612
|
+
// Safely fail if either is EOF (and they didn't explicitly match above)
|
|
613
|
+
if (r1 < 0 || r2 < 0) {
|
|
614
|
+
return false;
|
|
615
|
+
}
|
|
616
|
+
|
|
606
617
|
// Fast path for the common case where both runes are ASCII characters.
|
|
607
618
|
// Coerces both runes to lowercase if applicable.
|
|
608
619
|
if (r1 <= this.MAX_ASCII && r2 <= this.MAX_ASCII) {
|
|
@@ -855,7 +866,7 @@
|
|
|
855
866
|
// Encoding[(Encoding['UTF_16'] = 0)] = 'UTF_16'
|
|
856
867
|
// Encoding[(Encoding['UTF_8'] = 1)] = 'UTF_8'
|
|
857
868
|
const createEnum = (values = [], initNum = 0) => {
|
|
858
|
-
const enumObject =
|
|
869
|
+
const enumObject = Object.create(null);
|
|
859
870
|
for (let i = 0; i < values.length; i++) {
|
|
860
871
|
const val = values[i];
|
|
861
872
|
const keyVal = initNum + i;
|
|
@@ -997,6 +1008,9 @@
|
|
|
997
1008
|
hasString() {
|
|
998
1009
|
return false;
|
|
999
1010
|
}
|
|
1011
|
+
hasAnyString() {
|
|
1012
|
+
return false;
|
|
1013
|
+
}
|
|
1000
1014
|
|
|
1001
1015
|
// Helper for the exact-literal fast-path execution router
|
|
1002
1016
|
prefixLength() {
|
|
@@ -1022,6 +1036,13 @@
|
|
|
1022
1036
|
return idx !== -1 && idx <= this.end - target.length;
|
|
1023
1037
|
}
|
|
1024
1038
|
|
|
1039
|
+
// Executes a high-speed, single-pass search for multiple literal strings
|
|
1040
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1041
|
+
hasAnyString(prefilter, pos) {
|
|
1042
|
+
if (!prefilter.ac8) return false;
|
|
1043
|
+
return prefilter.ac8.searchUTF8(this.bytes, this.start + pos, this.end);
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1025
1046
|
// Returns the rune at the specified index; the units are
|
|
1026
1047
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1027
1048
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1040,17 +1061,23 @@
|
|
|
1040
1061
|
return c << 3 | 1;
|
|
1041
1062
|
} else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
|
|
1042
1063
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1064
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1043
1065
|
const rune = (c & 0x1f) << 6 | c1 & 0x3f;
|
|
1044
1066
|
return rune << 3 | 2;
|
|
1045
1067
|
} else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
|
|
1046
1068
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1069
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1047
1070
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1071
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1048
1072
|
const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
|
|
1049
1073
|
return rune << 3 | 3;
|
|
1050
1074
|
} else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
|
|
1051
1075
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1076
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1052
1077
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1078
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1053
1079
|
const c3 = this.bytes[pos + 3] & 0xff;
|
|
1080
|
+
if ((c3 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1054
1081
|
const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
|
|
1055
1082
|
return rune << 3 | 4;
|
|
1056
1083
|
} else {
|
|
@@ -1129,6 +1156,13 @@
|
|
|
1129
1156
|
return idx !== -1 && idx <= this.end - prefilter.str.length;
|
|
1130
1157
|
}
|
|
1131
1158
|
|
|
1159
|
+
// Executes a high-speed, single-pass search for multiple literal strings
|
|
1160
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1161
|
+
hasAnyString(prefilter, pos) {
|
|
1162
|
+
if (!prefilter.ac16) return false;
|
|
1163
|
+
return prefilter.ac16.searchUTF16(this.charSequence, this.start + pos, this.end);
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1132
1166
|
// Returns the rune at the specified index; the units are
|
|
1133
1167
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1134
1168
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1574,7 +1608,15 @@
|
|
|
1574
1608
|
if (this.hasMatch) {
|
|
1575
1609
|
start = this.groups[1];
|
|
1576
1610
|
if (this.groups[0] === this.groups[1]) {
|
|
1577
|
-
|
|
1611
|
+
// Safely calculate structural encoding width to avoid sequence corruption
|
|
1612
|
+
const machineInput = this.matcherInput.isUTF16Encoding() ? MachineInput.fromUTF16(this.matcherInput.asCharSequence(), 0, this.matcherInputLength) : MachineInput.fromUTF8(this.matcherInput.asBytes(), 0, this.matcherInputLength);
|
|
1613
|
+
const r = machineInput.step(start);
|
|
1614
|
+
if (r < 0) {
|
|
1615
|
+
// EOF
|
|
1616
|
+
start++; // Advance past length to force loop exit
|
|
1617
|
+
} else {
|
|
1618
|
+
start += r & 7; // Advance by safely decoded width
|
|
1619
|
+
}
|
|
1578
1620
|
}
|
|
1579
1621
|
}
|
|
1580
1622
|
return this.genMatch(start, RE2Flags.UNANCHORED);
|
|
@@ -1713,6 +1755,8 @@
|
|
|
1713
1755
|
const groupName = replacement.substring(i + 1, j);
|
|
1714
1756
|
res += this.group(groupName);
|
|
1715
1757
|
last = j + 1;
|
|
1758
|
+
i = j;
|
|
1759
|
+
continue;
|
|
1716
1760
|
}
|
|
1717
1761
|
}
|
|
1718
1762
|
}
|
|
@@ -1792,6 +1836,7 @@
|
|
|
1792
1836
|
if (j === replacement.length || replacement.codePointAt(j) !== Codepoint.CODES.get('>')) {
|
|
1793
1837
|
res += replacement.substring(i - 1, j + 1);
|
|
1794
1838
|
last = j + 1;
|
|
1839
|
+
i = j;
|
|
1795
1840
|
continue;
|
|
1796
1841
|
}
|
|
1797
1842
|
const groupName = replacement.substring(i + 1, j);
|
|
@@ -1801,6 +1846,8 @@
|
|
|
1801
1846
|
res += `$<${groupName}>`;
|
|
1802
1847
|
}
|
|
1803
1848
|
last = j + 1;
|
|
1849
|
+
i = j;
|
|
1850
|
+
continue;
|
|
1804
1851
|
}
|
|
1805
1852
|
}
|
|
1806
1853
|
}
|
|
@@ -1925,6 +1972,8 @@
|
|
|
1925
1972
|
return r === r0;
|
|
1926
1973
|
}
|
|
1927
1974
|
const len = this.runes.length;
|
|
1975
|
+
if (len === 0) return false;
|
|
1976
|
+
|
|
1928
1977
|
// If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search
|
|
1929
1978
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1930
1979
|
for (let j = 0; j < len; j += 2) {
|
|
@@ -1938,22 +1987,19 @@
|
|
|
1938
1987
|
return false; // Stop here
|
|
1939
1988
|
}
|
|
1940
1989
|
|
|
1941
|
-
//
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
}
|
|
1951
|
-
lo = m + 1;
|
|
1952
|
-
} else {
|
|
1953
|
-
hi = m;
|
|
1954
|
-
}
|
|
1990
|
+
// Branchless Binary Search (Lower Bound)
|
|
1991
|
+
// Compiles to optimal conditional move (cmov) machine code, preventing
|
|
1992
|
+
// branch mispredictions on large, chaotic Unicode arrays
|
|
1993
|
+
let base = 0;
|
|
1994
|
+
let n = len >> 1;
|
|
1995
|
+
while (n > 1) {
|
|
1996
|
+
const half = n >> 1;
|
|
1997
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
1998
|
+
n -= half;
|
|
1955
1999
|
}
|
|
1956
|
-
|
|
2000
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2001
|
+
const m = base - 1;
|
|
2002
|
+
return m >= 0 && r <= this.runes[m << 1 | 1];
|
|
1957
2003
|
}
|
|
1958
2004
|
|
|
1959
2005
|
// matchRunePos checks whether the instruction matches (and consumes) r.
|
|
@@ -1968,6 +2014,7 @@
|
|
|
1968
2014
|
return r === r0 ? 0 : -1;
|
|
1969
2015
|
}
|
|
1970
2016
|
const len = this.runes.length;
|
|
2017
|
+
if (len === 0) return -1;
|
|
1971
2018
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1972
2019
|
for (let j = 0; j < len; j += 2) {
|
|
1973
2020
|
if (r < this.runes[j]) return -1;
|
|
@@ -1975,19 +2022,18 @@
|
|
|
1975
2022
|
}
|
|
1976
2023
|
return -1;
|
|
1977
2024
|
}
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
} else {
|
|
1987
|
-
hi = m;
|
|
1988
|
-
}
|
|
2025
|
+
|
|
2026
|
+
// Branchless Binary Search (Lower Bound)
|
|
2027
|
+
let base = 0;
|
|
2028
|
+
let n = len >> 1;
|
|
2029
|
+
while (n > 1) {
|
|
2030
|
+
const half = n >> 1;
|
|
2031
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
2032
|
+
n -= half;
|
|
1989
2033
|
}
|
|
1990
|
-
|
|
2034
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2035
|
+
const m = base - 1;
|
|
2036
|
+
return m >= 0 && r <= this.runes[m << 1 | 1] ? m : -1;
|
|
1991
2037
|
}
|
|
1992
2038
|
/**
|
|
1993
2039
|
*
|
|
@@ -2086,6 +2132,7 @@
|
|
|
2086
2132
|
//
|
|
2087
2133
|
// Called by RE2.doExecute.
|
|
2088
2134
|
class Machine {
|
|
2135
|
+
static THREADS_CHUNK_SIZE = 128;
|
|
2089
2136
|
static fromRE2(re2) {
|
|
2090
2137
|
const m = new Machine();
|
|
2091
2138
|
m.prog = re2.prog;
|
|
@@ -2126,15 +2173,15 @@
|
|
|
2126
2173
|
resetCap() {
|
|
2127
2174
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2128
2175
|
const t = this.pool[i];
|
|
2129
|
-
t.cap.fill(
|
|
2176
|
+
t.cap.fill(-1);
|
|
2130
2177
|
}
|
|
2131
2178
|
}
|
|
2132
2179
|
initNewCap(ncap) {
|
|
2133
2180
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2134
2181
|
const t = this.pool[i];
|
|
2135
|
-
t.cap = new Int32Array(ncap);
|
|
2182
|
+
t.cap = new Int32Array(ncap).fill(-1);
|
|
2136
2183
|
}
|
|
2137
|
-
this.matchcap = new Int32Array(ncap);
|
|
2184
|
+
this.matchcap = new Int32Array(ncap).fill(-1);
|
|
2138
2185
|
}
|
|
2139
2186
|
submatches() {
|
|
2140
2187
|
if (this.ncap === 0) {
|
|
@@ -2147,14 +2194,21 @@
|
|
|
2147
2194
|
// alloc() allocates a new thread with the given instruction.
|
|
2148
2195
|
// It uses the free pool if possible.
|
|
2149
2196
|
alloc(inst) {
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2197
|
+
if (this.poolSize === 0) {
|
|
2198
|
+
const capLen = this.matchcap.length;
|
|
2199
|
+
|
|
2200
|
+
// Bulk allocate threads in a tight loop so the V8 engine
|
|
2201
|
+
// places them adjacently in the young generation heap
|
|
2202
|
+
for (let i = 0; i < Machine.THREADS_CHUNK_SIZE; i++) {
|
|
2203
|
+
const t = new Thread();
|
|
2204
|
+
t.cap = new Int32Array(capLen);
|
|
2205
|
+
this.pool[this.poolSize++] = t;
|
|
2206
|
+
}
|
|
2157
2207
|
}
|
|
2208
|
+
|
|
2209
|
+
// Pop a thread from the top of the pool stack
|
|
2210
|
+
this.poolSize--;
|
|
2211
|
+
const t = this.pool[this.poolSize];
|
|
2158
2212
|
t.inst = inst;
|
|
2159
2213
|
return t;
|
|
2160
2214
|
}
|
|
@@ -2207,6 +2261,9 @@
|
|
|
2207
2261
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
|
|
2208
2262
|
break;
|
|
2209
2263
|
}
|
|
2264
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2265
|
+
break;
|
|
2266
|
+
}
|
|
2210
2267
|
if (this.matched) {
|
|
2211
2268
|
break;
|
|
2212
2269
|
}
|
|
@@ -2284,6 +2341,9 @@
|
|
|
2284
2341
|
while (true) {
|
|
2285
2342
|
if (runq.isEmpty()) {
|
|
2286
2343
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
|
|
2344
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2345
|
+
break;
|
|
2346
|
+
}
|
|
2287
2347
|
}
|
|
2288
2348
|
if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2289
2349
|
// Spawn Lookbehind threads BEFORE the main pattern
|
|
@@ -2399,78 +2459,83 @@
|
|
|
2399
2459
|
runq.clear();
|
|
2400
2460
|
}
|
|
2401
2461
|
add(q, pc, pos, cap, cond, t) {
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
t = this.add(q, inst.arg, pos, cap, cond, t);
|
|
2417
|
-
break;
|
|
2418
|
-
case Inst.EMPTY_WIDTH:
|
|
2419
|
-
if ((inst.arg & ~cond) === 0) {
|
|
2420
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2421
|
-
}
|
|
2422
|
-
break;
|
|
2423
|
-
case Inst.NOP:
|
|
2424
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2425
|
-
break;
|
|
2426
|
-
case Inst.CAPTURE:
|
|
2427
|
-
if (inst.arg < this.ncap) {
|
|
2428
|
-
const opos = cap[inst.arg];
|
|
2429
|
-
cap[inst.arg] = pos;
|
|
2430
|
-
this.add(q, inst.out, pos, cap, cond, null);
|
|
2431
|
-
cap[inst.arg] = opos;
|
|
2432
|
-
} else {
|
|
2462
|
+
while (true) {
|
|
2463
|
+
if (pc === 0) {
|
|
2464
|
+
return t;
|
|
2465
|
+
}
|
|
2466
|
+
if (q.contains(pc)) {
|
|
2467
|
+
return t;
|
|
2468
|
+
}
|
|
2469
|
+
const d = q.add(pc);
|
|
2470
|
+
const inst = this.prog.inst[pc];
|
|
2471
|
+
switch (inst.op) {
|
|
2472
|
+
case Inst.FAIL:
|
|
2473
|
+
return t;
|
|
2474
|
+
case Inst.ALT:
|
|
2475
|
+
case Inst.ALT_MATCH:
|
|
2433
2476
|
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
case Inst.LB_CHECK:
|
|
2441
|
-
if (inst.lb > 0) {
|
|
2442
|
-
// Positive Lookbehind
|
|
2443
|
-
if (this.lbTable[inst.lb] === pos) {
|
|
2444
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2477
|
+
pc = inst.arg; // Flattened tail recursion
|
|
2478
|
+
continue;
|
|
2479
|
+
case Inst.EMPTY_WIDTH:
|
|
2480
|
+
if ((inst.arg & ~cond) === 0) {
|
|
2481
|
+
pc = inst.out; // Flattened tail recursion
|
|
2482
|
+
continue;
|
|
2445
2483
|
}
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
|
|
2460
|
-
}
|
|
2461
|
-
if (this.ncap > 0 && t.cap !== cap) {
|
|
2462
|
-
// Direct assignment utilizing Typed Array performance
|
|
2463
|
-
for (let c = 0; c < this.ncap; c++) {
|
|
2464
|
-
t.cap[c] = cap[c];
|
|
2484
|
+
return t;
|
|
2485
|
+
case Inst.NOP:
|
|
2486
|
+
pc = inst.out; // Flattened tail recursion
|
|
2487
|
+
continue;
|
|
2488
|
+
case Inst.CAPTURE:
|
|
2489
|
+
if (inst.arg < this.ncap) {
|
|
2490
|
+
const opos = cap[inst.arg];
|
|
2491
|
+
cap[inst.arg] = pos;
|
|
2492
|
+
this.add(q, inst.out, pos, cap, cond, null);
|
|
2493
|
+
cap[inst.arg] = opos;
|
|
2494
|
+
return t;
|
|
2495
|
+
} else {
|
|
2496
|
+
pc = inst.out; // Flattened tail recursion
|
|
2497
|
+
continue;
|
|
2465
2498
|
}
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2499
|
+
case Inst.LB_WRITE:
|
|
2500
|
+
this.lbTable[Math.abs(inst.lb)] = pos;
|
|
2501
|
+
pc = inst.out;
|
|
2502
|
+
continue;
|
|
2503
|
+
case Inst.LB_CHECK:
|
|
2504
|
+
if (inst.lb > 0) {
|
|
2505
|
+
// Positive Lookbehind
|
|
2506
|
+
if (this.lbTable[inst.lb] === pos) {
|
|
2507
|
+
pc = inst.out; // Flattened tail recursion
|
|
2508
|
+
continue;
|
|
2509
|
+
}
|
|
2510
|
+
} else if (this.lbTable[-inst.lb] !== pos) {
|
|
2511
|
+
// Negative Lookbehind
|
|
2512
|
+
pc = inst.out; // Flattened tail recursion
|
|
2513
|
+
continue;
|
|
2514
|
+
}
|
|
2515
|
+
return t;
|
|
2516
|
+
case Inst.MATCH:
|
|
2517
|
+
case Inst.RUNE:
|
|
2518
|
+
case Inst.RUNE1:
|
|
2519
|
+
case Inst.RUNE_ANY:
|
|
2520
|
+
case Inst.RUNE_ANY_NOT_NL:
|
|
2521
|
+
if (t === null) {
|
|
2522
|
+
t = this.alloc(inst);
|
|
2523
|
+
} else {
|
|
2524
|
+
t.inst = inst;
|
|
2525
|
+
}
|
|
2526
|
+
if (this.ncap > 0 && t.cap !== cap) {
|
|
2527
|
+
// Direct assignment utilizing Typed Array performance
|
|
2528
|
+
for (let c = 0; c < this.ncap; c++) {
|
|
2529
|
+
t.cap[c] = cap[c];
|
|
2530
|
+
}
|
|
2531
|
+
}
|
|
2532
|
+
q.denseThreads[d] = t;
|
|
2533
|
+
t = null;
|
|
2534
|
+
return t;
|
|
2535
|
+
default:
|
|
2536
|
+
throw new Error('unhandled');
|
|
2537
|
+
}
|
|
2472
2538
|
}
|
|
2473
|
-
return t;
|
|
2474
2539
|
}
|
|
2475
2540
|
}
|
|
2476
2541
|
|
|
@@ -2498,8 +2563,15 @@
|
|
|
2498
2563
|
this.nfaStates = nfaStates; // Int32Array of Instruction PCs
|
|
2499
2564
|
this.isMatch = isMatch; // Boolean
|
|
2500
2565
|
this.matchIDs = matchIDs; // Array of integers indicating which Set patterns matched
|
|
2501
|
-
|
|
2502
|
-
|
|
2566
|
+
|
|
2567
|
+
// Latin-1 (Unicode.MAX_LATIN1 + 1) flat arrays for blisteringly fast O(1) lookups
|
|
2568
|
+
// completely covering standard English, European languages, and 1-byte encodings.
|
|
2569
|
+
this.nextLatin1 = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2570
|
+
this.nextLatin1Anchored = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2571
|
+
// Two parallel arrays used as a small hash map for V8 optimization (N is small, so an O(n) linear scan is faster than a Map's O(1) lookup)
|
|
2572
|
+
this.transKeys = [];
|
|
2573
|
+
this.transVals = [];
|
|
2574
|
+
this.lastSeen = 0; // Track when this state was last used for LRU eviction
|
|
2503
2575
|
}
|
|
2504
2576
|
}
|
|
2505
2577
|
class DFA {
|
|
@@ -2512,6 +2584,7 @@
|
|
|
2512
2584
|
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2513
2585
|
this.cacheClears = 0; // Track thrashing
|
|
2514
2586
|
this.failed = false; // mark if DFA cannot work with provided prog
|
|
2587
|
+
this.clock = 0; // Global clock for LRU eviction
|
|
2515
2588
|
}
|
|
2516
2589
|
|
|
2517
2590
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2571,6 +2644,7 @@
|
|
|
2571
2644
|
for (let i = 0; i < bucket.length; i++) {
|
|
2572
2645
|
const state = bucket[i];
|
|
2573
2646
|
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2647
|
+
state.lastSeen = ++this.clock;
|
|
2574
2648
|
return state;
|
|
2575
2649
|
}
|
|
2576
2650
|
}
|
|
@@ -2583,40 +2657,99 @@
|
|
|
2583
2657
|
if (this.failed) return null;
|
|
2584
2658
|
|
|
2585
2659
|
// Safety: prevent memory exhaustion from state explosion
|
|
2586
|
-
// We
|
|
2660
|
+
// We prune the cache to keep the newest 50%
|
|
2587
2661
|
if (this.stateCount >= this.stateLimit) {
|
|
2588
|
-
this.stateCache.clear();
|
|
2589
|
-
this.stateCount = 0;
|
|
2590
|
-
this.startState = null;
|
|
2591
2662
|
this.cacheClears++;
|
|
2592
2663
|
|
|
2593
2664
|
// If this regex causes continuous cache thrashing, permanently fall back to NFA
|
|
2594
2665
|
// to avoid spending CPU cycles constantly rebuilding the DFA tree.
|
|
2595
2666
|
if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
|
|
2596
2667
|
this.failed = true;
|
|
2668
|
+
this.stateCache.clear();
|
|
2669
|
+
this.stateCount = 0;
|
|
2670
|
+
this.startState = null;
|
|
2671
|
+
return null;
|
|
2672
|
+
}
|
|
2673
|
+
this.evictCache();
|
|
2674
|
+
|
|
2675
|
+
// After eviction, the bucket reference might be stale or empty.
|
|
2676
|
+
// We must re-fetch or re-create the bucket.
|
|
2677
|
+
bucket = this.stateCache.get(hash);
|
|
2678
|
+
if (!bucket) {
|
|
2679
|
+
bucket = [];
|
|
2680
|
+
this.stateCache.set(hash, bucket);
|
|
2597
2681
|
}
|
|
2598
|
-
return null;
|
|
2599
2682
|
}
|
|
2600
2683
|
|
|
2601
2684
|
// State not found, create it and add to bucket
|
|
2602
2685
|
const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.matchIDs);
|
|
2686
|
+
state.lastSeen = ++this.clock;
|
|
2603
2687
|
bucket.push(state);
|
|
2604
2688
|
this.stateCount++;
|
|
2605
2689
|
return state;
|
|
2606
2690
|
}
|
|
2691
|
+
evictCache() {
|
|
2692
|
+
const allStates = [];
|
|
2693
|
+
for (const bucket of this.stateCache.values()) {
|
|
2694
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2695
|
+
allStates.push(bucket[i]);
|
|
2696
|
+
}
|
|
2697
|
+
}
|
|
2698
|
+
|
|
2699
|
+
// Sort ascending by lastSeen (oldest first)
|
|
2700
|
+
allStates.sort((a, b) => a.lastSeen - b.lastSeen);
|
|
2701
|
+
|
|
2702
|
+
// Keep the newest 50%
|
|
2703
|
+
const keepCount = Math.max(1, Math.floor(this.stateLimit / 2));
|
|
2704
|
+
const startIndex = allStates.length - keepCount;
|
|
2705
|
+
const survivorsArray = allStates.slice(startIndex);
|
|
2706
|
+
const survivors = new Set(survivorsArray);
|
|
2707
|
+
this.stateCache.clear();
|
|
2708
|
+
this.stateCount = 0;
|
|
2709
|
+
for (let i = 0; i < survivorsArray.length; i++) {
|
|
2710
|
+
const state = survivorsArray[i];
|
|
2711
|
+
|
|
2712
|
+
// Sever ties to all states to prevent memory leaks and dangling pointers
|
|
2713
|
+
state.nextLatin1.fill(null);
|
|
2714
|
+
state.nextLatin1Anchored.fill(null);
|
|
2715
|
+
// zero-allocation cleanup
|
|
2716
|
+
state.transKeys.length = 0;
|
|
2717
|
+
state.transVals.length = 0;
|
|
2718
|
+
const hash = hashPCs(state.nfaStates);
|
|
2719
|
+
let bucket = this.stateCache.get(hash);
|
|
2720
|
+
if (!bucket) {
|
|
2721
|
+
bucket = [];
|
|
2722
|
+
this.stateCache.set(hash, bucket);
|
|
2723
|
+
}
|
|
2724
|
+
bucket.push(state);
|
|
2725
|
+
this.stateCount++;
|
|
2726
|
+
}
|
|
2727
|
+
|
|
2728
|
+
// Start state must either be preserved or nullified so it gets re-created
|
|
2729
|
+
if (this.startState && !survivors.has(this.startState)) {
|
|
2730
|
+
this.startState = null;
|
|
2731
|
+
}
|
|
2732
|
+
}
|
|
2607
2733
|
|
|
2608
2734
|
// Compute the next DFA state given a current state and a character
|
|
2609
2735
|
step(state, charCode, anchor) {
|
|
2610
|
-
// OPTIMIZATION:
|
|
2611
|
-
if (
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
return next;
|
|
2736
|
+
// OPTIMIZATION: Latin-1 Array Fast-Path
|
|
2737
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2738
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2739
|
+
const next = state.nextLatin1[charCode];
|
|
2740
|
+
if (next !== null) return next;
|
|
2741
|
+
} else {
|
|
2742
|
+
const next = state.nextLatin1Anchored[charCode];
|
|
2743
|
+
if (next !== null) return next;
|
|
2615
2744
|
}
|
|
2616
2745
|
} else {
|
|
2746
|
+
// Dense Array Linear Search fallback for Runes > 255
|
|
2617
2747
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2618
|
-
|
|
2619
|
-
|
|
2748
|
+
// get [key] -> nextState
|
|
2749
|
+
const keys = state.transKeys;
|
|
2750
|
+
const len = keys.length;
|
|
2751
|
+
for (let i = 0; i < len; i++) {
|
|
2752
|
+
if (keys[i] === key) return state.transVals[i];
|
|
2620
2753
|
}
|
|
2621
2754
|
}
|
|
2622
2755
|
const nextPCs = [];
|
|
@@ -2633,11 +2766,17 @@
|
|
|
2633
2766
|
const nextState = this.getState(nextPCs);
|
|
2634
2767
|
|
|
2635
2768
|
// Cache the result
|
|
2636
|
-
if (
|
|
2637
|
-
|
|
2769
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2770
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2771
|
+
state.nextLatin1[charCode] = nextState;
|
|
2772
|
+
} else {
|
|
2773
|
+
state.nextLatin1Anchored[charCode] = nextState;
|
|
2774
|
+
}
|
|
2638
2775
|
} else {
|
|
2639
2776
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2640
|
-
|
|
2777
|
+
// store key -> nextState
|
|
2778
|
+
state.transKeys.push(key);
|
|
2779
|
+
state.transVals.push(nextState);
|
|
2641
2780
|
}
|
|
2642
2781
|
return nextState;
|
|
2643
2782
|
}
|
|
@@ -2670,10 +2809,11 @@
|
|
|
2670
2809
|
if (width === 0) {
|
|
2671
2810
|
break;
|
|
2672
2811
|
}
|
|
2673
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2812
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2674
2813
|
|
|
2675
2814
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
2676
2815
|
if (currentState === null) return null;
|
|
2816
|
+
currentState.lastSeen = ++this.clock;
|
|
2677
2817
|
if (currentState.isMatch) {
|
|
2678
2818
|
if (anchor === RE2Flags.ANCHOR_BOTH) {
|
|
2679
2819
|
if (i + width === endPos) return true;
|
|
@@ -2721,9 +2861,10 @@
|
|
|
2721
2861
|
const rune = r >> 3;
|
|
2722
2862
|
const width = r & 7;
|
|
2723
2863
|
if (width === 0) break;
|
|
2724
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2864
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2725
2865
|
if (currentState === null) return null; // Bailout to NFA
|
|
2726
2866
|
|
|
2867
|
+
currentState.lastSeen = ++this.clock;
|
|
2727
2868
|
i += width;
|
|
2728
2869
|
checkMatch(currentState, i);
|
|
2729
2870
|
if (currentState.nfaStates.length === 0) {
|
|
@@ -2761,7 +2902,7 @@
|
|
|
2761
2902
|
// Bitwise shift (>>> 5) instead of Math.floor( / 32)
|
|
2762
2903
|
const visitedSize = prog.numInst() * (end + 1) + VISITED_BITS - 1 >>> 5;
|
|
2763
2904
|
if (this.visited.length < visitedSize) {
|
|
2764
|
-
this.visited = new Uint32Array(
|
|
2905
|
+
this.visited = new Uint32Array(visitedSize);
|
|
2765
2906
|
} else {
|
|
2766
2907
|
this.visited.fill(0, 0, visitedSize);
|
|
2767
2908
|
}
|
|
@@ -2847,11 +2988,12 @@
|
|
|
2847
2988
|
const outInst = re2.prog.getInst(inst.out);
|
|
2848
2989
|
if (Inst.isRuneOp(outInst.op)) {
|
|
2849
2990
|
this.push(re2, inst.arg, currentPos, false);
|
|
2850
|
-
currentPc = inst.
|
|
2991
|
+
currentPc = inst.arg;
|
|
2992
|
+
currentPos = this.end;
|
|
2851
2993
|
continue;
|
|
2852
2994
|
}
|
|
2853
2995
|
this.push(re2, inst.out, this.end, false);
|
|
2854
|
-
currentPc = inst.
|
|
2996
|
+
currentPc = inst.out;
|
|
2855
2997
|
continue;
|
|
2856
2998
|
}
|
|
2857
2999
|
case Inst.RUNE:
|
|
@@ -2932,6 +3074,11 @@
|
|
|
2932
3074
|
if (currentPos === this.end) return true;
|
|
2933
3075
|
break;
|
|
2934
3076
|
}
|
|
3077
|
+
case Inst.LB_WRITE:
|
|
3078
|
+
case Inst.LB_CHECK:
|
|
3079
|
+
{
|
|
3080
|
+
throw new RE2JSInternalException('Backtracker cannot evaluate Lookbehind instructions');
|
|
3081
|
+
}
|
|
2935
3082
|
default:
|
|
2936
3083
|
{
|
|
2937
3084
|
throw new RE2JSInternalException('bad inst');
|
|
@@ -3200,7 +3347,9 @@
|
|
|
3200
3347
|
}
|
|
3201
3348
|
runes.sort((a, b) => a - b);
|
|
3202
3349
|
} else {
|
|
3203
|
-
|
|
3350
|
+
for (let j = 0; j < inst.runes.length; j++) {
|
|
3351
|
+
runes.push(inst.runes[j]);
|
|
3352
|
+
}
|
|
3204
3353
|
}
|
|
3205
3354
|
onePassRunes[pc] = runes;
|
|
3206
3355
|
inst.next = new Uint32Array(Math.floor(runes.length / 2) + 1).fill(inst.out);
|
|
@@ -3368,6 +3517,10 @@
|
|
|
3368
3517
|
switch (inst.op) {
|
|
3369
3518
|
case Inst.MATCH:
|
|
3370
3519
|
{
|
|
3520
|
+
// Verify ANCHOR_BOTH constraint before accepting the match
|
|
3521
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) {
|
|
3522
|
+
return null;
|
|
3523
|
+
}
|
|
3371
3524
|
matched = true;
|
|
3372
3525
|
if (matchcap.length > 0) {
|
|
3373
3526
|
matchcap[0] = 0;
|
|
@@ -3798,6 +3951,88 @@
|
|
|
3798
3951
|
}
|
|
3799
3952
|
}
|
|
3800
3953
|
|
|
3954
|
+
// Single-pass Aho-Corasick multi-pattern matcher.
// Builds a trie over all words and then adds failure links, so a haystack can be
// scanned once while every word is tracked simultaneously.
class AhoCorasick {
  /**
   * Builds the automaton for the given words.
   *
   * Node ids index three parallel arrays; node 0 is the root:
   *   - next[id]:  null-prototype object mapping a symbol to the child node id
   *   - fail[id]:  node to fall back to when no edge matches the current symbol
   *   - match[id]: true when reaching this node means some word ends here
   *
   * @param {Iterable<ArrayLike<number>>} wordArrays words to search for, each given as a
   *   sequence of numeric symbols (UTF-16 code units or UTF-8 bytes)
   */
  constructor(wordArrays) {
    this.next = [Object.create(null)];
    this.fail = [0];
    this.match = [false];

    // Build the trie.
    for (const word of wordArrays) {
      let node = 0;
      for (let i = 0; i < word.length; i++) {
        const val = word[i];
        const edges = this.next[node];
        if (!(val in edges)) {
          this.next.push(Object.create(null));
          this.fail.push(0);
          this.match.push(false);
          edges[val] = this.next.length - 1;
        }
        node = edges[val];
      }
      this.match[node] = true;
    }

    // Build failure links breadth-first. The queue is walked with a cursor instead of
    // shift(): shift() re-indexes the array on every call, which would make the
    // construction quadratic in the number of nodes.
    // Edge tables have a null prototype, so Object.keys() yields exactly the edges.
    const queue = [];
    const rootEdges = this.next[0];
    for (const val of Object.keys(rootEdges)) {
      const child = rootEdges[val];
      // Depth-1 nodes always fall back to the root.
      this.fail[child] = 0;
      queue.push(child);
    }
    for (let head = 0; head < queue.length; head++) {
      const curr = queue[head];
      const edges = this.next[curr];
      for (const val of Object.keys(edges)) {
        const child = edges[val];
        // Follow the parent's failure chain until a node with an edge on `val` is found,
        // or the root is reached.
        let failNode = this.fail[curr];
        while (failNode !== 0 && !(val in this.next[failNode])) {
          failNode = this.fail[failNode];
        }
        this.fail[child] = val in this.next[failNode] ? this.next[failNode][val] : 0;
        // A node also matches when the longest proper suffix it falls back to matches.
        this.match[child] = this.match[child] || this.match[this.fail[child]];
        queue.push(child);
      }
    }
  }

  /**
   * Advances the automaton by one symbol.
   *
   * @param {number} node current node id
   * @param {number} val symbol being consumed
   * @returns {number} node id after consuming `val`
   */
  step(node, val) {
    let state = node;
    while (state !== 0 && !(val in this.next[state])) {
      state = this.fail[state];
    }
    if (val in this.next[state]) {
      state = this.next[state][val];
    }
    return state;
  }

  /**
   * Reports whether any word occurs in `charSeq` within [start, end), comparing
   * UTF-16 code units.
   *
   * @param {string} charSeq text to scan
   * @param {number} start first index to scan (inclusive)
   * @param {number} end index to stop at (exclusive)
   * @returns {boolean} true as soon as any word is found, false otherwise
   */
  searchUTF16(charSeq, start, end) {
    let node = 0;
    for (let i = start; i < end; i++) {
      node = this.step(node, charSeq.charCodeAt(i));
      if (this.match[node]) return true;
    }
    return false;
  }

  /**
   * Reports whether any word occurs in `bytes` within [start, end).
   *
   * @param {ArrayLike<number>} bytes byte sequence to scan
   * @param {number} start first index to scan (inclusive)
   * @param {number} end index to stop at (exclusive)
   * @returns {boolean} true as soon as any word is found, false otherwise
   */
  searchUTF8(bytes, start, end) {
    let node = 0;
    for (let i = start; i < end; i++) {
      node = this.step(node, bytes[i]);
      if (this.match[node]) return true;
    }
    return false;
  }
}
|
|
3801
4036
|
class Prefilter {
|
|
3802
4037
|
static Type = {
|
|
3803
4038
|
NONE: 0,
|
|
@@ -3810,6 +4045,8 @@
|
|
|
3810
4045
|
this.subs = [];
|
|
3811
4046
|
this.str = '';
|
|
3812
4047
|
this.bytes = null;
|
|
4048
|
+
this.ac16 = null;
|
|
4049
|
+
this.ac8 = null;
|
|
3813
4050
|
}
|
|
3814
4051
|
eval(input, pos) {
|
|
3815
4052
|
switch (this.type) {
|
|
@@ -3823,6 +4060,10 @@
|
|
|
3823
4060
|
}
|
|
3824
4061
|
return true;
|
|
3825
4062
|
case Prefilter.Type.OR:
|
|
4063
|
+
// Exploit Aho-Corasick if it was successfully built
|
|
4064
|
+
if (this.ac16 && this.ac8) {
|
|
4065
|
+
return input.hasAnyString(this, pos);
|
|
4066
|
+
}
|
|
3826
4067
|
for (let i = 0; i < this.subs.length; i++) {
|
|
3827
4068
|
if (this.subs[i].eval(input, pos)) return true;
|
|
3828
4069
|
}
|
|
@@ -3913,7 +4154,9 @@
|
|
|
3913
4154
|
const s = PrefilterTree.simplify(sub);
|
|
3914
4155
|
if (s.type !== Prefilter.Type.NONE) {
|
|
3915
4156
|
if (s.type === Prefilter.Type.AND) {
|
|
3916
|
-
|
|
4157
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4158
|
+
newSubs.push(s.subs[j]);
|
|
4159
|
+
}
|
|
3917
4160
|
} else {
|
|
3918
4161
|
newSubs.push(s);
|
|
3919
4162
|
}
|
|
@@ -3933,7 +4176,9 @@
|
|
|
3933
4176
|
return new Prefilter(Prefilter.Type.NONE);
|
|
3934
4177
|
}
|
|
3935
4178
|
if (s.type === Prefilter.Type.OR) {
|
|
3936
|
-
|
|
4179
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4180
|
+
newSubs.push(s.subs[j]);
|
|
4181
|
+
}
|
|
3937
4182
|
} else {
|
|
3938
4183
|
newSubs.push(s);
|
|
3939
4184
|
}
|
|
@@ -3955,6 +4200,27 @@
|
|
|
3955
4200
|
}
|
|
3956
4201
|
}
|
|
3957
4202
|
pf.subs = uniqueSubs;
|
|
4203
|
+
|
|
4204
|
+
// Build an Aho-Corasick automaton if all children are exact matches
|
|
4205
|
+
let allExact = true;
|
|
4206
|
+
for (const sub of uniqueSubs) {
|
|
4207
|
+
if (sub.type !== Prefilter.Type.EXACT) {
|
|
4208
|
+
allExact = false;
|
|
4209
|
+
break;
|
|
4210
|
+
}
|
|
4211
|
+
}
|
|
4212
|
+
if (allExact && uniqueSubs.length > 1) {
|
|
4213
|
+
const words16 = uniqueSubs.map(s => {
|
|
4214
|
+
const arr = [];
|
|
4215
|
+
for (let i = 0; i < s.str.length; i++) {
|
|
4216
|
+
arr.push(s.str.charCodeAt(i));
|
|
4217
|
+
}
|
|
4218
|
+
return arr;
|
|
4219
|
+
});
|
|
4220
|
+
pf.ac16 = new AhoCorasick(words16);
|
|
4221
|
+
const words8 = uniqueSubs.map(s => s.bytes);
|
|
4222
|
+
pf.ac8 = new AhoCorasick(words8);
|
|
4223
|
+
}
|
|
3958
4224
|
return pf;
|
|
3959
4225
|
}
|
|
3960
4226
|
return pf;
|
|
@@ -4481,7 +4747,9 @@
|
|
|
4481
4747
|
// Flatten nested concatenations
|
|
4482
4748
|
if (nsub.op === Regexp.Op.CONCAT) {
|
|
4483
4749
|
changed = true;
|
|
4484
|
-
|
|
4750
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4751
|
+
newSubs.push(nsub.subs[j]);
|
|
4752
|
+
}
|
|
4485
4753
|
continue;
|
|
4486
4754
|
}
|
|
4487
4755
|
} else if (re.op === Regexp.Op.ALTERNATE) {
|
|
@@ -4493,7 +4761,9 @@
|
|
|
4493
4761
|
// Flatten nested alternations
|
|
4494
4762
|
if (nsub.op === Regexp.Op.ALTERNATE) {
|
|
4495
4763
|
changed = true;
|
|
4496
|
-
|
|
4764
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4765
|
+
newSubs.push(nsub.subs[j]);
|
|
4766
|
+
}
|
|
4497
4767
|
continue;
|
|
4498
4768
|
}
|
|
4499
4769
|
}
|
|
@@ -5503,7 +5773,10 @@
|
|
|
5503
5773
|
return t.pop();
|
|
5504
5774
|
}
|
|
5505
5775
|
static concatRunes(x, y) {
|
|
5506
|
-
|
|
5776
|
+
for (let i = 0; i < y.length; i++) {
|
|
5777
|
+
x.push(y[i]);
|
|
5778
|
+
}
|
|
5779
|
+
return x;
|
|
5507
5780
|
}
|
|
5508
5781
|
constructor(wholeRegexp, flags = 0) {
|
|
5509
5782
|
this.wholeRegexp = wholeRegexp;
|
|
@@ -5539,8 +5812,8 @@
|
|
|
5539
5812
|
return re;
|
|
5540
5813
|
}
|
|
5541
5814
|
reuse(re) {
|
|
5542
|
-
if (this.height !== null &&
|
|
5543
|
-
|
|
5815
|
+
if (this.height !== null && this.height.has(re)) {
|
|
5816
|
+
this.height.delete(re);
|
|
5544
5817
|
}
|
|
5545
5818
|
if (re.subs !== null && re.subs.length > 0) {
|
|
5546
5819
|
re.subs[0] = this.free;
|
|
@@ -5572,20 +5845,20 @@
|
|
|
5572
5845
|
if (n <= 0) {
|
|
5573
5846
|
n = 1;
|
|
5574
5847
|
}
|
|
5575
|
-
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
5848
|
+
if (n > Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5576
5849
|
this.repeats = Parser.MAX_SIZE;
|
|
5577
5850
|
} else {
|
|
5578
5851
|
this.repeats *= n;
|
|
5579
5852
|
}
|
|
5580
5853
|
}
|
|
5581
|
-
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
5854
|
+
if (this.numRegexp < Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5582
5855
|
return;
|
|
5583
5856
|
}
|
|
5584
5857
|
|
|
5585
5858
|
// We need to start tracking size.
|
|
5586
5859
|
// Make the map and belatedly populate it
|
|
5587
5860
|
// with info about everything we've constructed so far.
|
|
5588
|
-
this.size =
|
|
5861
|
+
this.size = new Map();
|
|
5589
5862
|
for (let reEx of this.stack) {
|
|
5590
5863
|
this.checkSize(reEx);
|
|
5591
5864
|
}
|
|
@@ -5596,8 +5869,8 @@
|
|
|
5596
5869
|
}
|
|
5597
5870
|
calcSize(re, force = false) {
|
|
5598
5871
|
if (!force && this.size !== null) {
|
|
5599
|
-
if (
|
|
5600
|
-
return this.size
|
|
5872
|
+
if (this.size.has(re)) {
|
|
5873
|
+
return this.size.get(re);
|
|
5601
5874
|
}
|
|
5602
5875
|
}
|
|
5603
5876
|
let size = 0;
|
|
@@ -5657,9 +5930,9 @@
|
|
|
5657
5930
|
}
|
|
5658
5931
|
size = Math.max(1, size);
|
|
5659
5932
|
if (this.size === null) {
|
|
5660
|
-
this.size =
|
|
5933
|
+
this.size = new Map();
|
|
5661
5934
|
}
|
|
5662
|
-
this.size
|
|
5935
|
+
this.size.set(re, size);
|
|
5663
5936
|
return size;
|
|
5664
5937
|
}
|
|
5665
5938
|
checkHeight(re) {
|
|
@@ -5667,7 +5940,7 @@
|
|
|
5667
5940
|
return;
|
|
5668
5941
|
}
|
|
5669
5942
|
if (this.height === null) {
|
|
5670
|
-
this.height =
|
|
5943
|
+
this.height = new Map();
|
|
5671
5944
|
for (let reEx of this.stack) {
|
|
5672
5945
|
this.checkHeight(reEx);
|
|
5673
5946
|
}
|
|
@@ -5678,8 +5951,8 @@
|
|
|
5678
5951
|
}
|
|
5679
5952
|
calcHeight(re, force = false) {
|
|
5680
5953
|
if (!force && this.height !== null) {
|
|
5681
|
-
if (
|
|
5682
|
-
return this.height
|
|
5954
|
+
if (this.height.has(re)) {
|
|
5955
|
+
return this.height.get(re);
|
|
5683
5956
|
}
|
|
5684
5957
|
}
|
|
5685
5958
|
let h = 1;
|
|
@@ -5690,9 +5963,9 @@
|
|
|
5690
5963
|
}
|
|
5691
5964
|
}
|
|
5692
5965
|
if (this.height === null) {
|
|
5693
|
-
this.height =
|
|
5966
|
+
this.height = new Map();
|
|
5694
5967
|
}
|
|
5695
|
-
this.height
|
|
5968
|
+
this.height.set(re, h);
|
|
5696
5969
|
return h;
|
|
5697
5970
|
}
|
|
5698
5971
|
|
|
@@ -7814,7 +8087,7 @@
|
|
|
7814
8087
|
changed = true;
|
|
7815
8088
|
continue;
|
|
7816
8089
|
} else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
7817
|
-
if (i + 3
|
|
8090
|
+
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
7818
8091
|
result += '(?P<';
|
|
7819
8092
|
i += 3;
|
|
7820
8093
|
changed = true;
|