re2js 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +54 -54
- package/build/index.cjs.cjs +432 -159
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +3 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +432 -159
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +432 -159
- package/build/index.umd.js.map +1 -1
- package/package.json +7 -7
package/build/index.esm.js
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.0
|
|
6
|
-
* @author
|
|
5
|
+
* @version v2.2.2
|
|
6
|
+
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
9
9
|
* @license MIT
|
|
@@ -76,6 +76,8 @@ const PublicFlags = {
|
|
|
76
76
|
const ASCII_SIZE = 128;
|
|
77
77
|
const ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
|
|
78
78
|
const ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
|
|
79
|
+
// The highest legal Basic Multilingual Plane (BMP) value.
|
|
80
|
+
const MAX_BMP = 0xffff;
|
|
79
81
|
for (let i = 0; i < ASCII_SIZE; i++) {
|
|
80
82
|
if (i >= 97 && i <= 122) {
|
|
81
83
|
// a-z
|
|
@@ -99,11 +101,13 @@ class Codepoint {
|
|
|
99
101
|
static toUpperCase(codepoint) {
|
|
100
102
|
if (codepoint < ASCII_SIZE) return ASCII_TO_UPPER[codepoint];
|
|
101
103
|
const s = String.fromCodePoint(codepoint).toUpperCase();
|
|
102
|
-
|
|
104
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
105
|
+
if (s.length > expectedLen) {
|
|
103
106
|
return codepoint;
|
|
104
107
|
}
|
|
105
108
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toLowerCase();
|
|
106
|
-
|
|
109
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
110
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
107
111
|
return codepoint;
|
|
108
112
|
}
|
|
109
113
|
return s.codePointAt(0);
|
|
@@ -114,11 +118,13 @@ class Codepoint {
|
|
|
114
118
|
static toLowerCase(codepoint) {
|
|
115
119
|
if (codepoint < ASCII_SIZE) return ASCII_TO_LOWER[codepoint];
|
|
116
120
|
const s = String.fromCodePoint(codepoint).toLowerCase();
|
|
117
|
-
|
|
121
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
122
|
+
if (s.length > expectedLen) {
|
|
118
123
|
return codepoint;
|
|
119
124
|
}
|
|
120
125
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toUpperCase();
|
|
121
|
-
|
|
126
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
127
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
122
128
|
return codepoint;
|
|
123
129
|
}
|
|
124
130
|
return s.codePointAt(0);
|
|
@@ -220,7 +226,7 @@ class UnicodeTables {
|
|
|
220
226
|
static _CASE_ORBIT = null;
|
|
221
227
|
static get CASE_ORBIT() {
|
|
222
228
|
if (!this._CASE_ORBIT) {
|
|
223
|
-
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+
|
|
229
|
+
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+B');
|
|
224
230
|
}
|
|
225
231
|
return this._CASE_ORBIT;
|
|
226
232
|
}
|
|
@@ -592,11 +598,16 @@ class Unicode {
|
|
|
592
598
|
// to compare it to |r2|.
|
|
593
599
|
// -1 is interpreted as the end-of-file mark.
|
|
594
600
|
static equalsIgnoreCase(r1, r2) {
|
|
595
|
-
// Runes already match
|
|
596
|
-
if (r1
|
|
601
|
+
// Runes already match
|
|
602
|
+
if (r1 === r2) {
|
|
597
603
|
return true;
|
|
598
604
|
}
|
|
599
605
|
|
|
606
|
+
// Safely fail if either is EOF (and they didn't explicitly match above)
|
|
607
|
+
if (r1 < 0 || r2 < 0) {
|
|
608
|
+
return false;
|
|
609
|
+
}
|
|
610
|
+
|
|
600
611
|
// Fast path for the common case where both runes are ASCII characters.
|
|
601
612
|
// Coerces both runes to lowercase if applicable.
|
|
602
613
|
if (r1 <= this.MAX_ASCII && r2 <= this.MAX_ASCII) {
|
|
@@ -849,7 +860,7 @@ class Utils {
|
|
|
849
860
|
// Encoding[(Encoding['UTF_16'] = 0)] = 'UTF_16'
|
|
850
861
|
// Encoding[(Encoding['UTF_8'] = 1)] = 'UTF_8'
|
|
851
862
|
const createEnum = (values = [], initNum = 0) => {
|
|
852
|
-
const enumObject =
|
|
863
|
+
const enumObject = Object.create(null);
|
|
853
864
|
for (let i = 0; i < values.length; i++) {
|
|
854
865
|
const val = values[i];
|
|
855
866
|
const keyVal = initNum + i;
|
|
@@ -991,6 +1002,9 @@ class MachineInputBase {
|
|
|
991
1002
|
hasString() {
|
|
992
1003
|
return false;
|
|
993
1004
|
}
|
|
1005
|
+
hasAnyString() {
|
|
1006
|
+
return false;
|
|
1007
|
+
}
|
|
994
1008
|
|
|
995
1009
|
// Helper for the exact-literal fast-path execution router
|
|
996
1010
|
prefixLength() {
|
|
@@ -1016,6 +1030,13 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1016
1030
|
return idx !== -1 && idx <= this.end - target.length;
|
|
1017
1031
|
}
|
|
1018
1032
|
|
|
1033
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1034
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1035
|
+
hasAnyString(prefilter, pos) {
|
|
1036
|
+
if (!prefilter.ac8) return false;
|
|
1037
|
+
return prefilter.ac8.searchUTF8(this.bytes, this.start + pos, this.end);
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1019
1040
|
// Returns the rune at the specified index; the units are
|
|
1020
1041
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1021
1042
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1034,17 +1055,23 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1034
1055
|
return c << 3 | 1;
|
|
1035
1056
|
} else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
|
|
1036
1057
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1058
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1037
1059
|
const rune = (c & 0x1f) << 6 | c1 & 0x3f;
|
|
1038
1060
|
return rune << 3 | 2;
|
|
1039
1061
|
} else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
|
|
1040
1062
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1063
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1041
1064
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1065
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1042
1066
|
const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
|
|
1043
1067
|
return rune << 3 | 3;
|
|
1044
1068
|
} else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
|
|
1045
1069
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1070
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1046
1071
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1072
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1047
1073
|
const c3 = this.bytes[pos + 3] & 0xff;
|
|
1074
|
+
if ((c3 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1048
1075
|
const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
|
|
1049
1076
|
return rune << 3 | 4;
|
|
1050
1077
|
} else {
|
|
@@ -1123,6 +1150,13 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1123
1150
|
return idx !== -1 && idx <= this.end - prefilter.str.length;
|
|
1124
1151
|
}
|
|
1125
1152
|
|
|
1153
|
+
// Executes a high-speed, single - pass search for multiple literal strings
|
|
1154
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1155
|
+
hasAnyString(prefilter, pos) {
|
|
1156
|
+
if (!prefilter.ac16) return false;
|
|
1157
|
+
return prefilter.ac16.searchUTF16(this.charSequence, this.start + pos, this.end);
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1126
1160
|
// Returns the rune at the specified index; the units are
|
|
1127
1161
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1128
1162
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1568,7 +1602,15 @@ class Matcher {
|
|
|
1568
1602
|
if (this.hasMatch) {
|
|
1569
1603
|
start = this.groups[1];
|
|
1570
1604
|
if (this.groups[0] === this.groups[1]) {
|
|
1571
|
-
|
|
1605
|
+
// Safely calculate structural encoding width to avoid sequence corruption
|
|
1606
|
+
const machineInput = this.matcherInput.isUTF16Encoding() ? MachineInput.fromUTF16(this.matcherInput.asCharSequence(), 0, this.matcherInputLength) : MachineInput.fromUTF8(this.matcherInput.asBytes(), 0, this.matcherInputLength);
|
|
1607
|
+
const r = machineInput.step(start);
|
|
1608
|
+
if (r < 0) {
|
|
1609
|
+
// EOF
|
|
1610
|
+
start++; // Advance past length to force loop exit
|
|
1611
|
+
} else {
|
|
1612
|
+
start += r & 7; // Advance by safely decoded width
|
|
1613
|
+
}
|
|
1572
1614
|
}
|
|
1573
1615
|
}
|
|
1574
1616
|
return this.genMatch(start, RE2Flags.UNANCHORED);
|
|
@@ -1707,6 +1749,8 @@ class Matcher {
|
|
|
1707
1749
|
const groupName = replacement.substring(i + 1, j);
|
|
1708
1750
|
res += this.group(groupName);
|
|
1709
1751
|
last = j + 1;
|
|
1752
|
+
i = j;
|
|
1753
|
+
continue;
|
|
1710
1754
|
}
|
|
1711
1755
|
}
|
|
1712
1756
|
}
|
|
@@ -1786,6 +1830,7 @@ class Matcher {
|
|
|
1786
1830
|
if (j === replacement.length || replacement.codePointAt(j) !== Codepoint.CODES.get('>')) {
|
|
1787
1831
|
res += replacement.substring(i - 1, j + 1);
|
|
1788
1832
|
last = j + 1;
|
|
1833
|
+
i = j;
|
|
1789
1834
|
continue;
|
|
1790
1835
|
}
|
|
1791
1836
|
const groupName = replacement.substring(i + 1, j);
|
|
@@ -1795,6 +1840,8 @@ class Matcher {
|
|
|
1795
1840
|
res += `$<${groupName}>`;
|
|
1796
1841
|
}
|
|
1797
1842
|
last = j + 1;
|
|
1843
|
+
i = j;
|
|
1844
|
+
continue;
|
|
1798
1845
|
}
|
|
1799
1846
|
}
|
|
1800
1847
|
}
|
|
@@ -1919,6 +1966,8 @@ class Inst {
|
|
|
1919
1966
|
return r === r0;
|
|
1920
1967
|
}
|
|
1921
1968
|
const len = this.runes.length;
|
|
1969
|
+
if (len === 0) return false;
|
|
1970
|
+
|
|
1922
1971
|
// If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search
|
|
1923
1972
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1924
1973
|
for (let j = 0; j < len; j += 2) {
|
|
@@ -1932,22 +1981,19 @@ class Inst {
|
|
|
1932
1981
|
return false; // Stop here
|
|
1933
1982
|
}
|
|
1934
1983
|
|
|
1935
|
-
//
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
}
|
|
1945
|
-
lo = m + 1;
|
|
1946
|
-
} else {
|
|
1947
|
-
hi = m;
|
|
1948
|
-
}
|
|
1984
|
+
// Branchless Binary Search (Lower Bound)
|
|
1985
|
+
// Compiles to optimal conditional move (cmov) machine code, preventing
|
|
1986
|
+
// branch mispredictions on large, chaotic Unicode arrays
|
|
1987
|
+
let base = 0;
|
|
1988
|
+
let n = len >> 1;
|
|
1989
|
+
while (n > 1) {
|
|
1990
|
+
const half = n >> 1;
|
|
1991
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
1992
|
+
n -= half;
|
|
1949
1993
|
}
|
|
1950
|
-
|
|
1994
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
1995
|
+
const m = base - 1;
|
|
1996
|
+
return m >= 0 && r <= this.runes[m << 1 | 1];
|
|
1951
1997
|
}
|
|
1952
1998
|
|
|
1953
1999
|
// matchRunePos checks whether the instruction matches (and consumes) r.
|
|
@@ -1962,6 +2008,7 @@ class Inst {
|
|
|
1962
2008
|
return r === r0 ? 0 : -1;
|
|
1963
2009
|
}
|
|
1964
2010
|
const len = this.runes.length;
|
|
2011
|
+
if (len === 0) return -1;
|
|
1965
2012
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1966
2013
|
for (let j = 0; j < len; j += 2) {
|
|
1967
2014
|
if (r < this.runes[j]) return -1;
|
|
@@ -1969,19 +2016,18 @@ class Inst {
|
|
|
1969
2016
|
}
|
|
1970
2017
|
return -1;
|
|
1971
2018
|
}
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
} else {
|
|
1981
|
-
hi = m;
|
|
1982
|
-
}
|
|
2019
|
+
|
|
2020
|
+
// Branchless Binary Search (Lower Bound)
|
|
2021
|
+
let base = 0;
|
|
2022
|
+
let n = len >> 1;
|
|
2023
|
+
while (n > 1) {
|
|
2024
|
+
const half = n >> 1;
|
|
2025
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
2026
|
+
n -= half;
|
|
1983
2027
|
}
|
|
1984
|
-
|
|
2028
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2029
|
+
const m = base - 1;
|
|
2030
|
+
return m >= 0 && r <= this.runes[m << 1 | 1] ? m : -1;
|
|
1985
2031
|
}
|
|
1986
2032
|
/**
|
|
1987
2033
|
*
|
|
@@ -2080,6 +2126,7 @@ class Queue {
|
|
|
2080
2126
|
//
|
|
2081
2127
|
// Called by RE2.doExecute.
|
|
2082
2128
|
class Machine {
|
|
2129
|
+
static THREADS_CHUNK_SIZE = 128;
|
|
2083
2130
|
static fromRE2(re2) {
|
|
2084
2131
|
const m = new Machine();
|
|
2085
2132
|
m.prog = re2.prog;
|
|
@@ -2120,15 +2167,15 @@ class Machine {
|
|
|
2120
2167
|
resetCap() {
|
|
2121
2168
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2122
2169
|
const t = this.pool[i];
|
|
2123
|
-
t.cap.fill(
|
|
2170
|
+
t.cap.fill(-1);
|
|
2124
2171
|
}
|
|
2125
2172
|
}
|
|
2126
2173
|
initNewCap(ncap) {
|
|
2127
2174
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2128
2175
|
const t = this.pool[i];
|
|
2129
|
-
t.cap = new Int32Array(ncap);
|
|
2176
|
+
t.cap = new Int32Array(ncap).fill(-1);
|
|
2130
2177
|
}
|
|
2131
|
-
this.matchcap = new Int32Array(ncap);
|
|
2178
|
+
this.matchcap = new Int32Array(ncap).fill(-1);
|
|
2132
2179
|
}
|
|
2133
2180
|
submatches() {
|
|
2134
2181
|
if (this.ncap === 0) {
|
|
@@ -2141,14 +2188,21 @@ class Machine {
|
|
|
2141
2188
|
// alloc() allocates a new thread with the given instruction.
|
|
2142
2189
|
// It uses the free pool if possible.
|
|
2143
2190
|
alloc(inst) {
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2191
|
+
if (this.poolSize === 0) {
|
|
2192
|
+
const capLen = this.matchcap.length;
|
|
2193
|
+
|
|
2194
|
+
// Bulk allocate threads in a tight loop so the V8 engine
|
|
2195
|
+
// places them adjacently in the young generation heap
|
|
2196
|
+
for (let i = 0; i < Machine.THREADS_CHUNK_SIZE; i++) {
|
|
2197
|
+
const t = new Thread();
|
|
2198
|
+
t.cap = new Int32Array(capLen);
|
|
2199
|
+
this.pool[this.poolSize++] = t;
|
|
2200
|
+
}
|
|
2151
2201
|
}
|
|
2202
|
+
|
|
2203
|
+
// Pop a thread from the top of the pool stack
|
|
2204
|
+
this.poolSize--;
|
|
2205
|
+
const t = this.pool[this.poolSize];
|
|
2152
2206
|
t.inst = inst;
|
|
2153
2207
|
return t;
|
|
2154
2208
|
}
|
|
@@ -2201,6 +2255,9 @@ class Machine {
|
|
|
2201
2255
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
|
|
2202
2256
|
break;
|
|
2203
2257
|
}
|
|
2258
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2259
|
+
break;
|
|
2260
|
+
}
|
|
2204
2261
|
if (this.matched) {
|
|
2205
2262
|
break;
|
|
2206
2263
|
}
|
|
@@ -2278,6 +2335,9 @@ class Machine {
|
|
|
2278
2335
|
while (true) {
|
|
2279
2336
|
if (runq.isEmpty()) {
|
|
2280
2337
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
|
|
2338
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2339
|
+
break;
|
|
2340
|
+
}
|
|
2281
2341
|
}
|
|
2282
2342
|
if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2283
2343
|
// Spawn Lookbehind threads BEFORE the main pattern
|
|
@@ -2393,78 +2453,83 @@ class Machine {
|
|
|
2393
2453
|
runq.clear();
|
|
2394
2454
|
}
|
|
2395
2455
|
add(q, pc, pos, cap, cond, t) {
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
t = this.add(q, inst.arg, pos, cap, cond, t);
|
|
2411
|
-
break;
|
|
2412
|
-
case Inst.EMPTY_WIDTH:
|
|
2413
|
-
if ((inst.arg & ~cond) === 0) {
|
|
2414
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2415
|
-
}
|
|
2416
|
-
break;
|
|
2417
|
-
case Inst.NOP:
|
|
2418
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2419
|
-
break;
|
|
2420
|
-
case Inst.CAPTURE:
|
|
2421
|
-
if (inst.arg < this.ncap) {
|
|
2422
|
-
const opos = cap[inst.arg];
|
|
2423
|
-
cap[inst.arg] = pos;
|
|
2424
|
-
this.add(q, inst.out, pos, cap, cond, null);
|
|
2425
|
-
cap[inst.arg] = opos;
|
|
2426
|
-
} else {
|
|
2456
|
+
while (true) {
|
|
2457
|
+
if (pc === 0) {
|
|
2458
|
+
return t;
|
|
2459
|
+
}
|
|
2460
|
+
if (q.contains(pc)) {
|
|
2461
|
+
return t;
|
|
2462
|
+
}
|
|
2463
|
+
const d = q.add(pc);
|
|
2464
|
+
const inst = this.prog.inst[pc];
|
|
2465
|
+
switch (inst.op) {
|
|
2466
|
+
case Inst.FAIL:
|
|
2467
|
+
return t;
|
|
2468
|
+
case Inst.ALT:
|
|
2469
|
+
case Inst.ALT_MATCH:
|
|
2427
2470
|
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
case Inst.LB_CHECK:
|
|
2435
|
-
if (inst.lb > 0) {
|
|
2436
|
-
// Positive Lookbehind
|
|
2437
|
-
if (this.lbTable[inst.lb] === pos) {
|
|
2438
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2471
|
+
pc = inst.arg; // Flattened tail recursion
|
|
2472
|
+
continue;
|
|
2473
|
+
case Inst.EMPTY_WIDTH:
|
|
2474
|
+
if ((inst.arg & ~cond) === 0) {
|
|
2475
|
+
pc = inst.out; // Flattened tail recursion
|
|
2476
|
+
continue;
|
|
2439
2477
|
}
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
}
|
|
2455
|
-
if (this.ncap > 0 && t.cap !== cap) {
|
|
2456
|
-
// Direct assignment utilizing Typed Array performance
|
|
2457
|
-
for (let c = 0; c < this.ncap; c++) {
|
|
2458
|
-
t.cap[c] = cap[c];
|
|
2478
|
+
return t;
|
|
2479
|
+
case Inst.NOP:
|
|
2480
|
+
pc = inst.out; // Flattened tail recursion
|
|
2481
|
+
continue;
|
|
2482
|
+
case Inst.CAPTURE:
|
|
2483
|
+
if (inst.arg < this.ncap) {
|
|
2484
|
+
const opos = cap[inst.arg];
|
|
2485
|
+
cap[inst.arg] = pos;
|
|
2486
|
+
this.add(q, inst.out, pos, cap, cond, null);
|
|
2487
|
+
cap[inst.arg] = opos;
|
|
2488
|
+
return t;
|
|
2489
|
+
} else {
|
|
2490
|
+
pc = inst.out; // Flattened tail recursion
|
|
2491
|
+
continue;
|
|
2459
2492
|
}
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2493
|
+
case Inst.LB_WRITE:
|
|
2494
|
+
this.lbTable[Math.abs(inst.lb)] = pos;
|
|
2495
|
+
pc = inst.out;
|
|
2496
|
+
continue;
|
|
2497
|
+
case Inst.LB_CHECK:
|
|
2498
|
+
if (inst.lb > 0) {
|
|
2499
|
+
// Positive Lookbehind
|
|
2500
|
+
if (this.lbTable[inst.lb] === pos) {
|
|
2501
|
+
pc = inst.out; // Flattened tail recursion
|
|
2502
|
+
continue;
|
|
2503
|
+
}
|
|
2504
|
+
} else if (this.lbTable[-inst.lb] !== pos) {
|
|
2505
|
+
// Negative Lookbehind
|
|
2506
|
+
pc = inst.out; // Flattened tail recursion
|
|
2507
|
+
continue;
|
|
2508
|
+
}
|
|
2509
|
+
return t;
|
|
2510
|
+
case Inst.MATCH:
|
|
2511
|
+
case Inst.RUNE:
|
|
2512
|
+
case Inst.RUNE1:
|
|
2513
|
+
case Inst.RUNE_ANY:
|
|
2514
|
+
case Inst.RUNE_ANY_NOT_NL:
|
|
2515
|
+
if (t === null) {
|
|
2516
|
+
t = this.alloc(inst);
|
|
2517
|
+
} else {
|
|
2518
|
+
t.inst = inst;
|
|
2519
|
+
}
|
|
2520
|
+
if (this.ncap > 0 && t.cap !== cap) {
|
|
2521
|
+
// Direct assignment utilizing Typed Array performance
|
|
2522
|
+
for (let c = 0; c < this.ncap; c++) {
|
|
2523
|
+
t.cap[c] = cap[c];
|
|
2524
|
+
}
|
|
2525
|
+
}
|
|
2526
|
+
q.denseThreads[d] = t;
|
|
2527
|
+
t = null;
|
|
2528
|
+
return t;
|
|
2529
|
+
default:
|
|
2530
|
+
throw new Error('unhandled');
|
|
2531
|
+
}
|
|
2466
2532
|
}
|
|
2467
|
-
return t;
|
|
2468
2533
|
}
|
|
2469
2534
|
}
|
|
2470
2535
|
|
|
@@ -2492,8 +2557,15 @@ class DFAState {
|
|
|
2492
2557
|
this.nfaStates = nfaStates; // Int32Array of Instruction PCs
|
|
2493
2558
|
this.isMatch = isMatch; // Boolean
|
|
2494
2559
|
this.matchIDs = matchIDs; // Array of integers indicating which Set patterns matched
|
|
2495
|
-
|
|
2496
|
-
|
|
2560
|
+
|
|
2561
|
+
// Latin-1 (Unicode.MAX_LATIN1 + 1) flat arrays for blisteringly fast O(1) lookups
|
|
2562
|
+
// completely covering standard English, European languages, and 1-byte encodings.
|
|
2563
|
+
this.nextLatin1 = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2564
|
+
this.nextLatin1Anchored = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2565
|
+
// 2 arrays used as hash map for V8 optimization (N is small number, so O(n) faster than Map O(1))
|
|
2566
|
+
this.transKeys = [];
|
|
2567
|
+
this.transVals = [];
|
|
2568
|
+
this.lastSeen = 0; // Track when this state was last used for LRU eviction
|
|
2497
2569
|
}
|
|
2498
2570
|
}
|
|
2499
2571
|
class DFA {
|
|
@@ -2506,6 +2578,7 @@ class DFA {
|
|
|
2506
2578
|
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2507
2579
|
this.cacheClears = 0; // Track thrashing
|
|
2508
2580
|
this.failed = false; // mark if DFA cannot work with provided prog
|
|
2581
|
+
this.clock = 0; // Global clock for LRU eviction
|
|
2509
2582
|
}
|
|
2510
2583
|
|
|
2511
2584
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2565,6 +2638,7 @@ class DFA {
|
|
|
2565
2638
|
for (let i = 0; i < bucket.length; i++) {
|
|
2566
2639
|
const state = bucket[i];
|
|
2567
2640
|
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2641
|
+
state.lastSeen = ++this.clock;
|
|
2568
2642
|
return state;
|
|
2569
2643
|
}
|
|
2570
2644
|
}
|
|
@@ -2577,40 +2651,99 @@ class DFA {
|
|
|
2577
2651
|
if (this.failed) return null;
|
|
2578
2652
|
|
|
2579
2653
|
// Safety: prevent memory exhaustion from state explosion
|
|
2580
|
-
// We
|
|
2654
|
+
// We prune the cache to keep the newest 50%
|
|
2581
2655
|
if (this.stateCount >= this.stateLimit) {
|
|
2582
|
-
this.stateCache.clear();
|
|
2583
|
-
this.stateCount = 0;
|
|
2584
|
-
this.startState = null;
|
|
2585
2656
|
this.cacheClears++;
|
|
2586
2657
|
|
|
2587
2658
|
// If this regex causes continuous cache thrashing, permanently fall back to NFA
|
|
2588
2659
|
// to avoid spending CPU cycles constantly rebuilding the DFA tree.
|
|
2589
2660
|
if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
|
|
2590
2661
|
this.failed = true;
|
|
2662
|
+
this.stateCache.clear();
|
|
2663
|
+
this.stateCount = 0;
|
|
2664
|
+
this.startState = null;
|
|
2665
|
+
return null;
|
|
2666
|
+
}
|
|
2667
|
+
this.evictCache();
|
|
2668
|
+
|
|
2669
|
+
// After eviction, the bucket reference might be stale or empty.
|
|
2670
|
+
// We must re-fetch or re-create the bucket.
|
|
2671
|
+
bucket = this.stateCache.get(hash);
|
|
2672
|
+
if (!bucket) {
|
|
2673
|
+
bucket = [];
|
|
2674
|
+
this.stateCache.set(hash, bucket);
|
|
2591
2675
|
}
|
|
2592
|
-
return null;
|
|
2593
2676
|
}
|
|
2594
2677
|
|
|
2595
2678
|
// State not found, create it and add to bucket
|
|
2596
2679
|
const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.matchIDs);
|
|
2680
|
+
state.lastSeen = ++this.clock;
|
|
2597
2681
|
bucket.push(state);
|
|
2598
2682
|
this.stateCount++;
|
|
2599
2683
|
return state;
|
|
2600
2684
|
}
|
|
2685
|
+
evictCache() {
|
|
2686
|
+
const allStates = [];
|
|
2687
|
+
for (const bucket of this.stateCache.values()) {
|
|
2688
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2689
|
+
allStates.push(bucket[i]);
|
|
2690
|
+
}
|
|
2691
|
+
}
|
|
2692
|
+
|
|
2693
|
+
// Sort ascending by lastSeen (oldest first)
|
|
2694
|
+
allStates.sort((a, b) => a.lastSeen - b.lastSeen);
|
|
2695
|
+
|
|
2696
|
+
// Keep the newest 50%
|
|
2697
|
+
const keepCount = Math.max(1, Math.floor(this.stateLimit / 2));
|
|
2698
|
+
const startIndex = allStates.length - keepCount;
|
|
2699
|
+
const survivorsArray = allStates.slice(startIndex);
|
|
2700
|
+
const survivors = new Set(survivorsArray);
|
|
2701
|
+
this.stateCache.clear();
|
|
2702
|
+
this.stateCount = 0;
|
|
2703
|
+
for (let i = 0; i < survivorsArray.length; i++) {
|
|
2704
|
+
const state = survivorsArray[i];
|
|
2705
|
+
|
|
2706
|
+
// Sever ties to all states to prevent memory leaks and dangling pointers
|
|
2707
|
+
state.nextLatin1.fill(null);
|
|
2708
|
+
state.nextLatin1Anchored.fill(null);
|
|
2709
|
+
// zero-allocation cleanup
|
|
2710
|
+
state.transKeys.length = 0;
|
|
2711
|
+
state.transVals.length = 0;
|
|
2712
|
+
const hash = hashPCs(state.nfaStates);
|
|
2713
|
+
let bucket = this.stateCache.get(hash);
|
|
2714
|
+
if (!bucket) {
|
|
2715
|
+
bucket = [];
|
|
2716
|
+
this.stateCache.set(hash, bucket);
|
|
2717
|
+
}
|
|
2718
|
+
bucket.push(state);
|
|
2719
|
+
this.stateCount++;
|
|
2720
|
+
}
|
|
2721
|
+
|
|
2722
|
+
// Start state must either be preserved or nullified so it gets re-created
|
|
2723
|
+
if (this.startState && !survivors.has(this.startState)) {
|
|
2724
|
+
this.startState = null;
|
|
2725
|
+
}
|
|
2726
|
+
}
|
|
2601
2727
|
|
|
2602
2728
|
// Compute the next DFA state given a current state and a character
|
|
2603
2729
|
step(state, charCode, anchor) {
|
|
2604
|
-
// OPTIMIZATION:
|
|
2605
|
-
if (
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
return next;
|
|
2730
|
+
// OPTIMIZATION: Latin-1 Array Fast-Path
|
|
2731
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2732
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2733
|
+
const next = state.nextLatin1[charCode];
|
|
2734
|
+
if (next !== null) return next;
|
|
2735
|
+
} else {
|
|
2736
|
+
const next = state.nextLatin1Anchored[charCode];
|
|
2737
|
+
if (next !== null) return next;
|
|
2609
2738
|
}
|
|
2610
2739
|
} else {
|
|
2740
|
+
// Dense Array Linear Search fallback for Runes > 255
|
|
2611
2741
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2612
|
-
|
|
2613
|
-
|
|
2742
|
+
// get [key] -> nextState
|
|
2743
|
+
const keys = state.transKeys;
|
|
2744
|
+
const len = keys.length;
|
|
2745
|
+
for (let i = 0; i < len; i++) {
|
|
2746
|
+
if (keys[i] === key) return state.transVals[i];
|
|
2614
2747
|
}
|
|
2615
2748
|
}
|
|
2616
2749
|
const nextPCs = [];
|
|
@@ -2627,11 +2760,17 @@ class DFA {
|
|
|
2627
2760
|
const nextState = this.getState(nextPCs);
|
|
2628
2761
|
|
|
2629
2762
|
// Cache the result
|
|
2630
|
-
if (
|
|
2631
|
-
|
|
2763
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2764
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2765
|
+
state.nextLatin1[charCode] = nextState;
|
|
2766
|
+
} else {
|
|
2767
|
+
state.nextLatin1Anchored[charCode] = nextState;
|
|
2768
|
+
}
|
|
2632
2769
|
} else {
|
|
2633
2770
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2634
|
-
|
|
2771
|
+
// store key -> nextState
|
|
2772
|
+
state.transKeys.push(key);
|
|
2773
|
+
state.transVals.push(nextState);
|
|
2635
2774
|
}
|
|
2636
2775
|
return nextState;
|
|
2637
2776
|
}
|
|
@@ -2664,10 +2803,11 @@ class DFA {
|
|
|
2664
2803
|
if (width === 0) {
|
|
2665
2804
|
break;
|
|
2666
2805
|
}
|
|
2667
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2806
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2668
2807
|
|
|
2669
2808
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
2670
2809
|
if (currentState === null) return null;
|
|
2810
|
+
currentState.lastSeen = ++this.clock;
|
|
2671
2811
|
if (currentState.isMatch) {
|
|
2672
2812
|
if (anchor === RE2Flags.ANCHOR_BOTH) {
|
|
2673
2813
|
if (i + width === endPos) return true;
|
|
@@ -2715,9 +2855,10 @@ class DFA {
|
|
|
2715
2855
|
const rune = r >> 3;
|
|
2716
2856
|
const width = r & 7;
|
|
2717
2857
|
if (width === 0) break;
|
|
2718
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2858
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2719
2859
|
if (currentState === null) return null; // Bailout to NFA
|
|
2720
2860
|
|
|
2861
|
+
currentState.lastSeen = ++this.clock;
|
|
2721
2862
|
i += width;
|
|
2722
2863
|
checkMatch(currentState, i);
|
|
2723
2864
|
if (currentState.nfaStates.length === 0) {
|
|
@@ -2755,7 +2896,7 @@ class BitState {
|
|
|
2755
2896
|
// Bitwise shift (>>> 5) instead of Math.floor( / 32)
|
|
2756
2897
|
const visitedSize = prog.numInst() * (end + 1) + VISITED_BITS - 1 >>> 5;
|
|
2757
2898
|
if (this.visited.length < visitedSize) {
|
|
2758
|
-
this.visited = new Uint32Array(
|
|
2899
|
+
this.visited = new Uint32Array(visitedSize);
|
|
2759
2900
|
} else {
|
|
2760
2901
|
this.visited.fill(0, 0, visitedSize);
|
|
2761
2902
|
}
|
|
@@ -2841,11 +2982,12 @@ class BitState {
|
|
|
2841
2982
|
const outInst = re2.prog.getInst(inst.out);
|
|
2842
2983
|
if (Inst.isRuneOp(outInst.op)) {
|
|
2843
2984
|
this.push(re2, inst.arg, currentPos, false);
|
|
2844
|
-
currentPc = inst.
|
|
2985
|
+
currentPc = inst.arg;
|
|
2986
|
+
currentPos = this.end;
|
|
2845
2987
|
continue;
|
|
2846
2988
|
}
|
|
2847
2989
|
this.push(re2, inst.out, this.end, false);
|
|
2848
|
-
currentPc = inst.
|
|
2990
|
+
currentPc = inst.out;
|
|
2849
2991
|
continue;
|
|
2850
2992
|
}
|
|
2851
2993
|
case Inst.RUNE:
|
|
@@ -2926,6 +3068,11 @@ class BitState {
|
|
|
2926
3068
|
if (currentPos === this.end) return true;
|
|
2927
3069
|
break;
|
|
2928
3070
|
}
|
|
3071
|
+
case Inst.LB_WRITE:
|
|
3072
|
+
case Inst.LB_CHECK:
|
|
3073
|
+
{
|
|
3074
|
+
throw new RE2JSInternalException('Backtracker cannot evaluate Lookbehind instructions');
|
|
3075
|
+
}
|
|
2929
3076
|
default:
|
|
2930
3077
|
{
|
|
2931
3078
|
throw new RE2JSInternalException('bad inst');
|
|
@@ -3194,7 +3341,9 @@ const makeOnePass = p => {
|
|
|
3194
3341
|
}
|
|
3195
3342
|
runes.sort((a, b) => a - b);
|
|
3196
3343
|
} else {
|
|
3197
|
-
|
|
3344
|
+
for (let j = 0; j < inst.runes.length; j++) {
|
|
3345
|
+
runes.push(inst.runes[j]);
|
|
3346
|
+
}
|
|
3198
3347
|
}
|
|
3199
3348
|
onePassRunes[pc] = runes;
|
|
3200
3349
|
inst.next = new Uint32Array(Math.floor(runes.length / 2) + 1).fill(inst.out);
|
|
@@ -3362,6 +3511,10 @@ class OnePass {
|
|
|
3362
3511
|
switch (inst.op) {
|
|
3363
3512
|
case Inst.MATCH:
|
|
3364
3513
|
{
|
|
3514
|
+
// Verify ANCHOR_BOTH constraint before accepting the match
|
|
3515
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) {
|
|
3516
|
+
return null;
|
|
3517
|
+
}
|
|
3365
3518
|
matched = true;
|
|
3366
3519
|
if (matchcap.length > 0) {
|
|
3367
3520
|
matchcap[0] = 0;
|
|
@@ -3792,6 +3945,88 @@ class Regexp {
|
|
|
3792
3945
|
}
|
|
3793
3946
|
}
|
|
3794
3947
|
|
|
3948
|
+
// High-speed, single-pass Aho-Corasick string matcher optimized for V8.
|
|
3949
|
+
// Builds a trie with failure links to search for multiple prefixes simultaneously.
|
|
3950
|
+
class AhoCorasick {
|
|
3951
|
+
constructor(wordArrays) {
|
|
3952
|
+
this.next = [Object.create(null)];
|
|
3953
|
+
this.fail = [0];
|
|
3954
|
+
this.match = [false];
|
|
3955
|
+
|
|
3956
|
+
// Build Trie
|
|
3957
|
+
for (const word of wordArrays) {
|
|
3958
|
+
let node = 0;
|
|
3959
|
+
for (let i = 0; i < word.length; i++) {
|
|
3960
|
+
const val = word[i];
|
|
3961
|
+
if (!(val in this.next[node])) {
|
|
3962
|
+
this.next.push(Object.create(null));
|
|
3963
|
+
this.fail.push(0);
|
|
3964
|
+
this.match.push(false);
|
|
3965
|
+
this.next[node][val] = this.next.length - 1;
|
|
3966
|
+
}
|
|
3967
|
+
node = this.next[node][val];
|
|
3968
|
+
}
|
|
3969
|
+
this.match[node] = true;
|
|
3970
|
+
}
|
|
3971
|
+
|
|
3972
|
+
// Build Failure Links (BFS)
|
|
3973
|
+
const queue = [];
|
|
3974
|
+
for (const val in this.next[0]) {
|
|
3975
|
+
if (Object.prototype.hasOwnProperty.call(this.next[0], val)) {
|
|
3976
|
+
const child = this.next[0][val];
|
|
3977
|
+
this.fail[child] = 0;
|
|
3978
|
+
queue.push(child);
|
|
3979
|
+
}
|
|
3980
|
+
}
|
|
3981
|
+
while (queue.length > 0) {
|
|
3982
|
+
const curr = queue.shift();
|
|
3983
|
+
for (const val in this.next[curr]) {
|
|
3984
|
+
if (Object.prototype.hasOwnProperty.call(this.next[curr], val)) {
|
|
3985
|
+
const child = this.next[curr][val];
|
|
3986
|
+
let failNode = this.fail[curr];
|
|
3987
|
+
while (failNode !== 0 && !(val in this.next[failNode])) {
|
|
3988
|
+
failNode = this.fail[failNode];
|
|
3989
|
+
}
|
|
3990
|
+
if (val in this.next[failNode]) {
|
|
3991
|
+
this.fail[child] = this.next[failNode][val];
|
|
3992
|
+
} else {
|
|
3993
|
+
this.fail[child] = 0;
|
|
3994
|
+
}
|
|
3995
|
+
this.match[child] = this.match[child] || this.match[this.fail[child]];
|
|
3996
|
+
queue.push(child);
|
|
3997
|
+
}
|
|
3998
|
+
}
|
|
3999
|
+
}
|
|
4000
|
+
}
|
|
4001
|
+
searchUTF16(charSeq, start, end) {
|
|
4002
|
+
let node = 0;
|
|
4003
|
+
for (let i = start; i < end; i++) {
|
|
4004
|
+
const val = charSeq.charCodeAt(i);
|
|
4005
|
+
while (node !== 0 && !(val in this.next[node])) {
|
|
4006
|
+
node = this.fail[node];
|
|
4007
|
+
}
|
|
4008
|
+
if (val in this.next[node]) {
|
|
4009
|
+
node = this.next[node][val];
|
|
4010
|
+
}
|
|
4011
|
+
if (this.match[node]) return true;
|
|
4012
|
+
}
|
|
4013
|
+
return false;
|
|
4014
|
+
}
|
|
4015
|
+
searchUTF8(bytes, start, end) {
|
|
4016
|
+
let node = 0;
|
|
4017
|
+
for (let i = start; i < end; i++) {
|
|
4018
|
+
const val = bytes[i];
|
|
4019
|
+
while (node !== 0 && !(val in this.next[node])) {
|
|
4020
|
+
node = this.fail[node];
|
|
4021
|
+
}
|
|
4022
|
+
if (val in this.next[node]) {
|
|
4023
|
+
node = this.next[node][val];
|
|
4024
|
+
}
|
|
4025
|
+
if (this.match[node]) return true;
|
|
4026
|
+
}
|
|
4027
|
+
return false;
|
|
4028
|
+
}
|
|
4029
|
+
}
|
|
3795
4030
|
class Prefilter {
|
|
3796
4031
|
static Type = {
|
|
3797
4032
|
NONE: 0,
|
|
@@ -3804,6 +4039,8 @@ class Prefilter {
|
|
|
3804
4039
|
this.subs = [];
|
|
3805
4040
|
this.str = '';
|
|
3806
4041
|
this.bytes = null;
|
|
4042
|
+
this.ac16 = null;
|
|
4043
|
+
this.ac8 = null;
|
|
3807
4044
|
}
|
|
3808
4045
|
eval(input, pos) {
|
|
3809
4046
|
switch (this.type) {
|
|
@@ -3817,6 +4054,10 @@ class Prefilter {
|
|
|
3817
4054
|
}
|
|
3818
4055
|
return true;
|
|
3819
4056
|
case Prefilter.Type.OR:
|
|
4057
|
+
// Exploit Aho-Corasick if it was successfully built
|
|
4058
|
+
if (this.ac16 && this.ac8) {
|
|
4059
|
+
return input.hasAnyString(this, pos);
|
|
4060
|
+
}
|
|
3820
4061
|
for (let i = 0; i < this.subs.length; i++) {
|
|
3821
4062
|
if (this.subs[i].eval(input, pos)) return true;
|
|
3822
4063
|
}
|
|
@@ -3907,7 +4148,9 @@ class PrefilterTree {
|
|
|
3907
4148
|
const s = PrefilterTree.simplify(sub);
|
|
3908
4149
|
if (s.type !== Prefilter.Type.NONE) {
|
|
3909
4150
|
if (s.type === Prefilter.Type.AND) {
|
|
3910
|
-
|
|
4151
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4152
|
+
newSubs.push(s.subs[j]);
|
|
4153
|
+
}
|
|
3911
4154
|
} else {
|
|
3912
4155
|
newSubs.push(s);
|
|
3913
4156
|
}
|
|
@@ -3927,7 +4170,9 @@ class PrefilterTree {
|
|
|
3927
4170
|
return new Prefilter(Prefilter.Type.NONE);
|
|
3928
4171
|
}
|
|
3929
4172
|
if (s.type === Prefilter.Type.OR) {
|
|
3930
|
-
|
|
4173
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4174
|
+
newSubs.push(s.subs[j]);
|
|
4175
|
+
}
|
|
3931
4176
|
} else {
|
|
3932
4177
|
newSubs.push(s);
|
|
3933
4178
|
}
|
|
@@ -3949,6 +4194,27 @@ class PrefilterTree {
|
|
|
3949
4194
|
}
|
|
3950
4195
|
}
|
|
3951
4196
|
pf.subs = uniqueSubs;
|
|
4197
|
+
|
|
4198
|
+
// Build an Aho-Corasick automaton if all children are exact matches
|
|
4199
|
+
let allExact = true;
|
|
4200
|
+
for (const sub of uniqueSubs) {
|
|
4201
|
+
if (sub.type !== Prefilter.Type.EXACT) {
|
|
4202
|
+
allExact = false;
|
|
4203
|
+
break;
|
|
4204
|
+
}
|
|
4205
|
+
}
|
|
4206
|
+
if (allExact && uniqueSubs.length > 1) {
|
|
4207
|
+
const words16 = uniqueSubs.map(s => {
|
|
4208
|
+
const arr = [];
|
|
4209
|
+
for (let i = 0; i < s.str.length; i++) {
|
|
4210
|
+
arr.push(s.str.charCodeAt(i));
|
|
4211
|
+
}
|
|
4212
|
+
return arr;
|
|
4213
|
+
});
|
|
4214
|
+
pf.ac16 = new AhoCorasick(words16);
|
|
4215
|
+
const words8 = uniqueSubs.map(s => s.bytes);
|
|
4216
|
+
pf.ac8 = new AhoCorasick(words8);
|
|
4217
|
+
}
|
|
3952
4218
|
return pf;
|
|
3953
4219
|
}
|
|
3954
4220
|
return pf;
|
|
@@ -4475,7 +4741,9 @@ class Simplify {
|
|
|
4475
4741
|
// Flatten nested concatenations
|
|
4476
4742
|
if (nsub.op === Regexp.Op.CONCAT) {
|
|
4477
4743
|
changed = true;
|
|
4478
|
-
|
|
4744
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4745
|
+
newSubs.push(nsub.subs[j]);
|
|
4746
|
+
}
|
|
4479
4747
|
continue;
|
|
4480
4748
|
}
|
|
4481
4749
|
} else if (re.op === Regexp.Op.ALTERNATE) {
|
|
@@ -4487,7 +4755,9 @@ class Simplify {
|
|
|
4487
4755
|
// Flatten nested alternations
|
|
4488
4756
|
if (nsub.op === Regexp.Op.ALTERNATE) {
|
|
4489
4757
|
changed = true;
|
|
4490
|
-
|
|
4758
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4759
|
+
newSubs.push(nsub.subs[j]);
|
|
4760
|
+
}
|
|
4491
4761
|
continue;
|
|
4492
4762
|
}
|
|
4493
4763
|
}
|
|
@@ -5497,7 +5767,10 @@ class Parser {
|
|
|
5497
5767
|
return t.pop();
|
|
5498
5768
|
}
|
|
5499
5769
|
static concatRunes(x, y) {
|
|
5500
|
-
|
|
5770
|
+
for (let i = 0; i < y.length; i++) {
|
|
5771
|
+
x.push(y[i]);
|
|
5772
|
+
}
|
|
5773
|
+
return x;
|
|
5501
5774
|
}
|
|
5502
5775
|
constructor(wholeRegexp, flags = 0) {
|
|
5503
5776
|
this.wholeRegexp = wholeRegexp;
|
|
@@ -5533,8 +5806,8 @@ class Parser {
|
|
|
5533
5806
|
return re;
|
|
5534
5807
|
}
|
|
5535
5808
|
reuse(re) {
|
|
5536
|
-
if (this.height !== null &&
|
|
5537
|
-
|
|
5809
|
+
if (this.height !== null && this.height.has(re)) {
|
|
5810
|
+
this.height.delete(re);
|
|
5538
5811
|
}
|
|
5539
5812
|
if (re.subs !== null && re.subs.length > 0) {
|
|
5540
5813
|
re.subs[0] = this.free;
|
|
@@ -5566,20 +5839,20 @@ class Parser {
|
|
|
5566
5839
|
if (n <= 0) {
|
|
5567
5840
|
n = 1;
|
|
5568
5841
|
}
|
|
5569
|
-
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
5842
|
+
if (n > Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5570
5843
|
this.repeats = Parser.MAX_SIZE;
|
|
5571
5844
|
} else {
|
|
5572
5845
|
this.repeats *= n;
|
|
5573
5846
|
}
|
|
5574
5847
|
}
|
|
5575
|
-
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
5848
|
+
if (this.numRegexp < Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5576
5849
|
return;
|
|
5577
5850
|
}
|
|
5578
5851
|
|
|
5579
5852
|
// We need to start tracking size.
|
|
5580
5853
|
// Make the map and belatedly populate it
|
|
5581
5854
|
// with info about everything we've constructed so far.
|
|
5582
|
-
this.size =
|
|
5855
|
+
this.size = new Map();
|
|
5583
5856
|
for (let reEx of this.stack) {
|
|
5584
5857
|
this.checkSize(reEx);
|
|
5585
5858
|
}
|
|
@@ -5590,8 +5863,8 @@ class Parser {
|
|
|
5590
5863
|
}
|
|
5591
5864
|
calcSize(re, force = false) {
|
|
5592
5865
|
if (!force && this.size !== null) {
|
|
5593
|
-
if (
|
|
5594
|
-
return this.size
|
|
5866
|
+
if (this.size.has(re)) {
|
|
5867
|
+
return this.size.get(re);
|
|
5595
5868
|
}
|
|
5596
5869
|
}
|
|
5597
5870
|
let size = 0;
|
|
@@ -5651,9 +5924,9 @@ class Parser {
|
|
|
5651
5924
|
}
|
|
5652
5925
|
size = Math.max(1, size);
|
|
5653
5926
|
if (this.size === null) {
|
|
5654
|
-
this.size =
|
|
5927
|
+
this.size = new Map();
|
|
5655
5928
|
}
|
|
5656
|
-
this.size
|
|
5929
|
+
this.size.set(re, size);
|
|
5657
5930
|
return size;
|
|
5658
5931
|
}
|
|
5659
5932
|
checkHeight(re) {
|
|
@@ -5661,7 +5934,7 @@ class Parser {
|
|
|
5661
5934
|
return;
|
|
5662
5935
|
}
|
|
5663
5936
|
if (this.height === null) {
|
|
5664
|
-
this.height =
|
|
5937
|
+
this.height = new Map();
|
|
5665
5938
|
for (let reEx of this.stack) {
|
|
5666
5939
|
this.checkHeight(reEx);
|
|
5667
5940
|
}
|
|
@@ -5672,8 +5945,8 @@ class Parser {
|
|
|
5672
5945
|
}
|
|
5673
5946
|
calcHeight(re, force = false) {
|
|
5674
5947
|
if (!force && this.height !== null) {
|
|
5675
|
-
if (
|
|
5676
|
-
return this.height
|
|
5948
|
+
if (this.height.has(re)) {
|
|
5949
|
+
return this.height.get(re);
|
|
5677
5950
|
}
|
|
5678
5951
|
}
|
|
5679
5952
|
let h = 1;
|
|
@@ -5684,9 +5957,9 @@ class Parser {
|
|
|
5684
5957
|
}
|
|
5685
5958
|
}
|
|
5686
5959
|
if (this.height === null) {
|
|
5687
|
-
this.height =
|
|
5960
|
+
this.height = new Map();
|
|
5688
5961
|
}
|
|
5689
|
-
this.height
|
|
5962
|
+
this.height.set(re, h);
|
|
5690
5963
|
return h;
|
|
5691
5964
|
}
|
|
5692
5965
|
|
|
@@ -7808,7 +8081,7 @@ class TranslateRegExpString {
|
|
|
7808
8081
|
changed = true;
|
|
7809
8082
|
continue;
|
|
7810
8083
|
} else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
7811
|
-
if (i + 3
|
|
8084
|
+
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
7812
8085
|
result += '(?P<';
|
|
7813
8086
|
i += 3;
|
|
7814
8087
|
changed = true;
|