re2js 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +54 -54
- package/build/index.cjs.cjs +432 -159
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +3 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +432 -159
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +432 -159
- package/build/index.umd.js.map +1 -1
- package/package.json +7 -7
package/build/index.cjs.cjs
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.0
|
|
6
|
-
* @author
|
|
5
|
+
* @version v2.2.2
|
|
6
|
+
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
9
9
|
* @license MIT
|
|
@@ -78,6 +78,8 @@ const PublicFlags = {
|
|
|
78
78
|
const ASCII_SIZE = 128;
|
|
79
79
|
const ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
|
|
80
80
|
const ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
|
|
81
|
+
// The highest legal Basic Multilingual Plane (BMP) value.
|
|
82
|
+
const MAX_BMP = 0xffff;
|
|
81
83
|
for (let i = 0; i < ASCII_SIZE; i++) {
|
|
82
84
|
if (i >= 97 && i <= 122) {
|
|
83
85
|
// a-z
|
|
@@ -101,11 +103,13 @@ class Codepoint {
|
|
|
101
103
|
static toUpperCase(codepoint) {
|
|
102
104
|
if (codepoint < ASCII_SIZE) return ASCII_TO_UPPER[codepoint];
|
|
103
105
|
const s = String.fromCodePoint(codepoint).toUpperCase();
|
|
104
|
-
|
|
106
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
107
|
+
if (s.length > expectedLen) {
|
|
105
108
|
return codepoint;
|
|
106
109
|
}
|
|
107
110
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toLowerCase();
|
|
108
|
-
|
|
111
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
112
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
109
113
|
return codepoint;
|
|
110
114
|
}
|
|
111
115
|
return s.codePointAt(0);
|
|
@@ -116,11 +120,13 @@ class Codepoint {
|
|
|
116
120
|
static toLowerCase(codepoint) {
|
|
117
121
|
if (codepoint < ASCII_SIZE) return ASCII_TO_LOWER[codepoint];
|
|
118
122
|
const s = String.fromCodePoint(codepoint).toLowerCase();
|
|
119
|
-
|
|
123
|
+
const expectedLen = s.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
124
|
+
if (s.length > expectedLen) {
|
|
120
125
|
return codepoint;
|
|
121
126
|
}
|
|
122
127
|
const sOrigin = String.fromCodePoint(s.codePointAt(0)).toUpperCase();
|
|
123
|
-
|
|
128
|
+
const originExpectedLen = sOrigin.codePointAt(0) > MAX_BMP ? 2 : 1;
|
|
129
|
+
if (sOrigin.length > originExpectedLen || sOrigin.codePointAt(0) !== codepoint) {
|
|
124
130
|
return codepoint;
|
|
125
131
|
}
|
|
126
132
|
return s.codePointAt(0);
|
|
@@ -222,7 +228,7 @@ class UnicodeTables {
|
|
|
222
228
|
static _CASE_ORBIT = null;
|
|
223
229
|
static get CASE_ORBIT() {
|
|
224
230
|
if (!this._CASE_ORBIT) {
|
|
225
|
-
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+
|
|
231
|
+
this._CASE_ORBIT = decodeOrbit('rCrDIzDYqpII-LiC8cQlHa+0HGrpI6EzClClOBmOBkOBoOBpOBnOBrOBsOBqOlByPBzPBxPyK5crCz+HCydD1dD4dB5dB6dC8dEgeBheCieDmeDpeHj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeBjdD1eDmpIHycB0fEmdBgda6cBhdD4cB1cdyhBC0hBK+hBDhiBBiiBIqiBIgkHChkHKikHDjkHBkkHImkHYjjBBnkH9gGygBB0gBB+gBBhhBBlkHBihBBqhBBijBBqypB4OhzHB70H6BgzHD-GiHo8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HBg8HBh8HBi8HBj8HBk8HBl8HBm8HBn8HB48HB58HB68HB78HB88HB98HB+8HB-8HBw8HBx8HBy8HBz8HB08HB18HB28HB38HBo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HBg9HBh9HBi9HBj9HBk9HBl9HBm9HBn9HE89HJz9HClaFs+HJj+HHwcQwdQ8-HJz-HqJpdErCBlG-ohBrypBBokH6lVm4+BBl4+B');
|
|
226
232
|
}
|
|
227
233
|
return this._CASE_ORBIT;
|
|
228
234
|
}
|
|
@@ -594,11 +600,16 @@ class Unicode {
|
|
|
594
600
|
// to compare it to |r2|.
|
|
595
601
|
// -1 is interpreted as the end-of-file mark.
|
|
596
602
|
static equalsIgnoreCase(r1, r2) {
|
|
597
|
-
// Runes already match
|
|
598
|
-
if (r1
|
|
603
|
+
// Runes already match
|
|
604
|
+
if (r1 === r2) {
|
|
599
605
|
return true;
|
|
600
606
|
}
|
|
601
607
|
|
|
608
|
+
// Safely fail if either is EOF (and they didn't explicitly match above)
|
|
609
|
+
if (r1 < 0 || r2 < 0) {
|
|
610
|
+
return false;
|
|
611
|
+
}
|
|
612
|
+
|
|
602
613
|
// Fast path for the common case where both runes are ASCII characters.
|
|
603
614
|
// Coerces both runes to lowercase if applicable.
|
|
604
615
|
if (r1 <= this.MAX_ASCII && r2 <= this.MAX_ASCII) {
|
|
@@ -851,7 +862,7 @@ class Utils {
|
|
|
851
862
|
// Encoding[(Encoding['UTF_16'] = 0)] = 'UTF_16'
|
|
852
863
|
// Encoding[(Encoding['UTF_8'] = 1)] = 'UTF_8'
|
|
853
864
|
const createEnum = (values = [], initNum = 0) => {
|
|
854
|
-
const enumObject =
|
|
865
|
+
const enumObject = Object.create(null);
|
|
855
866
|
for (let i = 0; i < values.length; i++) {
|
|
856
867
|
const val = values[i];
|
|
857
868
|
const keyVal = initNum + i;
|
|
@@ -993,6 +1004,9 @@ class MachineInputBase {
|
|
|
993
1004
|
hasString() {
|
|
994
1005
|
return false;
|
|
995
1006
|
}
|
|
1007
|
+
hasAnyString() {
|
|
1008
|
+
return false;
|
|
1009
|
+
}
|
|
996
1010
|
|
|
997
1011
|
// Helper for the exact-literal fast-path execution router
|
|
998
1012
|
prefixLength() {
|
|
@@ -1018,6 +1032,13 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1018
1032
|
return idx !== -1 && idx <= this.end - target.length;
|
|
1019
1033
|
}
|
|
1020
1034
|
|
|
1035
|
+
// Executes a high-speed, single-pass search for multiple literal strings
|
|
1036
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1037
|
+
hasAnyString(prefilter, pos) {
|
|
1038
|
+
if (!prefilter.ac8) return false;
|
|
1039
|
+
return prefilter.ac8.searchUTF8(this.bytes, this.start + pos, this.end);
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1021
1042
|
// Returns the rune at the specified index; the units are
|
|
1022
1043
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1023
1044
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1036,17 +1057,23 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1036
1057
|
return c << 3 | 1;
|
|
1037
1058
|
} else if (c >= 0xc2 && c <= 0xdf && pos + 1 < this.end) {
|
|
1038
1059
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1060
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1039
1061
|
const rune = (c & 0x1f) << 6 | c1 & 0x3f;
|
|
1040
1062
|
return rune << 3 | 2;
|
|
1041
1063
|
} else if (c >= 0xe0 && c <= 0xef && pos + 2 < this.end) {
|
|
1042
1064
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1065
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1043
1066
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1067
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1044
1068
|
const rune = (c & 0x0f) << 12 | (c1 & 0x3f) << 6 | c2 & 0x3f;
|
|
1045
1069
|
return rune << 3 | 3;
|
|
1046
1070
|
} else if (c >= 0xf0 && c <= 0xf4 && pos + 3 < this.end) {
|
|
1047
1071
|
const c1 = this.bytes[pos + 1] & 0xff;
|
|
1072
|
+
if ((c1 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1048
1073
|
const c2 = this.bytes[pos + 2] & 0xff;
|
|
1074
|
+
if ((c2 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1049
1075
|
const c3 = this.bytes[pos + 3] & 0xff;
|
|
1076
|
+
if ((c3 & 0xc0) !== 0x80) return c << 3 | 1;
|
|
1050
1077
|
const rune = (c & 0x07) << 18 | (c1 & 0x3f) << 12 | (c2 & 0x3f) << 6 | c3 & 0x3f;
|
|
1051
1078
|
return rune << 3 | 4;
|
|
1052
1079
|
} else {
|
|
@@ -1125,6 +1152,13 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1125
1152
|
return idx !== -1 && idx <= this.end - prefilter.str.length;
|
|
1126
1153
|
}
|
|
1127
1154
|
|
|
1155
|
+
// Executes a high-speed, single-pass search for multiple literal strings
|
|
1156
|
+
// simultaneously using an Aho-Corasick automaton.
|
|
1157
|
+
hasAnyString(prefilter, pos) {
|
|
1158
|
+
if (!prefilter.ac16) return false;
|
|
1159
|
+
return prefilter.ac16.searchUTF16(this.charSequence, this.start + pos, this.end);
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1128
1162
|
// Returns the rune at the specified index; the units are
|
|
1129
1163
|
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1130
1164
|
// indices. Returns the width (in the same units) of the rune in
|
|
@@ -1570,7 +1604,15 @@ class Matcher {
|
|
|
1570
1604
|
if (this.hasMatch) {
|
|
1571
1605
|
start = this.groups[1];
|
|
1572
1606
|
if (this.groups[0] === this.groups[1]) {
|
|
1573
|
-
|
|
1607
|
+
// Empty match: decode the rune at `start` and advance by its encoded width, so the next search never resumes in the middle of a multi-unit sequence
|
|
1608
|
+
const machineInput = this.matcherInput.isUTF16Encoding() ? MachineInput.fromUTF16(this.matcherInput.asCharSequence(), 0, this.matcherInputLength) : MachineInput.fromUTF8(this.matcherInput.asBytes(), 0, this.matcherInputLength);
|
|
1609
|
+
const r = machineInput.step(start);
|
|
1610
|
+
if (r < 0) {
|
|
1611
|
+
// EOF
|
|
1612
|
+
start++; // Advance past length to force loop exit
|
|
1613
|
+
} else {
|
|
1614
|
+
start += r & 7; // Advance by safely decoded width
|
|
1615
|
+
}
|
|
1574
1616
|
}
|
|
1575
1617
|
}
|
|
1576
1618
|
return this.genMatch(start, RE2Flags.UNANCHORED);
|
|
@@ -1709,6 +1751,8 @@ class Matcher {
|
|
|
1709
1751
|
const groupName = replacement.substring(i + 1, j);
|
|
1710
1752
|
res += this.group(groupName);
|
|
1711
1753
|
last = j + 1;
|
|
1754
|
+
i = j;
|
|
1755
|
+
continue;
|
|
1712
1756
|
}
|
|
1713
1757
|
}
|
|
1714
1758
|
}
|
|
@@ -1788,6 +1832,7 @@ class Matcher {
|
|
|
1788
1832
|
if (j === replacement.length || replacement.codePointAt(j) !== Codepoint.CODES.get('>')) {
|
|
1789
1833
|
res += replacement.substring(i - 1, j + 1);
|
|
1790
1834
|
last = j + 1;
|
|
1835
|
+
i = j;
|
|
1791
1836
|
continue;
|
|
1792
1837
|
}
|
|
1793
1838
|
const groupName = replacement.substring(i + 1, j);
|
|
@@ -1797,6 +1842,8 @@ class Matcher {
|
|
|
1797
1842
|
res += `$<${groupName}>`;
|
|
1798
1843
|
}
|
|
1799
1844
|
last = j + 1;
|
|
1845
|
+
i = j;
|
|
1846
|
+
continue;
|
|
1800
1847
|
}
|
|
1801
1848
|
}
|
|
1802
1849
|
}
|
|
@@ -1921,6 +1968,8 @@ class Inst {
|
|
|
1921
1968
|
return r === r0;
|
|
1922
1969
|
}
|
|
1923
1970
|
const len = this.runes.length;
|
|
1971
|
+
if (len === 0) return false;
|
|
1972
|
+
|
|
1924
1973
|
// If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search
|
|
1925
1974
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1926
1975
|
for (let j = 0; j < len; j += 2) {
|
|
@@ -1934,22 +1983,19 @@ class Inst {
|
|
|
1934
1983
|
return false; // Stop here
|
|
1935
1984
|
}
|
|
1936
1985
|
|
|
1937
|
-
//
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
}
|
|
1947
|
-
lo = m + 1;
|
|
1948
|
-
} else {
|
|
1949
|
-
hi = m;
|
|
1950
|
-
}
|
|
1986
|
+
// Branchless Binary Search (Lower Bound)
|
|
1987
|
+
// Compiles to optimal conditional move (cmov) machine code, preventing
|
|
1988
|
+
// branch mispredictions on large, chaotic Unicode arrays
|
|
1989
|
+
let base = 0;
|
|
1990
|
+
let n = len >> 1;
|
|
1991
|
+
while (n > 1) {
|
|
1992
|
+
const half = n >> 1;
|
|
1993
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
1994
|
+
n -= half;
|
|
1951
1995
|
}
|
|
1952
|
-
|
|
1996
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
1997
|
+
const m = base - 1;
|
|
1998
|
+
return m >= 0 && r <= this.runes[m << 1 | 1];
|
|
1953
1999
|
}
|
|
1954
2000
|
|
|
1955
2001
|
// matchRunePos checks whether the instruction matches (and consumes) r.
|
|
@@ -1964,6 +2010,7 @@ class Inst {
|
|
|
1964
2010
|
return r === r0 ? 0 : -1;
|
|
1965
2011
|
}
|
|
1966
2012
|
const len = this.runes.length;
|
|
2013
|
+
if (len === 0) return -1;
|
|
1967
2014
|
if (len === 2 || len === 4 || len === 6 || len === 8) {
|
|
1968
2015
|
for (let j = 0; j < len; j += 2) {
|
|
1969
2016
|
if (r < this.runes[j]) return -1;
|
|
@@ -1971,19 +2018,18 @@ class Inst {
|
|
|
1971
2018
|
}
|
|
1972
2019
|
return -1;
|
|
1973
2020
|
}
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
} else {
|
|
1983
|
-
hi = m;
|
|
1984
|
-
}
|
|
2021
|
+
|
|
2022
|
+
// Branchless Binary Search (Lower Bound)
|
|
2023
|
+
let base = 0;
|
|
2024
|
+
let n = len >> 1;
|
|
2025
|
+
while (n > 1) {
|
|
2026
|
+
const half = n >> 1;
|
|
2027
|
+
base += this.runes[base + half << 1] <= r ? half : 0;
|
|
2028
|
+
n -= half;
|
|
1985
2029
|
}
|
|
1986
|
-
|
|
2030
|
+
base += this.runes[base << 1] <= r ? 1 : 0;
|
|
2031
|
+
const m = base - 1;
|
|
2032
|
+
return m >= 0 && r <= this.runes[m << 1 | 1] ? m : -1;
|
|
1987
2033
|
}
|
|
1988
2034
|
/**
|
|
1989
2035
|
*
|
|
@@ -2082,6 +2128,7 @@ class Queue {
|
|
|
2082
2128
|
//
|
|
2083
2129
|
// Called by RE2.doExecute.
|
|
2084
2130
|
class Machine {
|
|
2131
|
+
static THREADS_CHUNK_SIZE = 128;
|
|
2085
2132
|
static fromRE2(re2) {
|
|
2086
2133
|
const m = new Machine();
|
|
2087
2134
|
m.prog = re2.prog;
|
|
@@ -2122,15 +2169,15 @@ class Machine {
|
|
|
2122
2169
|
resetCap() {
|
|
2123
2170
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2124
2171
|
const t = this.pool[i];
|
|
2125
|
-
t.cap.fill(
|
|
2172
|
+
t.cap.fill(-1);
|
|
2126
2173
|
}
|
|
2127
2174
|
}
|
|
2128
2175
|
initNewCap(ncap) {
|
|
2129
2176
|
for (let i = 0; i < this.poolSize; i++) {
|
|
2130
2177
|
const t = this.pool[i];
|
|
2131
|
-
t.cap = new Int32Array(ncap);
|
|
2178
|
+
t.cap = new Int32Array(ncap).fill(-1);
|
|
2132
2179
|
}
|
|
2133
|
-
this.matchcap = new Int32Array(ncap);
|
|
2180
|
+
this.matchcap = new Int32Array(ncap).fill(-1);
|
|
2134
2181
|
}
|
|
2135
2182
|
submatches() {
|
|
2136
2183
|
if (this.ncap === 0) {
|
|
@@ -2143,14 +2190,21 @@ class Machine {
|
|
|
2143
2190
|
// alloc() allocates a new thread with the given instruction.
|
|
2144
2191
|
// It uses the free pool if possible.
|
|
2145
2192
|
alloc(inst) {
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2193
|
+
if (this.poolSize === 0) {
|
|
2194
|
+
const capLen = this.matchcap.length;
|
|
2195
|
+
|
|
2196
|
+
// Bulk allocate threads in a tight loop so the V8 engine
|
|
2197
|
+
// places them adjacently in the young generation heap
|
|
2198
|
+
for (let i = 0; i < Machine.THREADS_CHUNK_SIZE; i++) {
|
|
2199
|
+
const t = new Thread();
|
|
2200
|
+
t.cap = new Int32Array(capLen);
|
|
2201
|
+
this.pool[this.poolSize++] = t;
|
|
2202
|
+
}
|
|
2153
2203
|
}
|
|
2204
|
+
|
|
2205
|
+
// Pop a thread from the top of the pool stack
|
|
2206
|
+
this.poolSize--;
|
|
2207
|
+
const t = this.pool[this.poolSize];
|
|
2154
2208
|
t.inst = inst;
|
|
2155
2209
|
return t;
|
|
2156
2210
|
}
|
|
@@ -2203,6 +2257,9 @@ class Machine {
|
|
|
2203
2257
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
|
|
2204
2258
|
break;
|
|
2205
2259
|
}
|
|
2260
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2261
|
+
break;
|
|
2262
|
+
}
|
|
2206
2263
|
if (this.matched) {
|
|
2207
2264
|
break;
|
|
2208
2265
|
}
|
|
@@ -2280,6 +2337,9 @@ class Machine {
|
|
|
2280
2337
|
while (true) {
|
|
2281
2338
|
if (runq.isEmpty()) {
|
|
2282
2339
|
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
|
|
2340
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2341
|
+
break;
|
|
2342
|
+
}
|
|
2283
2343
|
}
|
|
2284
2344
|
if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2285
2345
|
// Spawn Lookbehind threads BEFORE the main pattern
|
|
@@ -2395,78 +2455,83 @@ class Machine {
|
|
|
2395
2455
|
runq.clear();
|
|
2396
2456
|
}
|
|
2397
2457
|
add(q, pc, pos, cap, cond, t) {
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
t = this.add(q, inst.arg, pos, cap, cond, t);
|
|
2413
|
-
break;
|
|
2414
|
-
case Inst.EMPTY_WIDTH:
|
|
2415
|
-
if ((inst.arg & ~cond) === 0) {
|
|
2416
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2417
|
-
}
|
|
2418
|
-
break;
|
|
2419
|
-
case Inst.NOP:
|
|
2420
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2421
|
-
break;
|
|
2422
|
-
case Inst.CAPTURE:
|
|
2423
|
-
if (inst.arg < this.ncap) {
|
|
2424
|
-
const opos = cap[inst.arg];
|
|
2425
|
-
cap[inst.arg] = pos;
|
|
2426
|
-
this.add(q, inst.out, pos, cap, cond, null);
|
|
2427
|
-
cap[inst.arg] = opos;
|
|
2428
|
-
} else {
|
|
2458
|
+
while (true) {
|
|
2459
|
+
if (pc === 0) {
|
|
2460
|
+
return t;
|
|
2461
|
+
}
|
|
2462
|
+
if (q.contains(pc)) {
|
|
2463
|
+
return t;
|
|
2464
|
+
}
|
|
2465
|
+
const d = q.add(pc);
|
|
2466
|
+
const inst = this.prog.inst[pc];
|
|
2467
|
+
switch (inst.op) {
|
|
2468
|
+
case Inst.FAIL:
|
|
2469
|
+
return t;
|
|
2470
|
+
case Inst.ALT:
|
|
2471
|
+
case Inst.ALT_MATCH:
|
|
2429
2472
|
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
case Inst.LB_CHECK:
|
|
2437
|
-
if (inst.lb > 0) {
|
|
2438
|
-
// Positive Lookbehind
|
|
2439
|
-
if (this.lbTable[inst.lb] === pos) {
|
|
2440
|
-
t = this.add(q, inst.out, pos, cap, cond, t);
|
|
2473
|
+
pc = inst.arg; // Flattened tail recursion
|
|
2474
|
+
continue;
|
|
2475
|
+
case Inst.EMPTY_WIDTH:
|
|
2476
|
+
if ((inst.arg & ~cond) === 0) {
|
|
2477
|
+
pc = inst.out; // Flattened tail recursion
|
|
2478
|
+
continue;
|
|
2441
2479
|
}
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
}
|
|
2457
|
-
if (this.ncap > 0 && t.cap !== cap) {
|
|
2458
|
-
// Direct assignment utilizing Typed Array performance
|
|
2459
|
-
for (let c = 0; c < this.ncap; c++) {
|
|
2460
|
-
t.cap[c] = cap[c];
|
|
2480
|
+
return t;
|
|
2481
|
+
case Inst.NOP:
|
|
2482
|
+
pc = inst.out; // Flattened tail recursion
|
|
2483
|
+
continue;
|
|
2484
|
+
case Inst.CAPTURE:
|
|
2485
|
+
if (inst.arg < this.ncap) {
|
|
2486
|
+
const opos = cap[inst.arg];
|
|
2487
|
+
cap[inst.arg] = pos;
|
|
2488
|
+
this.add(q, inst.out, pos, cap, cond, null);
|
|
2489
|
+
cap[inst.arg] = opos;
|
|
2490
|
+
return t;
|
|
2491
|
+
} else {
|
|
2492
|
+
pc = inst.out; // Flattened tail recursion
|
|
2493
|
+
continue;
|
|
2461
2494
|
}
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2495
|
+
case Inst.LB_WRITE:
|
|
2496
|
+
this.lbTable[Math.abs(inst.lb)] = pos;
|
|
2497
|
+
pc = inst.out;
|
|
2498
|
+
continue;
|
|
2499
|
+
case Inst.LB_CHECK:
|
|
2500
|
+
if (inst.lb > 0) {
|
|
2501
|
+
// Positive Lookbehind
|
|
2502
|
+
if (this.lbTable[inst.lb] === pos) {
|
|
2503
|
+
pc = inst.out; // Flattened tail recursion
|
|
2504
|
+
continue;
|
|
2505
|
+
}
|
|
2506
|
+
} else if (this.lbTable[-inst.lb] !== pos) {
|
|
2507
|
+
// Negative Lookbehind
|
|
2508
|
+
pc = inst.out; // Flattened tail recursion
|
|
2509
|
+
continue;
|
|
2510
|
+
}
|
|
2511
|
+
return t;
|
|
2512
|
+
case Inst.MATCH:
|
|
2513
|
+
case Inst.RUNE:
|
|
2514
|
+
case Inst.RUNE1:
|
|
2515
|
+
case Inst.RUNE_ANY:
|
|
2516
|
+
case Inst.RUNE_ANY_NOT_NL:
|
|
2517
|
+
if (t === null) {
|
|
2518
|
+
t = this.alloc(inst);
|
|
2519
|
+
} else {
|
|
2520
|
+
t.inst = inst;
|
|
2521
|
+
}
|
|
2522
|
+
if (this.ncap > 0 && t.cap !== cap) {
|
|
2523
|
+
// Direct assignment utilizing Typed Array performance
|
|
2524
|
+
for (let c = 0; c < this.ncap; c++) {
|
|
2525
|
+
t.cap[c] = cap[c];
|
|
2526
|
+
}
|
|
2527
|
+
}
|
|
2528
|
+
q.denseThreads[d] = t;
|
|
2529
|
+
t = null;
|
|
2530
|
+
return t;
|
|
2531
|
+
default:
|
|
2532
|
+
throw new Error('unhandled');
|
|
2533
|
+
}
|
|
2468
2534
|
}
|
|
2469
|
-
return t;
|
|
2470
2535
|
}
|
|
2471
2536
|
}
|
|
2472
2537
|
|
|
@@ -2494,8 +2559,15 @@ class DFAState {
|
|
|
2494
2559
|
this.nfaStates = nfaStates; // Int32Array of Instruction PCs
|
|
2495
2560
|
this.isMatch = isMatch; // Boolean
|
|
2496
2561
|
this.matchIDs = matchIDs; // Array of integers indicating which Set patterns matched
|
|
2497
|
-
|
|
2498
|
-
|
|
2562
|
+
|
|
2563
|
+
// Latin-1 (Unicode.MAX_LATIN1 + 1) flat arrays for blisteringly fast O(1) lookups
|
|
2564
|
+
// completely covering standard English, European languages, and 1-byte encodings.
|
|
2565
|
+
this.nextLatin1 = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2566
|
+
this.nextLatin1Anchored = new Array(Unicode.MAX_LATIN1 + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
|
|
2567
|
+
// Two parallel arrays used as a small hash map for V8 optimization (N is small, so an O(n) scan is faster than a Map's O(1) lookup)
|
|
2568
|
+
this.transKeys = [];
|
|
2569
|
+
this.transVals = [];
|
|
2570
|
+
this.lastSeen = 0; // Track when this state was last used for LRU eviction
|
|
2499
2571
|
}
|
|
2500
2572
|
}
|
|
2501
2573
|
class DFA {
|
|
@@ -2508,6 +2580,7 @@ class DFA {
|
|
|
2508
2580
|
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2509
2581
|
this.cacheClears = 0; // Track thrashing
|
|
2510
2582
|
this.failed = false; // mark if DFA cannot work with provided prog
|
|
2583
|
+
this.clock = 0; // Global clock for LRU eviction
|
|
2511
2584
|
}
|
|
2512
2585
|
|
|
2513
2586
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2567,6 +2640,7 @@ class DFA {
|
|
|
2567
2640
|
for (let i = 0; i < bucket.length; i++) {
|
|
2568
2641
|
const state = bucket[i];
|
|
2569
2642
|
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2643
|
+
state.lastSeen = ++this.clock;
|
|
2570
2644
|
return state;
|
|
2571
2645
|
}
|
|
2572
2646
|
}
|
|
@@ -2579,40 +2653,99 @@ class DFA {
|
|
|
2579
2653
|
if (this.failed) return null;
|
|
2580
2654
|
|
|
2581
2655
|
// Safety: prevent memory exhaustion from state explosion
|
|
2582
|
-
// We
|
|
2656
|
+
// We prune the cache to keep the newest 50%
|
|
2583
2657
|
if (this.stateCount >= this.stateLimit) {
|
|
2584
|
-
this.stateCache.clear();
|
|
2585
|
-
this.stateCount = 0;
|
|
2586
|
-
this.startState = null;
|
|
2587
2658
|
this.cacheClears++;
|
|
2588
2659
|
|
|
2589
2660
|
// If this regex causes continuous cache thrashing, permanently fall back to NFA
|
|
2590
2661
|
// to avoid spending CPU cycles constantly rebuilding the DFA tree.
|
|
2591
2662
|
if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
|
|
2592
2663
|
this.failed = true;
|
|
2664
|
+
this.stateCache.clear();
|
|
2665
|
+
this.stateCount = 0;
|
|
2666
|
+
this.startState = null;
|
|
2667
|
+
return null;
|
|
2668
|
+
}
|
|
2669
|
+
this.evictCache();
|
|
2670
|
+
|
|
2671
|
+
// After eviction, the bucket reference might be stale or empty.
|
|
2672
|
+
// We must re-fetch or re-create the bucket.
|
|
2673
|
+
bucket = this.stateCache.get(hash);
|
|
2674
|
+
if (!bucket) {
|
|
2675
|
+
bucket = [];
|
|
2676
|
+
this.stateCache.set(hash, bucket);
|
|
2593
2677
|
}
|
|
2594
|
-
return null;
|
|
2595
2678
|
}
|
|
2596
2679
|
|
|
2597
2680
|
// State not found, create it and add to bucket
|
|
2598
2681
|
const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.matchIDs);
|
|
2682
|
+
state.lastSeen = ++this.clock;
|
|
2599
2683
|
bucket.push(state);
|
|
2600
2684
|
this.stateCount++;
|
|
2601
2685
|
return state;
|
|
2602
2686
|
}
|
|
2687
|
+
evictCache() {
|
|
2688
|
+
const allStates = [];
|
|
2689
|
+
for (const bucket of this.stateCache.values()) {
|
|
2690
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2691
|
+
allStates.push(bucket[i]);
|
|
2692
|
+
}
|
|
2693
|
+
}
|
|
2694
|
+
|
|
2695
|
+
// Sort ascending by lastSeen (oldest first)
|
|
2696
|
+
allStates.sort((a, b) => a.lastSeen - b.lastSeen);
|
|
2697
|
+
|
|
2698
|
+
// Keep the newest 50%
|
|
2699
|
+
const keepCount = Math.max(1, Math.floor(this.stateLimit / 2));
|
|
2700
|
+
const startIndex = allStates.length - keepCount;
|
|
2701
|
+
const survivorsArray = allStates.slice(startIndex);
|
|
2702
|
+
const survivors = new Set(survivorsArray);
|
|
2703
|
+
this.stateCache.clear();
|
|
2704
|
+
this.stateCount = 0;
|
|
2705
|
+
for (let i = 0; i < survivorsArray.length; i++) {
|
|
2706
|
+
const state = survivorsArray[i];
|
|
2707
|
+
|
|
2708
|
+
// Sever ties to all states to prevent memory leaks and dangling pointers
|
|
2709
|
+
state.nextLatin1.fill(null);
|
|
2710
|
+
state.nextLatin1Anchored.fill(null);
|
|
2711
|
+
// zero-allocation cleanup
|
|
2712
|
+
state.transKeys.length = 0;
|
|
2713
|
+
state.transVals.length = 0;
|
|
2714
|
+
const hash = hashPCs(state.nfaStates);
|
|
2715
|
+
let bucket = this.stateCache.get(hash);
|
|
2716
|
+
if (!bucket) {
|
|
2717
|
+
bucket = [];
|
|
2718
|
+
this.stateCache.set(hash, bucket);
|
|
2719
|
+
}
|
|
2720
|
+
bucket.push(state);
|
|
2721
|
+
this.stateCount++;
|
|
2722
|
+
}
|
|
2723
|
+
|
|
2724
|
+
// Start state must either be preserved or nullified so it gets re-created
|
|
2725
|
+
if (this.startState && !survivors.has(this.startState)) {
|
|
2726
|
+
this.startState = null;
|
|
2727
|
+
}
|
|
2728
|
+
}
|
|
2603
2729
|
|
|
2604
2730
|
// Compute the next DFA state given a current state and a character
|
|
2605
2731
|
step(state, charCode, anchor) {
|
|
2606
|
-
// OPTIMIZATION:
|
|
2607
|
-
if (
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
return next;
|
|
2732
|
+
// OPTIMIZATION: Latin-1 Array Fast-Path
|
|
2733
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2734
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2735
|
+
const next = state.nextLatin1[charCode];
|
|
2736
|
+
if (next !== null) return next;
|
|
2737
|
+
} else {
|
|
2738
|
+
const next = state.nextLatin1Anchored[charCode];
|
|
2739
|
+
if (next !== null) return next;
|
|
2611
2740
|
}
|
|
2612
2741
|
} else {
|
|
2742
|
+
// Dense Array Linear Search fallback for Runes > 255
|
|
2613
2743
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2614
|
-
|
|
2615
|
-
|
|
2744
|
+
// get [key] -> nextState
|
|
2745
|
+
const keys = state.transKeys;
|
|
2746
|
+
const len = keys.length;
|
|
2747
|
+
for (let i = 0; i < len; i++) {
|
|
2748
|
+
if (keys[i] === key) return state.transVals[i];
|
|
2616
2749
|
}
|
|
2617
2750
|
}
|
|
2618
2751
|
const nextPCs = [];
|
|
@@ -2629,11 +2762,17 @@ class DFA {
|
|
|
2629
2762
|
const nextState = this.getState(nextPCs);
|
|
2630
2763
|
|
|
2631
2764
|
// Cache the result
|
|
2632
|
-
if (
|
|
2633
|
-
|
|
2765
|
+
if (charCode <= Unicode.MAX_LATIN1) {
|
|
2766
|
+
if (anchor === RE2Flags.UNANCHORED) {
|
|
2767
|
+
state.nextLatin1[charCode] = nextState;
|
|
2768
|
+
} else {
|
|
2769
|
+
state.nextLatin1Anchored[charCode] = nextState;
|
|
2770
|
+
}
|
|
2634
2771
|
} else {
|
|
2635
2772
|
const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
|
|
2636
|
-
|
|
2773
|
+
// store key -> nextState
|
|
2774
|
+
state.transKeys.push(key);
|
|
2775
|
+
state.transVals.push(nextState);
|
|
2637
2776
|
}
|
|
2638
2777
|
return nextState;
|
|
2639
2778
|
}
|
|
@@ -2666,10 +2805,11 @@ class DFA {
|
|
|
2666
2805
|
if (width === 0) {
|
|
2667
2806
|
break;
|
|
2668
2807
|
}
|
|
2669
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2808
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2670
2809
|
|
|
2671
2810
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
2672
2811
|
if (currentState === null) return null;
|
|
2812
|
+
currentState.lastSeen = ++this.clock;
|
|
2673
2813
|
if (currentState.isMatch) {
|
|
2674
2814
|
if (anchor === RE2Flags.ANCHOR_BOTH) {
|
|
2675
2815
|
if (i + width === endPos) return true;
|
|
@@ -2717,9 +2857,10 @@ class DFA {
|
|
|
2717
2857
|
const rune = r >> 3;
|
|
2718
2858
|
const width = r & 7;
|
|
2719
2859
|
if (width === 0) break;
|
|
2720
|
-
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.
|
|
2860
|
+
currentState = anchor === RE2Flags.UNANCHORED && rune <= Unicode.MAX_LATIN1 && currentState.nextLatin1[rune] || this.step(currentState, rune, anchor);
|
|
2721
2861
|
if (currentState === null) return null; // Bailout to NFA
|
|
2722
2862
|
|
|
2863
|
+
currentState.lastSeen = ++this.clock;
|
|
2723
2864
|
i += width;
|
|
2724
2865
|
checkMatch(currentState, i);
|
|
2725
2866
|
if (currentState.nfaStates.length === 0) {
|
|
@@ -2757,7 +2898,7 @@ class BitState {
|
|
|
2757
2898
|
// Bitwise shift (>>> 5) instead of Math.floor( / 32)
|
|
2758
2899
|
const visitedSize = prog.numInst() * (end + 1) + VISITED_BITS - 1 >>> 5;
|
|
2759
2900
|
if (this.visited.length < visitedSize) {
|
|
2760
|
-
this.visited = new Uint32Array(
|
|
2901
|
+
this.visited = new Uint32Array(visitedSize);
|
|
2761
2902
|
} else {
|
|
2762
2903
|
this.visited.fill(0, 0, visitedSize);
|
|
2763
2904
|
}
|
|
@@ -2843,11 +2984,12 @@ class BitState {
|
|
|
2843
2984
|
const outInst = re2.prog.getInst(inst.out);
|
|
2844
2985
|
if (Inst.isRuneOp(outInst.op)) {
|
|
2845
2986
|
this.push(re2, inst.arg, currentPos, false);
|
|
2846
|
-
currentPc = inst.
|
|
2987
|
+
currentPc = inst.arg;
|
|
2988
|
+
currentPos = this.end;
|
|
2847
2989
|
continue;
|
|
2848
2990
|
}
|
|
2849
2991
|
this.push(re2, inst.out, this.end, false);
|
|
2850
|
-
currentPc = inst.
|
|
2992
|
+
currentPc = inst.out;
|
|
2851
2993
|
continue;
|
|
2852
2994
|
}
|
|
2853
2995
|
case Inst.RUNE:
|
|
@@ -2928,6 +3070,11 @@ class BitState {
|
|
|
2928
3070
|
if (currentPos === this.end) return true;
|
|
2929
3071
|
break;
|
|
2930
3072
|
}
|
|
3073
|
+
case Inst.LB_WRITE:
|
|
3074
|
+
case Inst.LB_CHECK:
|
|
3075
|
+
{
|
|
3076
|
+
throw new RE2JSInternalException('Backtracker cannot evaluate Lookbehind instructions');
|
|
3077
|
+
}
|
|
2931
3078
|
default:
|
|
2932
3079
|
{
|
|
2933
3080
|
throw new RE2JSInternalException('bad inst');
|
|
@@ -3196,7 +3343,9 @@ const makeOnePass = p => {
|
|
|
3196
3343
|
}
|
|
3197
3344
|
runes.sort((a, b) => a - b);
|
|
3198
3345
|
} else {
|
|
3199
|
-
|
|
3346
|
+
for (let j = 0; j < inst.runes.length; j++) {
|
|
3347
|
+
runes.push(inst.runes[j]);
|
|
3348
|
+
}
|
|
3200
3349
|
}
|
|
3201
3350
|
onePassRunes[pc] = runes;
|
|
3202
3351
|
inst.next = new Uint32Array(Math.floor(runes.length / 2) + 1).fill(inst.out);
|
|
@@ -3364,6 +3513,10 @@ class OnePass {
|
|
|
3364
3513
|
switch (inst.op) {
|
|
3365
3514
|
case Inst.MATCH:
|
|
3366
3515
|
{
|
|
3516
|
+
// Verify ANCHOR_BOTH constraint before accepting the match
|
|
3517
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) {
|
|
3518
|
+
return null;
|
|
3519
|
+
}
|
|
3367
3520
|
matched = true;
|
|
3368
3521
|
if (matchcap.length > 0) {
|
|
3369
3522
|
matchcap[0] = 0;
|
|
@@ -3794,6 +3947,88 @@ class Regexp {
|
|
|
3794
3947
|
}
|
|
3795
3948
|
}
|
|
3796
3949
|
|
|
3950
|
+
// High-speed, single-pass Aho-Corasick string matcher optimized for V8.
|
|
3951
|
+
// Builds a trie with failure links to search for multiple prefixes simultaneously.
|
|
3952
|
+
/**
 * Multi-pattern substring matcher implemented as an Aho-Corasick automaton.
 *
 * Patterns are sequences of integer symbols (UTF-16 code units for
 * `searchUTF16`, byte values for `searchUTF8`). The automaton is stored in
 * three parallel arrays indexed by state id, where state 0 is the root:
 *
 *   - `next[s]`  prototype-less object mapping a symbol to the child state id
 *   - `fail[s]`  state to fall back to when `s` has no edge for a symbol
 *   - `match[s]` true when a pattern ends at `s` or at any state on its
 *                failure chain
 */
class AhoCorasick {
  /**
   * Builds the trie for all patterns and then wires the failure links.
   *
   * @param {Iterable<ArrayLike<number>>} wordArrays patterns, each an indexable
   *   sequence of integer symbols
   */
  constructor(wordArrays) {
    this.next = [Object.create(null)];
    this.fail = [0];
    this.match = [false];

    // Build the trie: one state per distinct pattern prefix.
    for (const word of wordArrays) {
      let node = 0;
      for (let i = 0; i < word.length; i++) {
        const val = word[i];
        let child = this.next[node][val];
        if (child === undefined) {
          child = this.next.length;
          this.next.push(Object.create(null));
          this.fail.push(0);
          this.match.push(false);
          this.next[node][val] = child;
        }
        node = child;
      }
      this.match[node] = true;
    }

    // Build failure links breadth-first. `queue` doubles as the visit order:
    // it is only appended to and read through `head`, so dequeuing never
    // shifts the array (Array.prototype.shift would make this quadratic).
    const queue = [];
    const rootEdges = this.next[0];
    // Nodes are created with Object.create(null), so `for...in` only ever
    // sees own keys and no hasOwnProperty guard is required.
    for (const val in rootEdges) {
      // Depth-1 states keep the failure link 0 they were created with.
      queue.push(rootEdges[val]);
    }
    for (let head = 0; head < queue.length; head++) {
      const curr = queue[head];
      const edges = this.next[curr];
      for (const val in edges) {
        const child = edges[val];
        // Walk the parent's failure chain to the longest proper suffix state
        // that can also be extended by `val`.
        let failNode = this.fail[curr];
        while (failNode !== 0 && !(val in this.next[failNode])) {
          failNode = this.fail[failNode];
        }
        const target = this.next[failNode][val];
        this.fail[child] = target === undefined ? 0 : target;
        // A state also matches when a shorter pattern ends at its suffix state.
        this.match[child] = this.match[child] || this.match[this.fail[child]];
        queue.push(child);
      }
    }
  }

  /**
   * Reports whether any pattern occurs in the UTF-16 code units of `charSeq`
   * within the half-open index range [start, end).
   *
   * @param {{charCodeAt(index: number): number}} charSeq text to scan
   * @param {number} start first index to examine (inclusive)
   * @param {number} end index at which scanning stops (exclusive)
   * @returns {boolean} true as soon as one pattern has been seen
   */
  searchUTF16(charSeq, start, end) {
    // An empty pattern occurs in every range, including an empty one that the
    // loop below would never enter.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      const val = charSeq.charCodeAt(i);
      let target = this.next[node][val];
      // No edge for this symbol: fall back along the failure chain.
      while (target === undefined && node !== 0) {
        node = this.fail[node];
        target = this.next[node][val];
      }
      if (target !== undefined) {
        node = target;
      }
      if (this.match[node]) return true;
    }
    return false;
  }

  /**
   * Reports whether any pattern occurs in `bytes` within the half-open index
   * range [start, end). Same scan as `searchUTF16`, reading symbols by index.
   *
   * @param {ArrayLike<number>} bytes symbols to scan
   * @param {number} start first index to examine (inclusive)
   * @param {number} end index at which scanning stops (exclusive)
   * @returns {boolean} true as soon as one pattern has been seen
   */
  searchUTF8(bytes, start, end) {
    // See searchUTF16: an empty pattern matches even an empty range.
    if (this.match[0]) return true;
    let node = 0;
    for (let i = start; i < end; i++) {
      const val = bytes[i];
      let target = this.next[node][val];
      while (target === undefined && node !== 0) {
        node = this.fail[node];
        target = this.next[node][val];
      }
      if (target !== undefined) {
        node = target;
      }
      if (this.match[node]) return true;
    }
    return false;
  }
}
|
|
3797
4032
|
class Prefilter {
|
|
3798
4033
|
static Type = {
|
|
3799
4034
|
NONE: 0,
|
|
@@ -3806,6 +4041,8 @@ class Prefilter {
|
|
|
3806
4041
|
this.subs = [];
|
|
3807
4042
|
this.str = '';
|
|
3808
4043
|
this.bytes = null;
|
|
4044
|
+
this.ac16 = null;
|
|
4045
|
+
this.ac8 = null;
|
|
3809
4046
|
}
|
|
3810
4047
|
eval(input, pos) {
|
|
3811
4048
|
switch (this.type) {
|
|
@@ -3819,6 +4056,10 @@ class Prefilter {
|
|
|
3819
4056
|
}
|
|
3820
4057
|
return true;
|
|
3821
4058
|
case Prefilter.Type.OR:
|
|
4059
|
+
// Exploit Aho-Corasick if it was successfully built
|
|
4060
|
+
if (this.ac16 && this.ac8) {
|
|
4061
|
+
return input.hasAnyString(this, pos);
|
|
4062
|
+
}
|
|
3822
4063
|
for (let i = 0; i < this.subs.length; i++) {
|
|
3823
4064
|
if (this.subs[i].eval(input, pos)) return true;
|
|
3824
4065
|
}
|
|
@@ -3909,7 +4150,9 @@ class PrefilterTree {
|
|
|
3909
4150
|
const s = PrefilterTree.simplify(sub);
|
|
3910
4151
|
if (s.type !== Prefilter.Type.NONE) {
|
|
3911
4152
|
if (s.type === Prefilter.Type.AND) {
|
|
3912
|
-
|
|
4153
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4154
|
+
newSubs.push(s.subs[j]);
|
|
4155
|
+
}
|
|
3913
4156
|
} else {
|
|
3914
4157
|
newSubs.push(s);
|
|
3915
4158
|
}
|
|
@@ -3929,7 +4172,9 @@ class PrefilterTree {
|
|
|
3929
4172
|
return new Prefilter(Prefilter.Type.NONE);
|
|
3930
4173
|
}
|
|
3931
4174
|
if (s.type === Prefilter.Type.OR) {
|
|
3932
|
-
|
|
4175
|
+
for (let j = 0; j < s.subs.length; j++) {
|
|
4176
|
+
newSubs.push(s.subs[j]);
|
|
4177
|
+
}
|
|
3933
4178
|
} else {
|
|
3934
4179
|
newSubs.push(s);
|
|
3935
4180
|
}
|
|
@@ -3951,6 +4196,27 @@ class PrefilterTree {
|
|
|
3951
4196
|
}
|
|
3952
4197
|
}
|
|
3953
4198
|
pf.subs = uniqueSubs;
|
|
4199
|
+
|
|
4200
|
+
// Build an Aho-Corasick automaton if all children are exact matches
|
|
4201
|
+
let allExact = true;
|
|
4202
|
+
for (const sub of uniqueSubs) {
|
|
4203
|
+
if (sub.type !== Prefilter.Type.EXACT) {
|
|
4204
|
+
allExact = false;
|
|
4205
|
+
break;
|
|
4206
|
+
}
|
|
4207
|
+
}
|
|
4208
|
+
if (allExact && uniqueSubs.length > 1) {
|
|
4209
|
+
const words16 = uniqueSubs.map(s => {
|
|
4210
|
+
const arr = [];
|
|
4211
|
+
for (let i = 0; i < s.str.length; i++) {
|
|
4212
|
+
arr.push(s.str.charCodeAt(i));
|
|
4213
|
+
}
|
|
4214
|
+
return arr;
|
|
4215
|
+
});
|
|
4216
|
+
pf.ac16 = new AhoCorasick(words16);
|
|
4217
|
+
const words8 = uniqueSubs.map(s => s.bytes);
|
|
4218
|
+
pf.ac8 = new AhoCorasick(words8);
|
|
4219
|
+
}
|
|
3954
4220
|
return pf;
|
|
3955
4221
|
}
|
|
3956
4222
|
return pf;
|
|
@@ -4477,7 +4743,9 @@ class Simplify {
|
|
|
4477
4743
|
// Flatten nested concatenations
|
|
4478
4744
|
if (nsub.op === Regexp.Op.CONCAT) {
|
|
4479
4745
|
changed = true;
|
|
4480
|
-
|
|
4746
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4747
|
+
newSubs.push(nsub.subs[j]);
|
|
4748
|
+
}
|
|
4481
4749
|
continue;
|
|
4482
4750
|
}
|
|
4483
4751
|
} else if (re.op === Regexp.Op.ALTERNATE) {
|
|
@@ -4489,7 +4757,9 @@ class Simplify {
|
|
|
4489
4757
|
// Flatten nested alternations
|
|
4490
4758
|
if (nsub.op === Regexp.Op.ALTERNATE) {
|
|
4491
4759
|
changed = true;
|
|
4492
|
-
|
|
4760
|
+
for (let j = 0; j < nsub.subs.length; j++) {
|
|
4761
|
+
newSubs.push(nsub.subs[j]);
|
|
4762
|
+
}
|
|
4493
4763
|
continue;
|
|
4494
4764
|
}
|
|
4495
4765
|
}
|
|
@@ -5499,7 +5769,10 @@ class Parser {
|
|
|
5499
5769
|
return t.pop();
|
|
5500
5770
|
}
|
|
5501
5771
|
static concatRunes(x, y) {
|
|
5502
|
-
|
|
5772
|
+
for (let i = 0; i < y.length; i++) {
|
|
5773
|
+
x.push(y[i]);
|
|
5774
|
+
}
|
|
5775
|
+
return x;
|
|
5503
5776
|
}
|
|
5504
5777
|
constructor(wholeRegexp, flags = 0) {
|
|
5505
5778
|
this.wholeRegexp = wholeRegexp;
|
|
@@ -5535,8 +5808,8 @@ class Parser {
|
|
|
5535
5808
|
return re;
|
|
5536
5809
|
}
|
|
5537
5810
|
reuse(re) {
|
|
5538
|
-
if (this.height !== null &&
|
|
5539
|
-
|
|
5811
|
+
if (this.height !== null && this.height.has(re)) {
|
|
5812
|
+
this.height.delete(re);
|
|
5540
5813
|
}
|
|
5541
5814
|
if (re.subs !== null && re.subs.length > 0) {
|
|
5542
5815
|
re.subs[0] = this.free;
|
|
@@ -5568,20 +5841,20 @@ class Parser {
|
|
|
5568
5841
|
if (n <= 0) {
|
|
5569
5842
|
n = 1;
|
|
5570
5843
|
}
|
|
5571
|
-
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
5844
|
+
if (n > Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5572
5845
|
this.repeats = Parser.MAX_SIZE;
|
|
5573
5846
|
} else {
|
|
5574
5847
|
this.repeats *= n;
|
|
5575
5848
|
}
|
|
5576
5849
|
}
|
|
5577
|
-
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
5850
|
+
if (this.numRegexp < Math.floor(Parser.MAX_SIZE / this.repeats)) {
|
|
5578
5851
|
return;
|
|
5579
5852
|
}
|
|
5580
5853
|
|
|
5581
5854
|
// We need to start tracking size.
|
|
5582
5855
|
// Make the map and belatedly populate it
|
|
5583
5856
|
// with info about everything we've constructed so far.
|
|
5584
|
-
this.size =
|
|
5857
|
+
this.size = new Map();
|
|
5585
5858
|
for (let reEx of this.stack) {
|
|
5586
5859
|
this.checkSize(reEx);
|
|
5587
5860
|
}
|
|
@@ -5592,8 +5865,8 @@ class Parser {
|
|
|
5592
5865
|
}
|
|
5593
5866
|
calcSize(re, force = false) {
|
|
5594
5867
|
if (!force && this.size !== null) {
|
|
5595
|
-
if (
|
|
5596
|
-
return this.size
|
|
5868
|
+
if (this.size.has(re)) {
|
|
5869
|
+
return this.size.get(re);
|
|
5597
5870
|
}
|
|
5598
5871
|
}
|
|
5599
5872
|
let size = 0;
|
|
@@ -5653,9 +5926,9 @@ class Parser {
|
|
|
5653
5926
|
}
|
|
5654
5927
|
size = Math.max(1, size);
|
|
5655
5928
|
if (this.size === null) {
|
|
5656
|
-
this.size =
|
|
5929
|
+
this.size = new Map();
|
|
5657
5930
|
}
|
|
5658
|
-
this.size
|
|
5931
|
+
this.size.set(re, size);
|
|
5659
5932
|
return size;
|
|
5660
5933
|
}
|
|
5661
5934
|
checkHeight(re) {
|
|
@@ -5663,7 +5936,7 @@ class Parser {
|
|
|
5663
5936
|
return;
|
|
5664
5937
|
}
|
|
5665
5938
|
if (this.height === null) {
|
|
5666
|
-
this.height =
|
|
5939
|
+
this.height = new Map();
|
|
5667
5940
|
for (let reEx of this.stack) {
|
|
5668
5941
|
this.checkHeight(reEx);
|
|
5669
5942
|
}
|
|
@@ -5674,8 +5947,8 @@ class Parser {
|
|
|
5674
5947
|
}
|
|
5675
5948
|
calcHeight(re, force = false) {
|
|
5676
5949
|
if (!force && this.height !== null) {
|
|
5677
|
-
if (
|
|
5678
|
-
return this.height
|
|
5950
|
+
if (this.height.has(re)) {
|
|
5951
|
+
return this.height.get(re);
|
|
5679
5952
|
}
|
|
5680
5953
|
}
|
|
5681
5954
|
let h = 1;
|
|
@@ -5686,9 +5959,9 @@ class Parser {
|
|
|
5686
5959
|
}
|
|
5687
5960
|
}
|
|
5688
5961
|
if (this.height === null) {
|
|
5689
|
-
this.height =
|
|
5962
|
+
this.height = new Map();
|
|
5690
5963
|
}
|
|
5691
|
-
this.height
|
|
5964
|
+
this.height.set(re, h);
|
|
5692
5965
|
return h;
|
|
5693
5966
|
}
|
|
5694
5967
|
|
|
@@ -7810,7 +8083,7 @@ class TranslateRegExpString {
|
|
|
7810
8083
|
changed = true;
|
|
7811
8084
|
continue;
|
|
7812
8085
|
} else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
7813
|
-
if (i + 3
|
|
8086
|
+
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
7814
8087
|
result += '(?P<';
|
|
7815
8088
|
i += 3;
|
|
7816
8089
|
changed = true;
|