re2js 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/index.cjs.cjs +270 -213
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +28 -13
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +271 -213
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +270 -213
- package/build/index.umd.js.map +1 -1
- package/package.json +12 -12
package/build/index.umd.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v1.3.0
|
|
5
|
+
* @version v1.3.1
|
|
6
6
|
* @author Alexey Vasiliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -847,6 +847,181 @@
|
|
|
847
847
|
}
|
|
848
848
|
}
|
|
849
849
|
|
|
850
|
+
/**
 * Common base for the machine's input readers. A reader hides how the searched
 * text is stored and hands it to the machine one rune at a time via `step()`.
 *
 * `endPos()` reads `this.end`, which this class itself never assigns.
 */
class MachineInputBase {
  /**
   * Sentinel that `step()` implementations return when no rune is available:
   * rune -1 in the high bits, width 0 in the low three bits.
   *
   * @returns {number} Always -8.
   */
  static EOF() {
    const noRune = -1;
    return noRune << 3;
  }

  /**
   * Whether a literal prefix can be looked up ahead of time without losing
   * information.
   *
   * @returns {boolean} Always `true` in this base implementation.
   */
  canCheckPrefix() {
    return true;
  }

  /**
   * End position, expressed in the same units that `step()` uses.
   *
   * @returns {number} The current value of `this.end`.
   */
  endPos() {
    const { end } = this;
    return end;
  }
}
|
|
869
|
+
|
|
870
|
+
// An implementation of MachineInput for UTF-8 byte arrays.
// |pos| and |width| are byte indices; positions passed to step(), index() and
// context() are relative to |start|.
class MachineUTF8Input extends MachineInputBase {
  /**
   * @param {number[]} bytes - UTF-8 encoded input.
   * @param {number} [start=0] - Absolute index of the first byte to read.
   * @param {number} [end=bytes.length] - Absolute index one past the last byte to read.
   */
  constructor(bytes, start = 0, end = bytes.length) {
    super();
    this.bytes = bytes;
    this.start = start;
    this.end = end;
  }

  /**
   * Decodes the rune that begins at relative byte index `i`.
   *
   * The result packs the rune (Unicode code point) into the high bits and its
   * width in bytes into the lower 3 bits. It is never negative, except for EOF,
   * which is `-1 << 3 | 0`. EOF is also returned when a multi-byte sequence
   * would run past `this.end`.
   *
   * The lead byte alone selects the width: 0xxxxxxx is 1 byte, 110xxxxx is 2,
   * 1110xxxx is 3, and every other lead byte is decoded as a 4-byte sequence.
   * Continuation bytes are masked to their low 6 bits and are not validated.
   *
   * @param {number} i - Byte index relative to `this.start`.
   * @returns {number} `rune << 3 | width`, or `MachineInputBase.EOF()`.
   */
  step(i) {
    i += this.start;
    if (i >= this.end) {
      return MachineInputBase.EOF();
    }
    let x = this.bytes[i++] & 255;
    if ((x & 128) === 0) {
      return x << 3 | 1;
    } else if ((x & 224) === 192) {
      x = x & 31;
      if (i >= this.end) {
        return MachineInputBase.EOF();
      }
      x = x << 6 | this.bytes[i++] & 63;
      return x << 3 | 2;
    } else if ((x & 240) === 224) {
      x = x & 15;
      if (i + 1 >= this.end) {
        return MachineInputBase.EOF();
      }
      x = x << 6 | this.bytes[i++] & 63;
      x = x << 6 | this.bytes[i++] & 63;
      return x << 3 | 3;
    } else {
      x = x & 7;
      if (i + 2 >= this.end) {
        return MachineInputBase.EOF();
      }
      x = x << 6 | this.bytes[i++] & 63;
      x = x << 6 | this.bytes[i++] & 63;
      x = x << 6 | this.bytes[i++] & 63;
      return x << 3 | 4;
    }
  }

  /**
   * Finds `re2.prefixUTF8` in the input at or after `pos`.
   *
   * @param {{prefixUTF8: number[]}} re2 - Object carrying the literal prefix bytes.
   * @param {number} pos - Byte index relative to `this.start`.
   * @returns {number} Offset of the prefix relative to `pos`, or a negative
   *   value if it is not found.
   */
  index(re2, pos) {
    pos += this.start;
    const i = this.indexOf(this.bytes, re2.prefixUTF8, pos);
    return i < 0 ? i : i - pos;
  }

  /**
   * Computes the empty-width context at `pos` from the runes on either side.
   *
   * @param {number} pos - Byte index relative to `this.start`.
   * @returns {number} Whatever `Utils.emptyOpContext(r1, r2)` returns (a bitmask
   *   of EMPTY_* flags), where `r1` is the rune before `pos` and `r2` the rune
   *   at `pos`; each is -1 when outside the [start, end) window.
   */
  context(pos) {
    pos += this.start;
    let r1 = -1;
    if (pos > this.start && pos <= this.end) {
      let start = pos - 1;
      // Mask like step() does so the >= 128 test also works for signed bytes.
      r1 = this.bytes[start--] & 255;
      if (r1 >= 128) {
        // Multi-byte rune: walk back over continuation bytes (10xxxxxx),
        // at most 4 bytes and never before the start of the window.
        let lim = pos - 4;
        if (lim < this.start) {
          lim = this.start;
        }
        while (start >= lim && (this.bytes[start] & 192) === 128) {
          start--;
        }
        if (start < this.start) {
          start = this.start;
        }
        // step() adds this.start itself, so hand it a relative index.
        r1 = this.step(start - this.start) >> 3;
      }
    }
    // Same conversion here: pos is absolute at this point.
    const r2 = pos < this.end ? this.step(pos - this.start) >> 3 : -1;
    return Utils.emptyOpContext(r1, r2);
  }

  /**
   * Returns the index of the first occurrence of array `target` within array
   * `source` at or after `fromIndex`, or -1 if not found. An empty `target`
   * is reported as not found.
   *
   * @param {number[]} source - Array to search; scanned up to `source.length`.
   * @param {number[]} target - Sequence to look for.
   * @param {number} [fromIndex=0] - First index of `source` to consider.
   * @returns {number} Absolute index into `source`, or -1.
   */
  indexOf(source, target, fromIndex = 0) {
    const targetLength = target.length;
    if (targetLength === 0) {
      return -1;
    }
    const sourceLength = source.length;
    for (let i = fromIndex; i <= sourceLength - targetLength; i++) {
      for (let j = 0; j < targetLength; j++) {
        if (source[i + j] !== target[j]) {
          break;
        } else if (j === targetLength - 1) {
          return i;
        }
      }
    }
    return -1;
  }
}
|
|
974
|
+
|
|
975
|
+
// MachineInput backed by a JavaScript string.
// |pos| and |width| are in JS "char" (UTF-16 code unit) units, relative to |start|.
class MachineUTF16Input extends MachineInputBase {
  /**
   * @param {string} charSequence - Text to read.
   * @param {number} [start=0] - Absolute index of the first code unit to read.
   * @param {number} [end=charSequence.length] - Absolute index one past the last one.
   */
  constructor(charSequence, start = 0, end = charSequence.length) {
    super();
    this.charSequence = charSequence;
    this.start = start;
    this.end = end;
  }

  /**
   * Reads the code point at relative index `pos`.
   *
   * @param {number} pos - Index relative to `this.start`.
   * @returns {number} `rune << 3 | width`, with the width taken from
   *   `Utils.charCount(rune)`, or `MachineInputBase.EOF()` once the absolute
   *   index is no longer below `this.end`.
   */
  step(pos) {
    const at = pos + this.start;
    if (!(at < this.end)) {
      return MachineInputBase.EOF();
    }
    const codePoint = this.charSequence.codePointAt(at);
    return codePoint << 3 | Utils.charCount(codePoint);
  }

  /**
   * Finds `re2.prefix` in the string at or after `pos`.
   *
   * @param {{prefix: string}} re2 - Object carrying the literal prefix.
   * @param {number} pos - Index relative to `this.start`.
   * @returns {number} Offset of the prefix relative to `pos`, or a negative
   *   value if it is not found.
   */
  index(re2, pos) {
    const from = pos + this.start;
    const found = this.charSequence.indexOf(re2.prefix, from);
    if (found < 0) {
      return found;
    }
    return found - from;
  }

  /**
   * Computes the empty-width context at `pos`.
   *
   * The neighbours are bounded by the full string length, not by
   * `this.start` / `this.end`.
   *
   * @param {number} pos - Index relative to `this.start`.
   * @returns {number} Whatever `Utils.emptyOpContext(r1, r2)` returns (a bitmask
   *   of EMPTY_* flags); a missing neighbour is passed as -1.
   */
  context(pos) {
    const at = pos + this.start;
    const text = this.charSequence;
    let before = -1;
    if (at > 0 && at <= text.length) {
      before = text.codePointAt(at - 1);
    }
    let after = -1;
    if (at < text.length) {
      after = text.codePointAt(at);
    }
    return Utils.emptyOpContext(before, after);
  }
}
|
|
1016
|
+
/**
 * Factory for the two concrete machine inputs.
 */
class MachineInput {
  /**
   * Wraps a UTF-8 byte array.
   *
   * @param {number[]} bytes - UTF-8 encoded input.
   * @param {number} [start=0] - First index to read.
   * @param {number} [end=bytes.length] - Index one past the last one to read.
   * @returns {MachineUTF8Input}
   */
  static fromUTF8(bytes, start = 0, end = bytes.length) {
    const reader = new MachineUTF8Input(bytes, start, end);
    return reader;
  }

  /**
   * Wraps a JavaScript string.
   *
   * @param {string} charSequence - Text to read.
   * @param {number} [start=0] - First index to read.
   * @param {number} [end=charSequence.length] - Index one past the last one to read.
   * @returns {MachineUTF16Input}
   */
  static fromUTF16(charSequence, start = 0, end = charSequence.length) {
    const reader = new MachineUTF16Input(charSequence, start, end);
    return reader;
  }
}
|
|
1024
|
+
|
|
850
1025
|
class RE2JSException extends Error {
|
|
851
1026
|
/** @param {string} message */
|
|
852
1027
|
constructor(message) {
|
|
@@ -927,17 +1102,6 @@
|
|
|
927
1102
|
}
|
|
928
1103
|
}
|
|
929
1104
|
|
|
930
|
-
/**
|
|
931
|
-
* An exception thrown by DFA
|
|
932
|
-
*/
|
|
933
|
-
class RE2JSDfaMemoryException extends RE2JSException {
|
|
934
|
-
/** @param {string} message */
|
|
935
|
-
constructor(message) {
|
|
936
|
-
super(message);
|
|
937
|
-
this.name = 'RE2JSDfaMemoryException';
|
|
938
|
-
}
|
|
939
|
-
}
|
|
940
|
-
|
|
941
1105
|
/**
|
|
942
1106
|
* A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
|
|
943
1107
|
*
|
|
@@ -1494,181 +1658,6 @@
|
|
|
1494
1658
|
}
|
|
1495
1659
|
}
|
|
1496
1660
|
|
|
1497
|
-
/**
|
|
1498
|
-
* MachineInput abstracts different representations of the input text supplied to the Machine. It
|
|
1499
|
-
* provides one-character lookahead.
|
|
1500
|
-
*/
|
|
1501
|
-
class MachineInputBase {
|
|
1502
|
-
static EOF() {
|
|
1503
|
-
return -1 << 3;
|
|
1504
|
-
}
|
|
1505
|
-
|
|
1506
|
-
// can we look ahead without losing info?
|
|
1507
|
-
canCheckPrefix() {
|
|
1508
|
-
return true;
|
|
1509
|
-
}
|
|
1510
|
-
|
|
1511
|
-
// Returns the end position in the same units as step().
|
|
1512
|
-
endPos() {
|
|
1513
|
-
return this.end;
|
|
1514
|
-
}
|
|
1515
|
-
}
|
|
1516
|
-
|
|
1517
|
-
// An implementation of MachineInput for UTF-8 byte arrays.
|
|
1518
|
-
// |pos| and |width| are byte indices.
|
|
1519
|
-
class MachineUTF8Input extends MachineInputBase {
|
|
1520
|
-
constructor(bytes, start = 0, end = bytes.length) {
|
|
1521
|
-
super();
|
|
1522
|
-
this.bytes = bytes;
|
|
1523
|
-
this.start = start;
|
|
1524
|
-
this.end = end;
|
|
1525
|
-
}
|
|
1526
|
-
|
|
1527
|
-
// Returns the rune at the specified index; the units are
|
|
1528
|
-
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1529
|
-
// indices. Returns the width (in the same units) of the rune in
|
|
1530
|
-
// the lower 3 bits, and the rune (Unicode code point) in the high
|
|
1531
|
-
// bits. Never negative, except for EOF which is represented as -1
|
|
1532
|
-
// << 3 | 0.
|
|
1533
|
-
step(i) {
|
|
1534
|
-
i += this.start;
|
|
1535
|
-
if (i >= this.end) {
|
|
1536
|
-
return MachineInputBase.EOF();
|
|
1537
|
-
}
|
|
1538
|
-
let x = this.bytes[i++] & 255;
|
|
1539
|
-
if ((x & 128) === 0) {
|
|
1540
|
-
return x << 3 | 1;
|
|
1541
|
-
} else if ((x & 224) === 192) {
|
|
1542
|
-
x = x & 31;
|
|
1543
|
-
if (i >= this.end) {
|
|
1544
|
-
return MachineInputBase.EOF();
|
|
1545
|
-
}
|
|
1546
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1547
|
-
return x << 3 | 2;
|
|
1548
|
-
} else if ((x & 240) === 224) {
|
|
1549
|
-
x = x & 15;
|
|
1550
|
-
if (i + 1 >= this.end) {
|
|
1551
|
-
return MachineInputBase.EOF();
|
|
1552
|
-
}
|
|
1553
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1554
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1555
|
-
return x << 3 | 3;
|
|
1556
|
-
} else {
|
|
1557
|
-
x = x & 7;
|
|
1558
|
-
if (i + 2 >= this.end) {
|
|
1559
|
-
return MachineInputBase.EOF();
|
|
1560
|
-
}
|
|
1561
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1562
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1563
|
-
x = x << 6 | this.bytes[i++] & 63;
|
|
1564
|
-
return x << 3 | 4;
|
|
1565
|
-
}
|
|
1566
|
-
}
|
|
1567
|
-
|
|
1568
|
-
// Returns the index relative to |pos| at which |re2.prefix| is found
|
|
1569
|
-
// in this input stream, or a negative value if not found.
|
|
1570
|
-
index(re2, pos) {
|
|
1571
|
-
pos += this.start;
|
|
1572
|
-
const i = this.indexOf(this.bytes, re2.prefixUTF8, pos);
|
|
1573
|
-
return i < 0 ? i : i - pos;
|
|
1574
|
-
}
|
|
1575
|
-
|
|
1576
|
-
// Returns a bitmask of EMPTY_* flags.
|
|
1577
|
-
context(pos) {
|
|
1578
|
-
pos += this.start;
|
|
1579
|
-
let r1 = -1;
|
|
1580
|
-
if (pos > this.start && pos <= this.end) {
|
|
1581
|
-
let start = pos - 1;
|
|
1582
|
-
r1 = this.bytes[start--];
|
|
1583
|
-
if (r1 >= 128) {
|
|
1584
|
-
let lim = pos - 4;
|
|
1585
|
-
if (lim < this.start) {
|
|
1586
|
-
lim = this.start;
|
|
1587
|
-
}
|
|
1588
|
-
while (start >= lim && (this.bytes[start] & 192) === 128) {
|
|
1589
|
-
start--;
|
|
1590
|
-
}
|
|
1591
|
-
if (start < this.start) {
|
|
1592
|
-
start = this.start;
|
|
1593
|
-
}
|
|
1594
|
-
r1 = this.step(start) >> 3;
|
|
1595
|
-
}
|
|
1596
|
-
}
|
|
1597
|
-
const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
|
|
1598
|
-
return Utils.emptyOpContext(r1, r2);
|
|
1599
|
-
}
|
|
1600
|
-
|
|
1601
|
-
// Returns the index of the first occurrence of array |target| within
|
|
1602
|
-
// array |source| after |fromIndex|, or -1 if not found.
|
|
1603
|
-
indexOf(source, target, fromIndex = 0) {
|
|
1604
|
-
let targetLength = target.length;
|
|
1605
|
-
if (targetLength === 0) {
|
|
1606
|
-
return -1;
|
|
1607
|
-
}
|
|
1608
|
-
let sourceLength = source.length;
|
|
1609
|
-
for (let i = fromIndex; i <= sourceLength - targetLength; i++) {
|
|
1610
|
-
for (let j = 0; j < targetLength; j++) {
|
|
1611
|
-
if (source[i + j] !== target[j]) {
|
|
1612
|
-
break;
|
|
1613
|
-
} else if (j === targetLength - 1) {
|
|
1614
|
-
return i;
|
|
1615
|
-
}
|
|
1616
|
-
}
|
|
1617
|
-
}
|
|
1618
|
-
return -1;
|
|
1619
|
-
}
|
|
1620
|
-
}
|
|
1621
|
-
|
|
1622
|
-
// |pos| and |width| are in JS "char" units.
|
|
1623
|
-
class MachineUTF16Input extends MachineInputBase {
|
|
1624
|
-
constructor(charSequence, start = 0, end = charSequence.length) {
|
|
1625
|
-
super();
|
|
1626
|
-
this.charSequence = charSequence;
|
|
1627
|
-
this.start = start;
|
|
1628
|
-
this.end = end;
|
|
1629
|
-
}
|
|
1630
|
-
|
|
1631
|
-
// Returns the rune at the specified index; the units are
|
|
1632
|
-
// unspecified, but could be UTF-8 byte, UTF-16 char, or rune
|
|
1633
|
-
// indices. Returns the width (in the same units) of the rune in
|
|
1634
|
-
// the lower 3 bits, and the rune (Unicode code point) in the high
|
|
1635
|
-
// bits. Never negative, except for EOF which is represented as -1
|
|
1636
|
-
// << 3 | 0.
|
|
1637
|
-
step(pos) {
|
|
1638
|
-
pos += this.start;
|
|
1639
|
-
if (pos < this.end) {
|
|
1640
|
-
const rune = this.charSequence.codePointAt(pos);
|
|
1641
|
-
return rune << 3 | Utils.charCount(rune);
|
|
1642
|
-
} else {
|
|
1643
|
-
return MachineInputBase.EOF();
|
|
1644
|
-
}
|
|
1645
|
-
}
|
|
1646
|
-
|
|
1647
|
-
// Returns the index relative to |pos| at which |re2.prefix| is found
|
|
1648
|
-
// in this input stream, or a negative value if not found.
|
|
1649
|
-
index(re2, pos) {
|
|
1650
|
-
pos += this.start;
|
|
1651
|
-
const i = this.charSequence.indexOf(re2.prefix, pos);
|
|
1652
|
-
return i < 0 ? i : i - pos;
|
|
1653
|
-
}
|
|
1654
|
-
|
|
1655
|
-
// Returns a bitmask of EMPTY_* flags.
|
|
1656
|
-
context(pos) {
|
|
1657
|
-
pos += this.start;
|
|
1658
|
-
const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
|
|
1659
|
-
const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
|
|
1660
|
-
return Utils.emptyOpContext(r1, r2);
|
|
1661
|
-
}
|
|
1662
|
-
}
|
|
1663
|
-
class MachineInput {
|
|
1664
|
-
static fromUTF8(bytes, start = 0, end = bytes.length) {
|
|
1665
|
-
return new MachineUTF8Input(bytes, start, end);
|
|
1666
|
-
}
|
|
1667
|
-
static fromUTF16(charSequence, start = 0, end = charSequence.length) {
|
|
1668
|
-
return new MachineUTF16Input(charSequence, start, end);
|
|
1669
|
-
}
|
|
1670
|
-
}
|
|
1671
|
-
|
|
1672
1661
|
/**
|
|
1673
1662
|
* A single instruction in the regular expression virtual machine.
|
|
1674
1663
|
*
|
|
@@ -2127,21 +2116,40 @@
|
|
|
2127
2116
|
}
|
|
2128
2117
|
}
|
|
2129
2118
|
|
|
2119
|
+
// 32-bit FNV-1a style hash over a list of integers. Each element is XORed into
// the running hash as a whole value (not byte by byte) and then multiplied by
// the FNV prime; the result is a signed 32-bit integer.
const hashPCs = pcs => {
  const OFFSET_BASIS = -2128831035; // 0x811c9dc5 read as a signed 32-bit value
  const PRIME = 16777619; // 0x01000193
  let hash = OFFSET_BASIS;
  let idx = 0;
  while (idx < pcs.length) {
    hash = Math.imul(hash ^ pcs[idx], PRIME);
    idx += 1;
  }
  return hash;
};
|
|
2129
|
+
|
|
2130
|
+
// Element-wise strict equality of two array-likes, used to tell apart states
// whose hashes collide. Different lengths are never equal.
const arraysEqual = (a, b) => {
  const size = a.length;
  if (size !== b.length) {
    return false;
  }
  let k = 0;
  while (k < size && a[k] === b[k]) {
    k += 1;
  }
  // Reaching the end means no mismatch was seen.
  return k === size;
};
|
|
2130
2138
|
/**
 * One DFA state: a set of NFA program counters plus its cached outgoing
 * transitions.
 */
class DFAState {
  /**
   * @param {Int32Array} nfaStates - Instruction PCs that make up this state.
   * @param {boolean} isMatch - Whether this state is a matching state.
   */
  constructor(nfaStates, isMatch) {
    this.nfaStates = nfaStates;
    this.isMatch = isMatch;
    // Flat table for ASCII lookups, one slot per code point up to
    // Unicode.MAX_ASCII, each initially null.
    this.nextAscii = Array.from({ length: Unicode.MAX_ASCII + 1 }, () => null);
    // Cache of char -> DFAState for everything that is not in the flat table.
    this.nextMap = new Map();
  }
}
|
|
2139
2146
|
class DFA {
|
|
2140
2147
|
constructor(prog) {
|
|
2141
2148
|
this.prog = prog;
|
|
2142
|
-
this.stateCache = new Map(); //
|
|
2149
|
+
this.stateCache = new Map(); // hash(number) -> DFAState[]
|
|
2150
|
+
this.stateCount = 0; // Tracks total states for memory limits
|
|
2143
2151
|
this.startState = null;
|
|
2144
|
-
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2152
|
+
this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
|
|
2145
2153
|
}
|
|
2146
2154
|
|
|
2147
2155
|
// Follows epsilon (empty) transitions to find all reachable states without consuming a char
|
|
@@ -2185,17 +2193,37 @@
|
|
|
2185
2193
|
const closureResult = this.computeClosure(pcs);
|
|
2186
2194
|
if (!closureResult) return null; // Bailout to NFA required
|
|
2187
2195
|
|
|
2188
|
-
const
|
|
2189
|
-
|
|
2190
|
-
|
|
2196
|
+
const sortedPCs = closureResult.pcs;
|
|
2197
|
+
const hash = hashPCs(sortedPCs);
|
|
2198
|
+
|
|
2199
|
+
// Lookup hash bucket
|
|
2200
|
+
let bucket = this.stateCache.get(hash);
|
|
2201
|
+
if (bucket) {
|
|
2202
|
+
// Resolve potential hash collisions
|
|
2203
|
+
for (let i = 0; i < bucket.length; i++) {
|
|
2204
|
+
const state = bucket[i];
|
|
2205
|
+
if (arraysEqual(state.nfaStates, sortedPCs)) {
|
|
2206
|
+
return state;
|
|
2207
|
+
}
|
|
2208
|
+
}
|
|
2209
|
+
} else {
|
|
2210
|
+
bucket = [];
|
|
2211
|
+
this.stateCache.set(hash, bucket);
|
|
2191
2212
|
}
|
|
2192
2213
|
|
|
2193
2214
|
// Safety: prevent memory exhaustion from state explosion
|
|
2194
|
-
|
|
2195
|
-
|
|
2215
|
+
// We flush the cache and return null, which seamlessly routes execution to the NFA
|
|
2216
|
+
if (this.stateCount >= this.stateLimit) {
|
|
2217
|
+
this.stateCache.clear();
|
|
2218
|
+
this.stateCount = 0;
|
|
2219
|
+
this.startState = null;
|
|
2220
|
+
return null;
|
|
2196
2221
|
}
|
|
2197
|
-
|
|
2198
|
-
|
|
2222
|
+
|
|
2223
|
+
// State not found, create it and add to bucket
|
|
2224
|
+
const state = new DFAState(sortedPCs, closureResult.isMatch);
|
|
2225
|
+
bucket.push(state);
|
|
2226
|
+
this.stateCount++;
|
|
2199
2227
|
return state;
|
|
2200
2228
|
}
|
|
2201
2229
|
|
|
@@ -2259,6 +2287,11 @@
|
|
|
2259
2287
|
const r = input.step(i);
|
|
2260
2288
|
const rune = r >> 3;
|
|
2261
2289
|
const width = r & 7;
|
|
2290
|
+
|
|
2291
|
+
// prevent infinite loop on EOF
|
|
2292
|
+
if (width === 0) {
|
|
2293
|
+
break;
|
|
2294
|
+
}
|
|
2262
2295
|
currentState = this.step(currentState, rune, anchor);
|
|
2263
2296
|
|
|
2264
2297
|
// If we hit an unrecoverable DFA error or bailout, signal fallback
|
|
@@ -5421,18 +5454,10 @@
|
|
|
5421
5454
|
if (ncap > 0) {
|
|
5422
5455
|
return this.doExecuteNFA(input, pos, anchor, ncap);
|
|
5423
5456
|
}
|
|
5424
|
-
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
|
|
5428
|
-
return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
|
|
5429
|
-
}
|
|
5430
|
-
} catch (e) {
|
|
5431
|
-
if (e instanceof RE2JSDfaMemoryException) {
|
|
5432
|
-
this.dfa = new DFA(this.prog); // flush cache
|
|
5433
|
-
} else {
|
|
5434
|
-
throw e;
|
|
5435
|
-
}
|
|
5457
|
+
const dfaResult = this.dfa.match(input, pos, anchor);
|
|
5458
|
+
if (dfaResult !== null) {
|
|
5459
|
+
// DFA succeeded (returned true or false)
|
|
5460
|
+
return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
|
|
5436
5461
|
}
|
|
5437
5462
|
|
|
5438
5463
|
// Fallback to NFA
|
|
@@ -6316,6 +6341,39 @@
|
|
|
6316
6341
|
return new Matcher(this, input);
|
|
6317
6342
|
}
|
|
6318
6343
|
|
|
6344
|
+
/**
|
|
6345
|
+
* Tests whether the regular expression matches any part of the input string.
|
|
6346
|
+
* Performance Note: This method is highly optimized. Because it only returns
|
|
6347
|
+
* a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
|
|
6348
|
+
* and guarantees execution on the high-speed DFA engine whenever possible.
|
|
6349
|
+
*
|
|
6350
|
+
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
6351
|
+
* @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
|
|
6352
|
+
*/
|
|
6353
|
+
test(input) {
|
|
6354
|
+
if (Array.isArray(input)) {
|
|
6355
|
+
// Reuse the existing UTF-8 fast-path method
|
|
6356
|
+
return this.re2Input.matchUTF8(input);
|
|
6357
|
+
}
|
|
6358
|
+
|
|
6359
|
+
// Reuse the existing UTF-16 fast-path method
|
|
6360
|
+
return this.re2Input.match(input);
|
|
6361
|
+
}
|
|
6362
|
+
|
|
6363
|
+
/**
|
|
6364
|
+
* Tests whether the regular expression matches the ENTIRE input string.
|
|
6365
|
+
* **Performance Note:** This operates identically to `.matches()`, but is significantly
|
|
6366
|
+
* faster because it does not request capture group data. By requesting 0 capture groups,
|
|
6367
|
+
* it securely routes execution through the DFA fast-path.
|
|
6368
|
+
*
|
|
6369
|
+
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
6370
|
+
* @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
|
|
6371
|
+
*/
|
|
6372
|
+
testExact(input) {
|
|
6373
|
+
const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
|
|
6374
|
+
return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
|
|
6375
|
+
}
|
|
6376
|
+
|
|
6319
6377
|
/**
|
|
6320
6378
|
* Splits input around instances of the regular expression. It returns an array giving the strings
|
|
6321
6379
|
* that occur before, between, and after instances of the regular expression.
|
|
@@ -6437,7 +6495,6 @@
|
|
|
6437
6495
|
exports.Matcher = Matcher;
|
|
6438
6496
|
exports.RE2JS = RE2JS;
|
|
6439
6497
|
exports.RE2JSCompileException = RE2JSCompileException;
|
|
6440
|
-
exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
|
|
6441
6498
|
exports.RE2JSException = RE2JSException;
|
|
6442
6499
|
exports.RE2JSFlagsException = RE2JSFlagsException;
|
|
6443
6500
|
exports.RE2JSGroupException = RE2JSGroupException;
|