re2js 2.2.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/build/index.cjs.cjs +256 -74
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +54 -79
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +256 -74
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +256 -74
- package/build/index.umd.js.map +1 -1
- package/package.json +4 -2
package/README.md
CHANGED
|
@@ -473,6 +473,8 @@ Parameters:
|
|
|
473
473
|
- `$1, $2, ...` refer to the corresponding capture groups in the pattern
|
|
474
474
|
- `$$` inserts a literal `$`
|
|
475
475
|
- `$<name>` can be used to reference named capture groups
|
|
476
|
+
- `` $` `` inserts the portion of the string that precedes the matched substring
|
|
477
|
+
- `$'` inserts the portion of the string that follows the matched substring
|
|
476
478
|
- on invalid group - ignore it
|
|
477
479
|
- `javaMode (Boolean)`: If set to `true`, the replacement follows Java's rules for replacement. Defaults to `false`. If `javaMode = true`, changed rules for capture groups and special characters:
|
|
478
480
|
- `$0` refers to the entire matched substring
|
|
@@ -575,6 +577,10 @@ const unicodeRegexp = RE2JS.translateRegExp('\\u{1F600}') // '\\x{1F600}'
|
|
|
575
577
|
|
|
576
578
|
RE2JS.matches(unicodeRegexp, '😀') // true
|
|
577
579
|
RE2JS.matches(unicodeRegexp, '😃') // false
|
|
580
|
+
|
|
581
|
+
// also support native Regex
|
|
582
|
+
RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
|
|
583
|
+
RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
|
|
578
584
|
```
|
|
579
585
|
|
|
580
586
|
## Performance and Architecture
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.2
|
|
5
|
+
* @version v2.3.0
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -96,7 +96,7 @@ for (let i = 0; i < ASCII_SIZE; i++) {
|
|
|
96
96
|
}
|
|
97
97
|
class Codepoint {
|
|
98
98
|
// codePointAt(0)
|
|
99
|
-
static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
|
|
99
|
+
static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ["'", 39], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['`', 96], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
|
|
100
100
|
|
|
101
101
|
// convert unicode codepoint to upper case codepoint
|
|
102
102
|
// return same codepoint, if cannot do it (or codepoint not have upper variation)
|
|
@@ -150,10 +150,6 @@ class UnicodeRangeTable {
|
|
|
150
150
|
getStride(index) {
|
|
151
151
|
return this.isStride1 ? 1 : this.data[index * this.SIZE + 2];
|
|
152
152
|
}
|
|
153
|
-
get(index) {
|
|
154
|
-
const i = index * this.SIZE;
|
|
155
|
-
return [this.data[i], this.data[i + 1], this.getStride(index)];
|
|
156
|
-
}
|
|
157
153
|
get length() {
|
|
158
154
|
return this.data.length / this.SIZE;
|
|
159
155
|
}
|
|
@@ -650,6 +646,9 @@ class Utils {
|
|
|
650
646
|
static emptyInts() {
|
|
651
647
|
return [];
|
|
652
648
|
}
|
|
649
|
+
static isByteArray(input) {
|
|
650
|
+
return Array.isArray(input) || input instanceof Uint8Array;
|
|
651
|
+
}
|
|
653
652
|
|
|
654
653
|
// Returns true iff |c| is an ASCII letter or decimal digit.
|
|
655
654
|
static isalnum(c) {
|
|
@@ -951,7 +950,7 @@ class Utf16MatcherInput extends MatcherInputBase {
|
|
|
951
950
|
* @returns {number[]}
|
|
952
951
|
*/
|
|
953
952
|
asBytes() {
|
|
954
|
-
return this.charSequence.toString()
|
|
953
|
+
return Utils.stringToUtf8ByteArray(this.charSequence.toString());
|
|
955
954
|
}
|
|
956
955
|
|
|
957
956
|
/**
|
|
@@ -976,7 +975,7 @@ class MatcherInput {
|
|
|
976
975
|
* @returns {Utf8MatcherInput}
|
|
977
976
|
*/
|
|
978
977
|
static utf8(input) {
|
|
979
|
-
if (
|
|
978
|
+
if (Utils.isByteArray(input)) {
|
|
980
979
|
return new Utf8MatcherInput(input);
|
|
981
980
|
}
|
|
982
981
|
return new Utf8MatcherInput(Utils.stringToUtf8ByteArray(input));
|
|
@@ -1108,10 +1107,10 @@ class MachineUTF8Input extends MachineInputBase {
|
|
|
1108
1107
|
if (start < this.start) {
|
|
1109
1108
|
start = this.start;
|
|
1110
1109
|
}
|
|
1111
|
-
r1 = this.step(start) >> 3;
|
|
1110
|
+
r1 = this.step(start - this.start) >> 3;
|
|
1112
1111
|
}
|
|
1113
1112
|
}
|
|
1114
|
-
const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
|
|
1113
|
+
const r2 = pos < this.end ? this.step(pos - this.start) >> 3 : -1;
|
|
1115
1114
|
return Utils.emptyOpContext(r1, r2);
|
|
1116
1115
|
}
|
|
1117
1116
|
|
|
@@ -1193,14 +1192,17 @@ class MachineUTF16Input extends MachineInputBase {
|
|
|
1193
1192
|
index(re2, pos) {
|
|
1194
1193
|
pos += this.start;
|
|
1195
1194
|
const i = this.charSequence.indexOf(re2.prefix, pos);
|
|
1196
|
-
|
|
1195
|
+
if (i < 0 || i > this.end - re2.prefix.length) {
|
|
1196
|
+
return -1;
|
|
1197
|
+
}
|
|
1198
|
+
return i - pos;
|
|
1197
1199
|
}
|
|
1198
1200
|
|
|
1199
1201
|
// Returns a bitmask of EMPTY_* flags.
|
|
1200
1202
|
context(pos) {
|
|
1201
1203
|
pos += this.start;
|
|
1202
|
-
const r1 = pos >
|
|
1203
|
-
const r2 = pos < this.
|
|
1204
|
+
const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
|
|
1205
|
+
const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
|
|
1204
1206
|
return Utils.emptyOpContext(r1, r2);
|
|
1205
1207
|
}
|
|
1206
1208
|
prefixLength(re2) {
|
|
@@ -1329,6 +1331,11 @@ class RE2JSInternalException extends RE2JSException {
|
|
|
1329
1331
|
*
|
|
1330
1332
|
* @author rsc@google.com (Russ Cox)
|
|
1331
1333
|
*/
|
|
1334
|
+
|
|
1335
|
+
/**
|
|
1336
|
+
* @typedef {import('./index').RE2JS} RE2JS_Pattern
|
|
1337
|
+
*/
|
|
1338
|
+
|
|
1332
1339
|
class Matcher {
|
|
1333
1340
|
/**
|
|
1334
1341
|
* Quotes '\' and '$' in {@code s}, so that the returned string could be used in
|
|
@@ -1366,14 +1373,17 @@ class Matcher {
|
|
|
1366
1373
|
}
|
|
1367
1374
|
/**
|
|
1368
1375
|
*
|
|
1369
|
-
* @param {
|
|
1370
|
-
* @param {
|
|
1376
|
+
* @param {RE2JS_Pattern} pattern
|
|
1377
|
+
* @param {Uint8Array|number[]|string} input
|
|
1371
1378
|
*/
|
|
1372
1379
|
constructor(pattern, input) {
|
|
1373
1380
|
if (pattern === null) {
|
|
1374
1381
|
throw new Error('pattern is null');
|
|
1375
1382
|
}
|
|
1376
|
-
|
|
1383
|
+
/**
|
|
1384
|
+
* The pattern being matched.
|
|
1385
|
+
* @type {RE2JS_Pattern}
|
|
1386
|
+
*/
|
|
1377
1387
|
this.patternInput = pattern;
|
|
1378
1388
|
const re2 = this.patternInput.re2();
|
|
1379
1389
|
// The number of submatches (groups) in the pattern.
|
|
@@ -1388,7 +1398,7 @@ class Matcher {
|
|
|
1388
1398
|
this.numberOfInstructions = re2.numberOfInstructions();
|
|
1389
1399
|
if (input instanceof MatcherInputBase) {
|
|
1390
1400
|
this.resetMatcherInput(input);
|
|
1391
|
-
} else if (
|
|
1401
|
+
} else if (Utils.isByteArray(input)) {
|
|
1392
1402
|
this.resetMatcherInput(MatcherInput.utf8(input));
|
|
1393
1403
|
} else {
|
|
1394
1404
|
this.resetMatcherInput(MatcherInput.utf16(input));
|
|
@@ -1397,7 +1407,7 @@ class Matcher {
|
|
|
1397
1407
|
|
|
1398
1408
|
/**
|
|
1399
1409
|
* Returns the {@code RE2JS} associated with this {@code Matcher}.
|
|
1400
|
-
* @returns {
|
|
1410
|
+
* @returns {RE2JS_Pattern}
|
|
1401
1411
|
*/
|
|
1402
1412
|
pattern() {
|
|
1403
1413
|
return this.patternInput;
|
|
@@ -1427,7 +1437,7 @@ class Matcher {
|
|
|
1427
1437
|
|
|
1428
1438
|
/**
|
|
1429
1439
|
* Resets the {@code Matcher} and changes the input.
|
|
1430
|
-
* @param {
|
|
1440
|
+
* @param {import('./MatcherInput').MatcherInputBase} input
|
|
1431
1441
|
* @returns {Matcher} the {@code Matcher} itself, for chained method calls
|
|
1432
1442
|
*/
|
|
1433
1443
|
resetMatcherInput(input) {
|
|
@@ -1492,7 +1502,7 @@ class Matcher {
|
|
|
1492
1502
|
/**
|
|
1493
1503
|
* Returns the named group of the most recent match, or {@code null} if the group was not matched.
|
|
1494
1504
|
* @param {string|number} [group=0]
|
|
1495
|
-
* @returns {
|
|
1505
|
+
* @returns {string|null}
|
|
1496
1506
|
*/
|
|
1497
1507
|
group(group = 0) {
|
|
1498
1508
|
if (typeof group === 'string') {
|
|
@@ -1550,10 +1560,7 @@ class Matcher {
|
|
|
1550
1560
|
if (group === 0 || this.hasGroups) {
|
|
1551
1561
|
return;
|
|
1552
1562
|
}
|
|
1553
|
-
|
|
1554
|
-
if (end > this.matcherInputLength) {
|
|
1555
|
-
end = this.matcherInputLength;
|
|
1556
|
-
}
|
|
1563
|
+
const end = this.matcherInputLength;
|
|
1557
1564
|
const res = this.patternInput.re2().matchMachineInput(this.matcherInput, this.groups[0], end, this.anchorFlag, 1 + this.patternGroupCount);
|
|
1558
1565
|
const ok = res[0];
|
|
1559
1566
|
if (!ok) {
|
|
@@ -1587,7 +1594,7 @@ class Matcher {
|
|
|
1587
1594
|
* Matches the input against the pattern (unanchored), starting at a specified position. If there
|
|
1588
1595
|
* is a match, {@code find} sets the match state to describe it.
|
|
1589
1596
|
*
|
|
1590
|
-
* @param {number} [start=null] the input position where the search begins
|
|
1597
|
+
* @param {number|null} [start=null] the input position where the search begins
|
|
1591
1598
|
* @returns {boolean} if it finds a match
|
|
1592
1599
|
* @throws IndexOutOfBoundsException if start is not a valid input position
|
|
1593
1600
|
*/
|
|
@@ -1749,7 +1756,10 @@ class Matcher {
|
|
|
1749
1756
|
throw new RE2JSGroupException("named capture group is missing trailing '}'");
|
|
1750
1757
|
}
|
|
1751
1758
|
const groupName = replacement.substring(i + 1, j);
|
|
1752
|
-
|
|
1759
|
+
const groupVal = this.group(groupName);
|
|
1760
|
+
if (groupVal !== null) {
|
|
1761
|
+
res += groupVal;
|
|
1762
|
+
}
|
|
1753
1763
|
last = j + 1;
|
|
1754
1764
|
i = j;
|
|
1755
1765
|
continue;
|
|
@@ -1795,6 +1805,22 @@ class Matcher {
|
|
|
1795
1805
|
i++;
|
|
1796
1806
|
last = i + 1;
|
|
1797
1807
|
continue;
|
|
1808
|
+
} else if (Codepoint.CODES.get('`') === c) {
|
|
1809
|
+
if (last < i) {
|
|
1810
|
+
res += replacement.substring(last, i);
|
|
1811
|
+
}
|
|
1812
|
+
res += this.substring(0, this.start(0));
|
|
1813
|
+
i++;
|
|
1814
|
+
last = i + 1;
|
|
1815
|
+
continue;
|
|
1816
|
+
} else if (Codepoint.CODES.get("'") === c) {
|
|
1817
|
+
if (last < i) {
|
|
1818
|
+
res += replacement.substring(last, i);
|
|
1819
|
+
}
|
|
1820
|
+
res += this.substring(this.end(0), this.matcherInputLength);
|
|
1821
|
+
i++;
|
|
1822
|
+
last = i + 1;
|
|
1823
|
+
continue;
|
|
1798
1824
|
} else if (Codepoint.CODES.get('1') <= c && c <= Codepoint.CODES.get('9')) {
|
|
1799
1825
|
let n = c - Codepoint.CODES.get('0');
|
|
1800
1826
|
if (last < i) {
|
|
@@ -1837,7 +1863,10 @@ class Matcher {
|
|
|
1837
1863
|
}
|
|
1838
1864
|
const groupName = replacement.substring(i + 1, j);
|
|
1839
1865
|
if (Object.prototype.hasOwnProperty.call(this.namedGroups, groupName)) {
|
|
1840
|
-
|
|
1866
|
+
const groupVal = this.group(groupName);
|
|
1867
|
+
if (groupVal !== null) {
|
|
1868
|
+
res += groupVal;
|
|
1869
|
+
}
|
|
1841
1870
|
} else {
|
|
1842
1871
|
res += `$<${groupName}>`;
|
|
1843
1872
|
}
|
|
@@ -4339,13 +4368,6 @@ class Prog {
|
|
|
4339
4368
|
// start every program with a fail instruction, so we'll never want to point
|
|
4340
4369
|
// at its output link.
|
|
4341
4370
|
|
|
4342
|
-
next(l) {
|
|
4343
|
-
const i = this.inst[l >> 1];
|
|
4344
|
-
if ((l & 1) === 0) {
|
|
4345
|
-
return i.out;
|
|
4346
|
-
}
|
|
4347
|
-
return i.arg;
|
|
4348
|
-
}
|
|
4349
4371
|
patch(l, val) {
|
|
4350
4372
|
let head = l.head;
|
|
4351
4373
|
while (head !== 0) {
|
|
@@ -5675,6 +5697,7 @@ class Parser {
|
|
|
5675
5697
|
case Codepoint.CODES.get('6'):
|
|
5676
5698
|
case Codepoint.CODES.get('7'):
|
|
5677
5699
|
{
|
|
5700
|
+
// Single non-zero digit is a backreference; not supported
|
|
5678
5701
|
if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
|
|
5679
5702
|
break;
|
|
5680
5703
|
}
|
|
@@ -5682,6 +5705,7 @@ class Parser {
|
|
|
5682
5705
|
// eslint-disable-next-line no-fallthrough
|
|
5683
5706
|
case Codepoint.CODES.get('0'):
|
|
5684
5707
|
{
|
|
5708
|
+
// Consume up to three octal digits; already have one.
|
|
5685
5709
|
let r = c - Codepoint.CODES.get('0');
|
|
5686
5710
|
for (let i = 1; i < 3; i++) {
|
|
5687
5711
|
if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
|
|
@@ -7462,7 +7486,11 @@ class RE2 {
|
|
|
7462
7486
|
*/
|
|
7463
7487
|
matchWithGroup(input, start, end, anchor, ngroup) {
|
|
7464
7488
|
if (!(input instanceof MatcherInputBase)) {
|
|
7465
|
-
|
|
7489
|
+
if (Utils.isByteArray(input)) {
|
|
7490
|
+
input = MatcherInput.utf8(input);
|
|
7491
|
+
} else {
|
|
7492
|
+
input = MatcherInput.utf16(input);
|
|
7493
|
+
}
|
|
7466
7494
|
}
|
|
7467
7495
|
return this.matchMachineInput(input, start, end, anchor, ngroup);
|
|
7468
7496
|
}
|
|
@@ -7917,9 +7945,18 @@ class RE2 {
|
|
|
7917
7945
|
}
|
|
7918
7946
|
|
|
7919
7947
|
class RE2Set {
|
|
7948
|
+
/** @type {number} */
|
|
7920
7949
|
static UNANCHORED = RE2Flags.UNANCHORED;
|
|
7950
|
+
/** @type {number} */
|
|
7921
7951
|
static ANCHOR_START = RE2Flags.ANCHOR_START;
|
|
7952
|
+
/** @type {number} */
|
|
7922
7953
|
static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
|
|
7954
|
+
|
|
7955
|
+
/**
|
|
7956
|
+
* Constructs a new RE2Set with the specified anchor mode and flags.
|
|
7957
|
+
* @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
|
|
7958
|
+
* @param {number} [flags=0] - The public flags to apply to all patterns in the set.
|
|
7959
|
+
*/
|
|
7923
7960
|
constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
|
|
7924
7961
|
this.anchor = anchor;
|
|
7925
7962
|
this.jsFlags = flags;
|
|
@@ -7936,6 +7973,14 @@ class RE2Set {
|
|
|
7936
7973
|
this.dfa = null;
|
|
7937
7974
|
this.dummyRe2 = null;
|
|
7938
7975
|
}
|
|
7976
|
+
|
|
7977
|
+
/**
|
|
7978
|
+
* Adds a new regular expression pattern to the set.
|
|
7979
|
+
* Patterns cannot be added after the set has been compiled.
|
|
7980
|
+
* @param {string} pattern - The regular expression pattern to add.
|
|
7981
|
+
* @returns {number} The integer index assigned to the added pattern.
|
|
7982
|
+
* @throws {RE2JSCompileException} If patterns are added after compilation.
|
|
7983
|
+
*/
|
|
7939
7984
|
add(pattern) {
|
|
7940
7985
|
if (this.prog) {
|
|
7941
7986
|
throw new RE2JSCompileException('Cannot add patterns after compile');
|
|
@@ -7954,6 +7999,12 @@ class RE2Set {
|
|
|
7954
7999
|
this.regexps.push(Simplify.simplify(re));
|
|
7955
8000
|
return this.regexps.length - 1;
|
|
7956
8001
|
}
|
|
8002
|
+
|
|
8003
|
+
/**
|
|
8004
|
+
* Compiles the added patterns into a single state machine.
|
|
8005
|
+
* This is automatically called on the first match if not called explicitly.
|
|
8006
|
+
* @returns {void}
|
|
8007
|
+
*/
|
|
7957
8008
|
compile() {
|
|
7958
8009
|
if (this.prog) return;
|
|
7959
8010
|
this.prog = Compiler.compileSet(this.regexps);
|
|
@@ -7966,9 +8017,15 @@ class RE2Set {
|
|
|
7966
8017
|
longest: false
|
|
7967
8018
|
};
|
|
7968
8019
|
}
|
|
8020
|
+
|
|
8021
|
+
/**
|
|
8022
|
+
* Matches the input against the compiled set of regular expressions.
|
|
8023
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
|
|
8024
|
+
* @returns {number[]} An array of indices representing the patterns that successfully matched the input.
|
|
8025
|
+
*/
|
|
7969
8026
|
match(input) {
|
|
7970
8027
|
if (!this.prog) this.compile();
|
|
7971
|
-
const machineInput =
|
|
8028
|
+
const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
|
|
7972
8029
|
let internalAnchor = RE2Flags.UNANCHORED;
|
|
7973
8030
|
if (this.anchor === RE2Set.ANCHOR_START) {
|
|
7974
8031
|
internalAnchor = RE2Flags.ANCHOR_START;
|
|
@@ -7991,13 +8048,19 @@ class RE2Set {
|
|
|
7991
8048
|
* Transform JS regex string to RE2 regex string
|
|
7992
8049
|
*/
|
|
7993
8050
|
class TranslateRegExpString {
|
|
7994
|
-
static isUpperCaseAlpha(ch) {
|
|
7995
|
-
return 'A' <= ch && ch <= 'Z';
|
|
7996
|
-
}
|
|
7997
8051
|
static isHexadecimal(ch) {
|
|
7998
8052
|
return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
|
|
7999
8053
|
}
|
|
8000
8054
|
static translate(data) {
|
|
8055
|
+
let prefixFlags = '';
|
|
8056
|
+
if (data instanceof RegExp) {
|
|
8057
|
+
if (data.ignoreCase) prefixFlags += 'i';
|
|
8058
|
+
if (data.multiline) prefixFlags += 'm';
|
|
8059
|
+
if (data.dotAll) prefixFlags += 's';
|
|
8060
|
+
|
|
8061
|
+
// execution flags ('g', 'y') are safely ignored here.
|
|
8062
|
+
data = data.source;
|
|
8063
|
+
}
|
|
8001
8064
|
if (typeof data !== 'string') {
|
|
8002
8065
|
return data;
|
|
8003
8066
|
}
|
|
@@ -8008,6 +8071,7 @@ class TranslateRegExpString {
|
|
|
8008
8071
|
result = '(?:)';
|
|
8009
8072
|
changed = true;
|
|
8010
8073
|
}
|
|
8074
|
+
let inCharClass = false;
|
|
8011
8075
|
let i = 0;
|
|
8012
8076
|
while (i < size) {
|
|
8013
8077
|
let ch = data[i];
|
|
@@ -8025,54 +8089,160 @@ class TranslateRegExpString {
|
|
|
8025
8089
|
{
|
|
8026
8090
|
if (i + 2 < size) {
|
|
8027
8091
|
let nextCh = data[i + 2];
|
|
8028
|
-
|
|
8092
|
+
let code = nextCh.charCodeAt(0);
|
|
8093
|
+
if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
|
|
8094
|
+
let val = code % 32;
|
|
8029
8095
|
result += '\\x';
|
|
8030
|
-
result += (
|
|
8031
|
-
result += (
|
|
8096
|
+
result += (val >> 4).toString(16).toUpperCase();
|
|
8097
|
+
result += (val & 15).toString(16).toUpperCase();
|
|
8032
8098
|
i += 3;
|
|
8033
8099
|
changed = true;
|
|
8034
8100
|
continue;
|
|
8035
8101
|
}
|
|
8036
8102
|
}
|
|
8037
|
-
result += '
|
|
8103
|
+
result += 'c';
|
|
8038
8104
|
i += 2;
|
|
8105
|
+
changed = true;
|
|
8039
8106
|
continue;
|
|
8040
8107
|
}
|
|
8041
8108
|
case 'u':
|
|
8042
8109
|
{
|
|
8043
8110
|
if (i + 2 < size) {
|
|
8044
8111
|
let nextCh = data[i + 2];
|
|
8045
|
-
if (
|
|
8046
|
-
|
|
8047
|
-
i
|
|
8048
|
-
|
|
8049
|
-
|
|
8050
|
-
|
|
8112
|
+
if (nextCh === '{') {
|
|
8113
|
+
// Must have a closing brace and at least one valid hex digit inside
|
|
8114
|
+
let j = i + 3;
|
|
8115
|
+
let hasHex = false;
|
|
8116
|
+
let closed = false;
|
|
8117
|
+
while (j < size) {
|
|
8118
|
+
const hexChar = data[j];
|
|
8119
|
+
if (hexChar === '}') {
|
|
8120
|
+
closed = true;
|
|
8121
|
+
break;
|
|
8122
|
+
}
|
|
8123
|
+
if (!TranslateRegExpString.isHexadecimal(hexChar)) {
|
|
8051
8124
|
break;
|
|
8052
8125
|
}
|
|
8053
|
-
|
|
8126
|
+
hasHex = true;
|
|
8127
|
+
j++;
|
|
8128
|
+
}
|
|
8129
|
+
if (closed && hasHex) {
|
|
8130
|
+
result += '\\x';
|
|
8131
|
+
i += 2;
|
|
8132
|
+
changed = true;
|
|
8133
|
+
continue;
|
|
8134
|
+
}
|
|
8135
|
+
} else if (i + 5 < size) {
|
|
8136
|
+
let isHex4 = true;
|
|
8137
|
+
for (let j = 0; j < 4; j++) {
|
|
8138
|
+
if (!TranslateRegExpString.isHexadecimal(data[i + 2 + j])) {
|
|
8139
|
+
isHex4 = false;
|
|
8140
|
+
break;
|
|
8141
|
+
}
|
|
8142
|
+
}
|
|
8143
|
+
if (isHex4) {
|
|
8144
|
+
result += '\\x{' + data.substring(i + 2, i + 6) + '}';
|
|
8145
|
+
i += 6;
|
|
8146
|
+
changed = true;
|
|
8147
|
+
continue;
|
|
8054
8148
|
}
|
|
8055
|
-
result += '}';
|
|
8056
|
-
changed = true;
|
|
8057
|
-
continue;
|
|
8058
|
-
} else if (nextCh === '{') {
|
|
8059
|
-
result += '\\x';
|
|
8060
|
-
i += 2;
|
|
8061
|
-
changed = true;
|
|
8062
|
-
continue;
|
|
8063
8149
|
}
|
|
8064
8150
|
}
|
|
8065
|
-
|
|
8151
|
+
|
|
8152
|
+
// Graceful degradation for invalid/unclosed \u sequences
|
|
8153
|
+
result += 'u';
|
|
8154
|
+
i += 2;
|
|
8155
|
+
changed = true;
|
|
8156
|
+
continue;
|
|
8157
|
+
}
|
|
8158
|
+
case 'x':
|
|
8159
|
+
{
|
|
8160
|
+
let isValidHex = false;
|
|
8161
|
+
if (i + 2 < size && data[i + 2] === '{') {
|
|
8162
|
+
// Must have a closing brace and at least one valid hex digit inside
|
|
8163
|
+
let j = i + 3;
|
|
8164
|
+
let hasHex = false;
|
|
8165
|
+
let closed = false;
|
|
8166
|
+
while (j < size) {
|
|
8167
|
+
const hexChar = data[j];
|
|
8168
|
+
if (hexChar === '}') {
|
|
8169
|
+
closed = true;
|
|
8170
|
+
break;
|
|
8171
|
+
}
|
|
8172
|
+
if (!TranslateRegExpString.isHexadecimal(hexChar)) {
|
|
8173
|
+
break;
|
|
8174
|
+
}
|
|
8175
|
+
hasHex = true;
|
|
8176
|
+
j++;
|
|
8177
|
+
}
|
|
8178
|
+
if (closed && hasHex) {
|
|
8179
|
+
isValidHex = true;
|
|
8180
|
+
}
|
|
8181
|
+
} else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
|
|
8182
|
+
isValidHex = true;
|
|
8183
|
+
}
|
|
8184
|
+
if (isValidHex) {
|
|
8185
|
+
result += '\\x';
|
|
8186
|
+
i += 2;
|
|
8187
|
+
} else {
|
|
8188
|
+
result += 'x';
|
|
8189
|
+
i += 2;
|
|
8190
|
+
changed = true;
|
|
8191
|
+
}
|
|
8192
|
+
continue;
|
|
8193
|
+
}
|
|
8194
|
+
// Whitelist of valid RE2/JS alphanumeric escapes
|
|
8195
|
+
case 'n':
|
|
8196
|
+
case 'r':
|
|
8197
|
+
case 't':
|
|
8198
|
+
case 'a':
|
|
8199
|
+
case 'f':
|
|
8200
|
+
case 'v':
|
|
8201
|
+
case 'd':
|
|
8202
|
+
case 'D':
|
|
8203
|
+
case 's':
|
|
8204
|
+
case 'S':
|
|
8205
|
+
case 'w':
|
|
8206
|
+
case 'W':
|
|
8207
|
+
case 'b':
|
|
8208
|
+
case 'B':
|
|
8209
|
+
case 'p':
|
|
8210
|
+
case 'P':
|
|
8211
|
+
case 'A':
|
|
8212
|
+
case 'z':
|
|
8213
|
+
case 'Q':
|
|
8214
|
+
case 'E':
|
|
8215
|
+
case '0':
|
|
8216
|
+
case '1':
|
|
8217
|
+
case '2':
|
|
8218
|
+
case '3':
|
|
8219
|
+
case '4':
|
|
8220
|
+
case '5':
|
|
8221
|
+
case '6':
|
|
8222
|
+
case '7':
|
|
8223
|
+
{
|
|
8224
|
+
result += '\\' + ch;
|
|
8066
8225
|
i += 2;
|
|
8067
8226
|
continue;
|
|
8068
8227
|
}
|
|
8069
8228
|
default:
|
|
8070
8229
|
{
|
|
8071
|
-
result += '\\';
|
|
8072
8230
|
let cp = data.codePointAt(i + 1);
|
|
8073
|
-
let
|
|
8074
|
-
|
|
8075
|
-
|
|
8231
|
+
let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
|
|
8232
|
+
if (isAlphaNum) {
|
|
8233
|
+
// Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
|
|
8234
|
+
// Gracefully degrade to the literal character to prevent RE2 syntax crashes
|
|
8235
|
+
let symSize = Utils.charCount(cp);
|
|
8236
|
+
result += data.substring(i + 1, i + 1 + symSize);
|
|
8237
|
+
i += symSize + 1;
|
|
8238
|
+
changed = true;
|
|
8239
|
+
} else {
|
|
8240
|
+
// Escaped symbol (e.g. \., \*, \])
|
|
8241
|
+
result += '\\';
|
|
8242
|
+
let symSize = Utils.charCount(cp);
|
|
8243
|
+
result += data.substring(i + 1, i + 1 + symSize);
|
|
8244
|
+
i += symSize + 1;
|
|
8245
|
+
}
|
|
8076
8246
|
continue;
|
|
8077
8247
|
}
|
|
8078
8248
|
}
|
|
@@ -8082,7 +8252,13 @@ class TranslateRegExpString {
|
|
|
8082
8252
|
i += 1;
|
|
8083
8253
|
changed = true;
|
|
8084
8254
|
continue;
|
|
8085
|
-
} else if (ch === '
|
|
8255
|
+
} else if (ch === '[') {
|
|
8256
|
+
// Track entry into a character class (protects syntax inside)
|
|
8257
|
+
inCharClass = true;
|
|
8258
|
+
} else if (ch === ']') {
|
|
8259
|
+
// Track exit of a character class
|
|
8260
|
+
inCharClass = false;
|
|
8261
|
+
} else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
8086
8262
|
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
8087
8263
|
result += '(?P<';
|
|
8088
8264
|
i += 3;
|
|
@@ -8095,7 +8271,13 @@ class TranslateRegExpString {
|
|
|
8095
8271
|
result += data.substring(i, i + symSize);
|
|
8096
8272
|
i += symSize;
|
|
8097
8273
|
}
|
|
8098
|
-
|
|
8274
|
+
const finalResult = changed ? result : data;
|
|
8275
|
+
|
|
8276
|
+
// Append any extracted inline flags
|
|
8277
|
+
if (prefixFlags.length > 0) {
|
|
8278
|
+
return `(?${prefixFlags})${finalResult}`;
|
|
8279
|
+
}
|
|
8280
|
+
return finalResult;
|
|
8099
8281
|
}
|
|
8100
8282
|
}
|
|
8101
8283
|
|
|
@@ -8173,7 +8355,7 @@ class RE2JS {
|
|
|
8173
8355
|
* RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
|
|
8174
8356
|
* resulting regex is safe and properly formatted before compilation.
|
|
8175
8357
|
*
|
|
8176
|
-
* @param {string} expr - The regular expression string to be translated.
|
|
8358
|
+
* @param {string|RegExp} expr - The regular expression string to be translated.
|
|
8177
8359
|
* @returns {string} - The transformed regular expression string, ready for compilation.
|
|
8178
8360
|
*/
|
|
8179
8361
|
static translateRegExp(expr) {
|
|
@@ -8217,7 +8399,7 @@ class RE2JS {
|
|
|
8217
8399
|
* Matches a string against a regular expression.
|
|
8218
8400
|
*
|
|
8219
8401
|
* @param {string} regex the regular expression
|
|
8220
|
-
* @param {string|number[]} input the input
|
|
8402
|
+
* @param {string|number[]|Uint8Array} input the input
|
|
8221
8403
|
* @returns {boolean} true if the regular expression matches the entire input
|
|
8222
8404
|
* @throws RE2JSSyntaxException if the regular expression is malformed
|
|
8223
8405
|
*/
|
|
@@ -8284,7 +8466,7 @@ class RE2JS {
|
|
|
8284
8466
|
/**
|
|
8285
8467
|
* Matches a string against a regular expression.
|
|
8286
8468
|
*
|
|
8287
|
-
* @param {string|number[]} input the input
|
|
8469
|
+
* @param {string|number[]|Uint8Array} input the input
|
|
8288
8470
|
* @returns {boolean} true if the regular expression matches the entire input
|
|
8289
8471
|
*/
|
|
8290
8472
|
matches(input) {
|
|
@@ -8294,11 +8476,11 @@ class RE2JS {
|
|
|
8294
8476
|
/**
|
|
8295
8477
|
* Creates a new {@code Matcher} matching the pattern against the input.
|
|
8296
8478
|
*
|
|
8297
|
-
* @param {string|number[]} input the input string
|
|
8479
|
+
* @param {string|number[]|Uint8Array} input the input string
|
|
8298
8480
|
* @returns {Matcher}
|
|
8299
8481
|
*/
|
|
8300
8482
|
matcher(input) {
|
|
8301
|
-
if (
|
|
8483
|
+
if (Utils.isByteArray(input)) {
|
|
8302
8484
|
input = MatcherInput.utf8(input);
|
|
8303
8485
|
}
|
|
8304
8486
|
return new Matcher(this, input);
|
|
@@ -8310,11 +8492,11 @@ class RE2JS {
|
|
|
8310
8492
|
* a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
|
|
8311
8493
|
* and guarantees execution on the high-speed DFA engine whenever possible.
|
|
8312
8494
|
*
|
|
8313
|
-
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
8495
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
|
|
8314
8496
|
* @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
|
|
8315
8497
|
*/
|
|
8316
8498
|
test(input) {
|
|
8317
|
-
if (
|
|
8499
|
+
if (Utils.isByteArray(input)) {
|
|
8318
8500
|
// Reuse the existing UTF-8 fast-path method
|
|
8319
8501
|
return this.re2Input.matchUTF8(input);
|
|
8320
8502
|
}
|
|
@@ -8329,11 +8511,11 @@ class RE2JS {
|
|
|
8329
8511
|
* faster because it does not request capture group data. By requesting 0 capture groups,
|
|
8330
8512
|
* it securely routes execution through the DFA fast-path.
|
|
8331
8513
|
*
|
|
8332
|
-
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
8514
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
|
|
8333
8515
|
* @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
|
|
8334
8516
|
*/
|
|
8335
8517
|
testExact(input) {
|
|
8336
|
-
const machineInput =
|
|
8518
|
+
const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
|
|
8337
8519
|
return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
|
|
8338
8520
|
}
|
|
8339
8521
|
|