re2js 0.4.3 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -0
- package/build/index.cjs.cjs +290 -21
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +23 -0
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +290 -21
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +290 -21
- package/build/index.umd.js.map +1 -1
- package/package.json +9 -9
package/README.md
CHANGED
|
@@ -98,6 +98,18 @@ RE2JS.DISABLE_UNICODE_GROUPS
|
|
|
98
98
|
RE2JS.LONGEST_MATCH
|
|
99
99
|
```
|
|
100
100
|
|
|
101
|
+
### Program size
|
|
102
|
+
|
|
103
|
+
The program size is the number of instructions in the compiled regexp program, which makes it a very approximate measure of a regexp's "cost". Larger numbers are more expensive than smaller numbers.
|
|
104
|
+
|
|
105
|
+
```js
|
|
106
|
+
import { RE2JS } from 're2js'
|
|
107
|
+
|
|
108
|
+
console.log(RE2JS.compile('^').programSize()); // Outputs: 3
|
|
109
|
+
console.log(RE2JS.compile('a+b').programSize()); // Outputs: 5
|
|
110
|
+
console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
|
|
111
|
+
```
|
|
112
|
+
|
|
101
113
|
### Checking for Matches
|
|
102
114
|
|
|
103
115
|
RE2JS allows you to check if a string matches a given regex pattern using the `matches()` function.
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version
|
|
5
|
+
* @version v1.0.1
|
|
6
6
|
* @author Alexey Vasiliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -928,6 +928,7 @@ class Matcher {
|
|
|
928
928
|
// The group indexes, in [start, end) pairs. Zeroth pair is overall match.
|
|
929
929
|
this.groups = [];
|
|
930
930
|
this.namedGroups = re2.namedGroups;
|
|
931
|
+
this.numberOfInstructions = re2.numberOfInstructions();
|
|
931
932
|
if (input instanceof MatcherInputBase) {
|
|
932
933
|
this.resetMatcherInput(input);
|
|
933
934
|
} else if (Array.isArray(input)) {
|
|
@@ -1014,6 +1015,20 @@ class Matcher {
|
|
|
1014
1015
|
return this.groups[2 * group + 1];
|
|
1015
1016
|
}
|
|
1016
1017
|
|
|
1018
|
+
/**
|
|
1019
|
+
* Returns the program size of this pattern.
|
|
1020
|
+
*
|
|
1021
|
+
* <p>
|
|
1022
|
+
* Similar to the C++ implementation, the program size is a very approximate measure of a regexp's
|
|
1023
|
+
* "cost". Larger numbers are more expensive than smaller numbers.
|
|
1024
|
+
* </p>
|
|
1025
|
+
*
|
|
1026
|
+
* @returns {number} the program size of this pattern
|
|
1027
|
+
*/
|
|
1028
|
+
programSize() {
|
|
1029
|
+
return this.numberOfInstructions;
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1017
1032
|
/**
|
|
1018
1033
|
* Returns the named group of the most recent match, or {@code null} if the group was not matched.
|
|
1019
1034
|
* @param {string|number} [group=0]
|
|
@@ -1654,7 +1669,7 @@ class Regexp {
|
|
|
1654
1669
|
// subexpressions, if any. Never null.
|
|
1655
1670
|
// subs[0] is used as the freelist.
|
|
1656
1671
|
this.subs = Regexp.emptySubs();
|
|
1657
|
-
this.runes =
|
|
1672
|
+
this.runes = []; // matched runes, for LITERAL, CHAR_CLASS
|
|
1658
1673
|
this.min = 0; // min for REPEAT
|
|
1659
1674
|
this.max = 0; // max for REPEAT
|
|
1660
1675
|
this.cap = 0; // capturing index, for CAPTURE
|
|
@@ -1664,7 +1679,7 @@ class Regexp {
|
|
|
1664
1679
|
reinit() {
|
|
1665
1680
|
this.flags = 0;
|
|
1666
1681
|
this.subs = Regexp.emptySubs();
|
|
1667
|
-
this.runes =
|
|
1682
|
+
this.runes = [];
|
|
1668
1683
|
this.cap = 0;
|
|
1669
1684
|
this.min = 0;
|
|
1670
1685
|
this.max = 0;
|
|
@@ -1965,7 +1980,7 @@ class Inst {
|
|
|
1965
1980
|
this.arg = 0; // ALT, ALT_MATCH, CAPTURE, EMPTY_WIDTH
|
|
1966
1981
|
// length==1 => exact match
|
|
1967
1982
|
// otherwise a list of [lo,hi] pairs. hi is *inclusive*.
|
|
1968
|
-
this.runes =
|
|
1983
|
+
this.runes = [];
|
|
1969
1984
|
}
|
|
1970
1985
|
|
|
1971
1986
|
// MatchRune returns true if the instruction matches (and consumes) r.
|
|
@@ -2359,7 +2374,7 @@ class Compiler {
|
|
|
2359
2374
|
i.runes = runes;
|
|
2360
2375
|
flags &= RE2Flags.FOLD_CASE;
|
|
2361
2376
|
if (runes.length !== 1 || Unicode.simpleFold(runes[0]) === runes[0]) {
|
|
2362
|
-
flags &=
|
|
2377
|
+
flags &= -2;
|
|
2363
2378
|
}
|
|
2364
2379
|
i.arg = flags;
|
|
2365
2380
|
f.out = f.i << 1;
|
|
@@ -2476,7 +2491,7 @@ class Simplify {
|
|
|
2476
2491
|
const nsub = Simplify.simplify(sub);
|
|
2477
2492
|
if (nre === re && nsub !== sub) {
|
|
2478
2493
|
nre = Regexp.fromRegexp(re);
|
|
2479
|
-
nre.runes =
|
|
2494
|
+
nre.runes = [];
|
|
2480
2495
|
nre.subs = re.subs.slice(0, re.subs.length);
|
|
2481
2496
|
}
|
|
2482
2497
|
if (nre !== re) {
|
|
@@ -2617,7 +2632,7 @@ class CharGroup {
|
|
|
2617
2632
|
const code1 = [0x30, 0x39];
|
|
2618
2633
|
const code2 = [0x9, 0xa, 0xc, 0xd, 0x20, 0x20];
|
|
2619
2634
|
const code3 = [0x30, 0x39, 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a];
|
|
2620
|
-
const PERL_GROUPS = new Map([['\\d', new CharGroup(
|
|
2635
|
+
const PERL_GROUPS = new Map([['\\d', new CharGroup(1, code1)], ['\\D', new CharGroup(-1, code1)], ['\\s', new CharGroup(1, code2)], ['\\S', new CharGroup(-1, code2)], ['\\w', new CharGroup(1, code3)], ['\\W', new CharGroup(-1, code3)]]);
|
|
2621
2636
|
const code4 = [0x30, 0x39, 0x41, 0x5a, 0x61, 0x7a];
|
|
2622
2637
|
const code5 = [0x41, 0x5a, 0x61, 0x7a];
|
|
2623
2638
|
const code6 = [0x0, 0x7f];
|
|
@@ -2632,7 +2647,7 @@ const code14 = [0x9, 0xd, 0x20, 0x20];
|
|
|
2632
2647
|
const code15 = [0x41, 0x5a];
|
|
2633
2648
|
const code16 = [0x30, 0x39, 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a];
|
|
2634
2649
|
const code17 = [0x30, 0x39, 0x41, 0x46, 0x61, 0x66];
|
|
2635
|
-
const POSIX_GROUPS = new Map([['[:alnum:]', new CharGroup(
|
|
2650
|
+
const POSIX_GROUPS = new Map([['[:alnum:]', new CharGroup(1, code4)], ['[:^alnum:]', new CharGroup(-1, code4)], ['[:alpha:]', new CharGroup(1, code5)], ['[:^alpha:]', new CharGroup(-1, code5)], ['[:ascii:]', new CharGroup(1, code6)], ['[:^ascii:]', new CharGroup(-1, code6)], ['[:blank:]', new CharGroup(1, code7)], ['[:^blank:]', new CharGroup(-1, code7)], ['[:cntrl:]', new CharGroup(1, code8)], ['[:^cntrl:]', new CharGroup(-1, code8)], ['[:digit:]', new CharGroup(1, code9)], ['[:^digit:]', new CharGroup(-1, code9)], ['[:graph:]', new CharGroup(1, code10)], ['[:^graph:]', new CharGroup(-1, code10)], ['[:lower:]', new CharGroup(1, code11)], ['[:^lower:]', new CharGroup(-1, code11)], ['[:print:]', new CharGroup(1, code12)], ['[:^print:]', new CharGroup(-1, code12)], ['[:punct:]', new CharGroup(1, code13)], ['[:^punct:]', new CharGroup(-1, code13)], ['[:space:]', new CharGroup(1, code14)], ['[:^space:]', new CharGroup(-1, code14)], ['[:upper:]', new CharGroup(1, code15)], ['[:^upper:]', new CharGroup(-1, code15)], ['[:word:]', new CharGroup(1, code16)], ['[:^word:]', new CharGroup(-1, code16)], ['[:xdigit:]', new CharGroup(1, code17)], ['[:^xdigit:]', new CharGroup(-1, code17)]]);
|
|
2636
2651
|
|
|
2637
2652
|
/**
|
|
2638
2653
|
* A "builder"-style helper class for manipulating character classes represented as an array of
|
|
@@ -2671,7 +2686,7 @@ class CharClass {
|
|
|
2671
2686
|
// qsortIntPair() quicksorts pairs of ints in |array| according to lt().
|
|
2672
2687
|
// Precondition: |left|, |right|, |this.len| must all be even; |this.len > 1|.
|
|
2673
2688
|
static qsortIntPair(array, left, right) {
|
|
2674
|
-
const pivotIndex = ((left + right) / 2 | 0) &
|
|
2689
|
+
const pivotIndex = ((left + right) / 2 | 0) & -2;
|
|
2675
2690
|
const pivotFrom = array[pivotIndex];
|
|
2676
2691
|
const pivotTo = array[pivotIndex + 1];
|
|
2677
2692
|
let i = left;
|
|
@@ -3056,6 +3071,48 @@ class Parser {
|
|
|
3056
3071
|
static ERR_MISSING_REPEAT_ARGUMENT = 'missing argument to repetition operator';
|
|
3057
3072
|
static ERR_TRAILING_BACKSLASH = 'trailing backslash at end of expression';
|
|
3058
3073
|
static ERR_DUPLICATE_NAMED_CAPTURE = 'duplicate capture group name';
|
|
3074
|
+
static ERR_UNEXPECTED_PAREN = 'unexpected )';
|
|
3075
|
+
static ERR_NESTING_DEPTH = 'expression nests too deeply';
|
|
3076
|
+
static ERR_LARGE = 'expression too large';
|
|
3077
|
+
|
|
3078
|
+
// maxHeight is the maximum height of a regexp parse tree.
|
|
3079
|
+
// It is somewhat arbitrarily chosen, but the idea is to be large enough
|
|
3080
|
+
// that no one will actually hit in real use but at the same time small enough
|
|
3081
|
+
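// that recursion on the Regexp tree will not overflow the call stack (NOTE: the 1GB figure is Go's stack limit and appears to be inherited from a Go version of this parser).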
|
|
3082
|
+
// The maximum amount of stack for a single recursive frame is probably
|
|
3083
|
+
// closer to 1kB, so this could potentially be raised, but it seems unlikely
|
|
3084
|
+
// that people have regexps nested even this deeply.
|
|
3085
|
+
// We ran a test on Google's C++ code base and turned up only
|
|
3086
|
+
// a single use case with depth > 100; it had depth 128.
|
|
3087
|
+
// Using depth 1000 should be plenty of margin.
|
|
3088
|
+
// As an optimization, we don't even bother calculating heights
|
|
3089
|
+
// until we've allocated at least maxHeight Regexp structures.
|
|
3090
|
+
static MAX_HEIGHT = 1000;
|
|
3091
|
+
|
|
3092
|
+
// maxSize is the maximum size of a compiled regexp in Insts.
|
|
3093
|
+
// It too is somewhat arbitrarily chosen, but the idea is to be large enough
|
|
3094
|
+
// to allow significant regexps while at the same time small enough that
|
|
3095
|
+
// the compiled form will not take up too much memory.
|
|
3096
|
+
// 128 MB is enough for 3.3 million Inst structures, which roughly
|
|
3097
|
+
// corresponds to a 3.3 MB regexp.
|
|
3098
|
+
static MAX_SIZE = 3355443; // 128 << 20 / (5 * 8) (instSize = byte, 2 uint32, slice is 5 64-bit words)
|
|
3099
|
+
|
|
3100
|
+
// maxRunes is the maximum number of runes allowed in a regexp tree
|
|
3101
|
+
// counting the runes in all the nodes.
|
|
3102
|
+
// Ignoring character classes p.numRunes is always less than the length of the regexp.
|
|
3103
|
+
// Character classes can make it much larger: each \pL adds 1292 runes.
|
|
3104
|
+
// 128 MB is enough for 32M runes, which is over 26k \pL instances.
|
|
3105
|
+
// Note that repetitions do not make copies of the rune slices,
|
|
3106
|
+
// so \pL{1000} is only one rune slice, not 1000.
|
|
3107
|
+
// We could keep a cache of character classes we've seen,
|
|
3108
|
+
// so that all the \pL we see use the same rune list,
|
|
3109
|
+
// but that doesn't remove the problem entirely:
|
|
3110
|
+
// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
|
|
3111
|
+
// And because the Rune slice is exposed directly in the Regexp,
|
|
3112
|
+
// there is not an opportunity to change the representation to allow
|
|
3113
|
+
// partial sharing between different character classes.
|
|
3114
|
+
// So the limit is the best we can do.
|
|
3115
|
+
static MAX_RUNES = 33554432; // 128 << 20 / 4 (runeSize, int32 is 4 bytes)
|
|
3059
3116
|
|
|
3060
3117
|
// RangeTables are represented as int[][], a list of triples (start, end,
|
|
3061
3118
|
// stride).
|
|
@@ -3354,7 +3411,7 @@ class Parser {
|
|
|
3354
3411
|
case Codepoint.CODES.get('v'):
|
|
3355
3412
|
return Codepoint.CODES.get('\v');
|
|
3356
3413
|
default:
|
|
3357
|
-
if (!Utils.isalnum(c)) {
|
|
3414
|
+
if (c <= Unicode.MAX_ASCII && !Utils.isalnum(c)) {
|
|
3358
3415
|
return c;
|
|
3359
3416
|
}
|
|
3360
3417
|
break;
|
|
@@ -3388,6 +3445,12 @@ class Parser {
|
|
|
3388
3445
|
// Stack of parsed expressions.
|
|
3389
3446
|
this.stack = [];
|
|
3390
3447
|
this.free = null;
|
|
3448
|
+
// Bookkeeping for the parse-time limit checks (runes, size, height).
|
|
3449
|
+
this.numRegexp = 0; // number of regexps allocated
|
|
3450
|
+
this.numRunes = 0; // number of runes in char classes
|
|
3451
|
+
this.repeats = 0; // product of all repetitions seen
|
|
3452
|
+
this.height = null; // regexp height, for height limit check
|
|
3453
|
+
this.size = null; // regexp compiled size, for size limit check
|
|
3391
3454
|
}
|
|
3392
3455
|
|
|
3393
3456
|
// Allocate a Regexp, from the free list if possible.
|
|
@@ -3399,15 +3462,159 @@ class Parser {
|
|
|
3399
3462
|
re.op = op;
|
|
3400
3463
|
} else {
|
|
3401
3464
|
re = new Regexp(op);
|
|
3465
|
+
this.numRegexp += 1;
|
|
3402
3466
|
}
|
|
3403
3467
|
return re;
|
|
3404
3468
|
}
|
|
3405
3469
|
reuse(re) {
|
|
3470
|
+
if (this.height !== null && Object.prototype.hasOwnProperty.call(this.height, re)) {
|
|
3471
|
+
delete this.height[re];
|
|
3472
|
+
}
|
|
3406
3473
|
if (re.subs !== null && re.subs.length > 0) {
|
|
3407
3474
|
re.subs[0] = this.free;
|
|
3408
3475
|
}
|
|
3409
3476
|
this.free = re;
|
|
3410
3477
|
}
|
|
3478
|
+
checkLimits(re) {
|
|
3479
|
+
if (this.numRunes > Parser.MAX_RUNES) {
|
|
3480
|
+
throw new RE2JSSyntaxException(Parser.ERR_LARGE);
|
|
3481
|
+
}
|
|
3482
|
+
this.checkSize(re);
|
|
3483
|
+
this.checkHeight(re);
|
|
3484
|
+
}
|
|
3485
|
+
checkSize(re) {
|
|
3486
|
+
if (this.size === null) {
|
|
3487
|
+
// We haven't started tracking size yet.
|
|
3488
|
+
// Do a relatively cheap check to see if we need to start.
|
|
3489
|
+
// Maintain the product of all the repeats we've seen
|
|
3490
|
+
// and don't track if the total number of regexp nodes
|
|
3491
|
+
// we've seen times the repeat product is in budget.
|
|
3492
|
+
if (this.repeats === 0) {
|
|
3493
|
+
this.repeats = 1;
|
|
3494
|
+
}
|
|
3495
|
+
if (re.op === Regexp.Op.REPEAT) {
|
|
3496
|
+
let n = re.max;
|
|
3497
|
+
if (n === -1) {
|
|
3498
|
+
n = re.min;
|
|
3499
|
+
}
|
|
3500
|
+
if (n <= 0) {
|
|
3501
|
+
n = 1;
|
|
3502
|
+
}
|
|
3503
|
+
if (n > Parser.MAX_SIZE / this.repeats) {
|
|
3504
|
+
this.repeats = Parser.MAX_SIZE;
|
|
3505
|
+
} else {
|
|
3506
|
+
this.repeats *= n;
|
|
3507
|
+
}
|
|
3508
|
+
}
|
|
3509
|
+
if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
|
|
3510
|
+
return;
|
|
3511
|
+
}
|
|
3512
|
+
|
|
3513
|
+
// We need to start tracking size.
|
|
3514
|
+
// Make the map and belatedly populate it
|
|
3515
|
+
// with info about everything we've constructed so far.
|
|
3516
|
+
this.size = {};
|
|
3517
|
+
for (let reEx of this.stack) {
|
|
3518
|
+
this.checkSize(reEx);
|
|
3519
|
+
}
|
|
3520
|
+
}
|
|
3521
|
+
if (this.calcSize(re, true) > Parser.MAX_SIZE) {
|
|
3522
|
+
throw new RE2JSSyntaxException(Parser.ERR_LARGE);
|
|
3523
|
+
}
|
|
3524
|
+
}
|
|
3525
|
+
calcSize(re, force = false) {
|
|
3526
|
+
if (!force) {
|
|
3527
|
+
if (Object.prototype.hasOwnProperty.call(this.size, re)) {
|
|
3528
|
+
return this.size[re];
|
|
3529
|
+
}
|
|
3530
|
+
}
|
|
3531
|
+
let size = 0;
|
|
3532
|
+
switch (re.op) {
|
|
3533
|
+
case Regexp.Op.LITERAL:
|
|
3534
|
+
{
|
|
3535
|
+
size = re.runes.length;
|
|
3536
|
+
break;
|
|
3537
|
+
}
|
|
3538
|
+
case Regexp.Op.CAPTURE:
|
|
3539
|
+
case Regexp.Op.STAR:
|
|
3540
|
+
{
|
|
3541
|
+
// star can be 1+ or 2+; assume 2 pessimistically
|
|
3542
|
+
size = 2 + this.calcSize(re.subs[0]);
|
|
3543
|
+
break;
|
|
3544
|
+
}
|
|
3545
|
+
case Regexp.Op.PLUS:
|
|
3546
|
+
case Regexp.Op.QUEST:
|
|
3547
|
+
{
|
|
3548
|
+
size = 1 + this.calcSize(re.subs[0]);
|
|
3549
|
+
break;
|
|
3550
|
+
}
|
|
3551
|
+
case Regexp.Op.CONCAT:
|
|
3552
|
+
{
|
|
3553
|
+
for (let sub of re.subs) {
|
|
3554
|
+
size = size + this.calcSize(sub);
|
|
3555
|
+
}
|
|
3556
|
+
break;
|
|
3557
|
+
}
|
|
3558
|
+
case Regexp.Op.ALTERNATE:
|
|
3559
|
+
{
|
|
3560
|
+
for (let sub of re.subs) {
|
|
3561
|
+
size = size + this.calcSize(sub);
|
|
3562
|
+
}
|
|
3563
|
+
if (re.subs.length > 1) {
|
|
3564
|
+
size = size + re.subs.length - 1;
|
|
3565
|
+
}
|
|
3566
|
+
break;
|
|
3567
|
+
}
|
|
3568
|
+
case Regexp.Op.REPEAT:
|
|
3569
|
+
{
|
|
3570
|
+
let sub = this.calcSize(re.subs[0]);
|
|
3571
|
+
if (re.max === -1) {
|
|
3572
|
+
if (re.min === 0) {
|
|
3573
|
+
size = 2 + sub; // x*
|
|
3574
|
+
} else {
|
|
3575
|
+
size = 1 + re.min * sub; // xxx+
|
|
3576
|
+
}
|
|
3577
|
+
break;
|
|
3578
|
+
}
|
|
3579
|
+
// x{2,5} = xx(x(x(x)?)?)?
|
|
3580
|
+
size = re.max * sub + (re.max - re.min);
|
|
3581
|
+
break;
|
|
3582
|
+
}
|
|
3583
|
+
}
|
|
3584
|
+
size = Math.max(1, size);
|
|
3585
|
+
this.size[re] = size;
|
|
3586
|
+
return size;
|
|
3587
|
+
}
|
|
3588
|
+
checkHeight(re) {
|
|
3589
|
+
if (this.numRegexp < Parser.MAX_HEIGHT) {
|
|
3590
|
+
return;
|
|
3591
|
+
}
|
|
3592
|
+
if (this.height === null) {
|
|
3593
|
+
this.height = {};
|
|
3594
|
+
for (let reEx of this.stack) {
|
|
3595
|
+
this.checkHeight(reEx);
|
|
3596
|
+
}
|
|
3597
|
+
}
|
|
3598
|
+
if (this.calcHeight(re, true) > Parser.MAX_HEIGHT) {
|
|
3599
|
+
throw new RE2JSSyntaxException(Parser.ERR_NESTING_DEPTH);
|
|
3600
|
+
}
|
|
3601
|
+
}
|
|
3602
|
+
calcHeight(re, force = false) {
|
|
3603
|
+
if (!force) {
|
|
3604
|
+
if (Object.prototype.hasOwnProperty.call(this.height, re)) {
|
|
3605
|
+
return this.height[re];
|
|
3606
|
+
}
|
|
3607
|
+
}
|
|
3608
|
+
let h = 1;
|
|
3609
|
+
for (let sub of re.subs) {
|
|
3610
|
+
const hsub = this.calcHeight(sub);
|
|
3611
|
+
if (h < 1 + hsub) {
|
|
3612
|
+
h = 1 + hsub;
|
|
3613
|
+
}
|
|
3614
|
+
}
|
|
3615
|
+
this.height[re] = h;
|
|
3616
|
+
return h;
|
|
3617
|
+
}
|
|
3411
3618
|
|
|
3412
3619
|
// Parse stack manipulation.
|
|
3413
3620
|
|
|
@@ -3428,13 +3635,14 @@ class Parser {
|
|
|
3428
3635
|
// push pushes the regexp re onto the parse stack and returns the regexp.
|
|
3429
3636
|
// Returns null for a CHAR_CLASS that can be merged with the top-of-stack.
|
|
3430
3637
|
push(re) {
|
|
3638
|
+
this.numRunes += re.runes.length;
|
|
3431
3639
|
if (re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 2 && re.runes[0] === re.runes[1]) {
|
|
3432
|
-
if (this.maybeConcat(re.runes[0], this.flags &
|
|
3640
|
+
if (this.maybeConcat(re.runes[0], this.flags & -2)) {
|
|
3433
3641
|
return null;
|
|
3434
3642
|
}
|
|
3435
3643
|
re.op = Regexp.Op.LITERAL;
|
|
3436
3644
|
re.runes = [re.runes[0]];
|
|
3437
|
-
re.flags = this.flags &
|
|
3645
|
+
re.flags = this.flags & -2;
|
|
3438
3646
|
} else if (re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 4 && re.runes[0] === re.runes[1] && re.runes[2] === re.runes[3] && Unicode.simpleFold(re.runes[0]) === re.runes[2] && Unicode.simpleFold(re.runes[2]) === re.runes[0] || re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 2 && re.runes[0] + 1 === re.runes[1] && Unicode.simpleFold(re.runes[0]) === re.runes[1] && Unicode.simpleFold(re.runes[1]) === re.runes[0]) {
|
|
3439
3647
|
// Case-insensitive rune like [Aa] or [Δδ].
|
|
3440
3648
|
if (this.maybeConcat(re.runes[0], this.flags | RE2Flags.FOLD_CASE)) {
|
|
@@ -3449,6 +3657,7 @@ class Parser {
|
|
|
3449
3657
|
this.maybeConcat(-1, 0);
|
|
3450
3658
|
}
|
|
3451
3659
|
this.stack.push(re);
|
|
3660
|
+
this.checkLimits(re);
|
|
3452
3661
|
return re;
|
|
3453
3662
|
}
|
|
3454
3663
|
|
|
@@ -3542,6 +3751,43 @@ class Parser {
|
|
|
3542
3751
|
re.flags = flags;
|
|
3543
3752
|
re.subs = [sub];
|
|
3544
3753
|
this.stack[n - 1] = re;
|
|
3754
|
+
this.checkLimits(re);
|
|
3755
|
+
if (op === Regexp.Op.REPEAT && (min >= 2 || max >= 2) && !this.repeatIsValid(re, 1000)) {
|
|
3756
|
+
throw new RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(beforePos));
|
|
3757
|
+
}
|
|
3758
|
+
}
|
|
3759
|
+
|
|
3760
|
+
// repeatIsValid reports whether the repetition re is valid.
|
|
3761
|
+
// Valid means that the combination of the top-level repetition
|
|
3762
|
+
// and any inner repetitions does not exceed n copies of the
|
|
3763
|
+
// innermost thing.
|
|
3764
|
+
// This function rewalks the regexp tree and is called for every repetition,
|
|
3765
|
+
// so we have to worry about inducing quadratic behavior in the parser.
|
|
3766
|
+
// We avoid this by only calling repeatIsValid when min or max >= 2.
|
|
3767
|
+
// In that case the depth of any >= 2 nesting can only get to 9 without
|
|
3768
|
+
// triggering a parse error, so each subtree can only be rewalked 9 times.
|
|
3769
|
+
repeatIsValid(re, n) {
|
|
3770
|
+
if (re.op === Regexp.Op.REPEAT) {
|
|
3771
|
+
let m = re.max;
|
|
3772
|
+
if (m === 0) {
|
|
3773
|
+
return true;
|
|
3774
|
+
}
|
|
3775
|
+
if (m < 0) {
|
|
3776
|
+
m = re.min;
|
|
3777
|
+
}
|
|
3778
|
+
if (m > n) {
|
|
3779
|
+
return false;
|
|
3780
|
+
}
|
|
3781
|
+
if (m > 0) {
|
|
3782
|
+
n = Math.trunc(n / m);
|
|
3783
|
+
}
|
|
3784
|
+
}
|
|
3785
|
+
for (let sub of re.subs) {
|
|
3786
|
+
if (!this.repeatIsValid(sub, n)) {
|
|
3787
|
+
return false;
|
|
3788
|
+
}
|
|
3789
|
+
}
|
|
3790
|
+
return true;
|
|
3545
3791
|
}
|
|
3546
3792
|
|
|
3547
3793
|
// concat replaces the top of the stack (above the topmost '|' or '(') with
|
|
@@ -3579,10 +3825,10 @@ class Parser {
|
|
|
3579
3825
|
if (re.op === Regexp.Op.CHAR_CLASS) {
|
|
3580
3826
|
re.runes = new CharClass(re.runes).cleanClass().toArray();
|
|
3581
3827
|
if (re.runes.length === 2 && re.runes[0] === 0 && re.runes[1] === Unicode.MAX_RUNE) {
|
|
3582
|
-
re.runes =
|
|
3828
|
+
re.runes = [];
|
|
3583
3829
|
re.op = Regexp.Op.ANY_CHAR;
|
|
3584
3830
|
} else if (re.runes.length === 4 && re.runes[0] === 0 && re.runes[1] === Codepoint.CODES.get('\n') - 1 && re.runes[2] === Codepoint.CODES.get('\n') + 1 && re.runes[3] === Unicode.MAX_RUNE) {
|
|
3585
|
-
re.runes =
|
|
3831
|
+
re.runes = [];
|
|
3586
3832
|
re.op = Regexp.Op.ANY_CHAR_NOT_NL;
|
|
3587
3833
|
}
|
|
3588
3834
|
}
|
|
@@ -3717,6 +3963,7 @@ class Parser {
|
|
|
3717
3963
|
prefix.runes = str.slice(0, strlen);
|
|
3718
3964
|
for (let j = start; j < i; j++) {
|
|
3719
3965
|
array[s + j] = this.removeLeadingString(array[s + j], strlen);
|
|
3966
|
+
this.checkLimits(array[s + j]);
|
|
3720
3967
|
}
|
|
3721
3968
|
// Recurse.
|
|
3722
3969
|
const suffix = this.collapse(array.slice(s + start, s + i), Regexp.Op.ALTERNATE);
|
|
@@ -3766,6 +4013,7 @@ class Parser {
|
|
|
3766
4013
|
for (let j = start; j < i; j++) {
|
|
3767
4014
|
const reuse = j !== start; // prefix came from sub[start]
|
|
3768
4015
|
array[s + j] = this.removeLeadingRegexp(array[s + j], reuse);
|
|
4016
|
+
this.checkLimits(array[s + j]);
|
|
3769
4017
|
}
|
|
3770
4018
|
// recurse
|
|
3771
4019
|
const suffix = this.collapse(array.slice(s + start, s + i), Regexp.Op.ALTERNATE);
|
|
@@ -4149,7 +4397,7 @@ class Parser {
|
|
|
4149
4397
|
t.skip(2); // "(?"
|
|
4150
4398
|
|
|
4151
4399
|
let flags = this.flags;
|
|
4152
|
-
let sign =
|
|
4400
|
+
let sign = 1;
|
|
4153
4401
|
let sawFlag = false;
|
|
4154
4402
|
loop: while (t.more()) {
|
|
4155
4403
|
{
|
|
@@ -4160,7 +4408,7 @@ class Parser {
|
|
|
4160
4408
|
sawFlag = true;
|
|
4161
4409
|
break;
|
|
4162
4410
|
case Codepoint.CODES.get('m'):
|
|
4163
|
-
flags &=
|
|
4411
|
+
flags &= -17;
|
|
4164
4412
|
sawFlag = true;
|
|
4165
4413
|
break;
|
|
4166
4414
|
case Codepoint.CODES.get('s'):
|
|
@@ -4266,12 +4514,12 @@ class Parser {
|
|
|
4266
4514
|
this.alternate();
|
|
4267
4515
|
const n = this.stack.length;
|
|
4268
4516
|
if (n < 2) {
|
|
4269
|
-
throw new RE2JSSyntaxException(Parser.
|
|
4517
|
+
throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
|
|
4270
4518
|
}
|
|
4271
4519
|
const re1 = this.pop();
|
|
4272
4520
|
const re2 = this.pop();
|
|
4273
4521
|
if (re2.op !== Regexp.Op.LEFT_PAREN) {
|
|
4274
|
-
throw new RE2JSSyntaxException(Parser.
|
|
4522
|
+
throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
|
|
4275
4523
|
}
|
|
4276
4524
|
// Restore flags at time of paren.
|
|
4277
4525
|
this.flags = re2.flags;
|
|
@@ -4341,7 +4589,7 @@ class Parser {
|
|
|
4341
4589
|
}
|
|
4342
4590
|
t.skip(1); // '\\'
|
|
4343
4591
|
// Committed to parse or throw exception.
|
|
4344
|
-
let sign =
|
|
4592
|
+
let sign = 1;
|
|
4345
4593
|
let c = t.pop(); // 'p' or 'P'
|
|
4346
4594
|
if (c === Codepoint.CODES.get('P')) {
|
|
4347
4595
|
sign = -1;
|
|
@@ -4405,7 +4653,7 @@ class Parser {
|
|
|
4405
4653
|
const re = this.newRegexp(Regexp.Op.CHAR_CLASS);
|
|
4406
4654
|
re.flags = this.flags;
|
|
4407
4655
|
const cc = new CharClass();
|
|
4408
|
-
let sign =
|
|
4656
|
+
let sign = 1;
|
|
4409
4657
|
if (t.more() && t.lookingAt('^')) {
|
|
4410
4658
|
sign = -1;
|
|
4411
4659
|
t.skip(1); // '^'
|
|
@@ -4948,6 +5196,13 @@ class RE2 {
|
|
|
4948
5196
|
return this.numSubexp;
|
|
4949
5197
|
}
|
|
4950
5198
|
|
|
5199
|
+
/**
|
|
5200
|
+
* Returns the number of instructions in this compiled regular expression program.
|
|
5201
|
+
*/
|
|
5202
|
+
numberOfInstructions() {
|
|
5203
|
+
return this.prog.numInst();
|
|
5204
|
+
}
|
|
5205
|
+
|
|
4951
5206
|
// get() returns a machine to use for matching |this|. It uses |this|'s
|
|
4952
5207
|
// machine cache if possible, to avoid unnecessary allocation.
|
|
4953
5208
|
get() {
|
|
@@ -5580,7 +5835,7 @@ class RE2JS {
|
|
|
5580
5835
|
}
|
|
5581
5836
|
let re2Flags = RE2Flags.PERL;
|
|
5582
5837
|
if ((flags & RE2JS.DISABLE_UNICODE_GROUPS) !== 0) {
|
|
5583
|
-
re2Flags &=
|
|
5838
|
+
re2Flags &= -129;
|
|
5584
5839
|
}
|
|
5585
5840
|
const p = new RE2JS(regex, flags);
|
|
5586
5841
|
// The compiled RE2 regexp.
|
|
@@ -5748,6 +6003,20 @@ class RE2JS {
|
|
|
5748
6003
|
return this.patternInput;
|
|
5749
6004
|
}
|
|
5750
6005
|
|
|
6006
|
+
/**
|
|
6007
|
+
* Returns the program size of this pattern.
|
|
6008
|
+
*
|
|
6009
|
+
* <p>
|
|
6010
|
+
* Similar to the C++ implementation, the program size is a very approximate measure of a regexp's
|
|
6011
|
+
* "cost". Larger numbers are more expensive than smaller numbers.
|
|
6012
|
+
* </p>
|
|
6013
|
+
*
|
|
6014
|
+
* @returns {number} the program size of this pattern
|
|
6015
|
+
*/
|
|
6016
|
+
programSize() {
|
|
6017
|
+
return this.re2Input.numberOfInstructions();
|
|
6018
|
+
}
|
|
6019
|
+
|
|
5751
6020
|
/**
|
|
5752
6021
|
* Returns the number of capturing groups in this matcher's pattern. Group zero denotes the entire
|
|
5753
6022
|
* pattern and is excluded from this count.
|