re2js 0.4.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -98,6 +98,18 @@ RE2JS.DISABLE_UNICODE_GROUPS
98
98
  RE2JS.LONGEST_MATCH
99
99
  ```
100
100
 
101
+ ### Program size
102
+
103
+ The program size represents a very approximate measure of a regexp's "cost". Larger numbers are more expensive than smaller numbers.
104
+
105
+ ```js
106
+ import { RE2JS } from 're2js'
107
+
108
+ console.log(RE2JS.compile('^').programSize()); // Outputs: 3
109
+ console.log(RE2JS.compile('a+b').programSize()); // Outputs: 5
110
+ console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
111
+ ```
112
+
101
113
  ### Checking for Matches
102
114
 
103
115
  RE2JS allows you to check if a string matches a given regex pattern using the `matches()` function
@@ -131,6 +143,20 @@ RE2JS.compile('ab+c').matcher('cbbba').find() // false
131
143
  RE2JS.compile('ab+c', RE2JS.CASE_INSENSITIVE).matcher('abBBc').find() // true
132
144
  ```
133
145
 
146
+ Example of collecting all matches in a string
147
+
148
+ ```js
149
+ import { RE2JS } from 're2js'
150
+
151
+ const p = RE2JS.compile('abc+')
152
+ const matchString = p.matcher('abc abcccc abcc')
153
+ const results = []
154
+ while (matchString.find()) {
155
+ results.push(matchString.group())
156
+ }
157
+ results // ['abc', 'abcccc', 'abcc']
158
+ ```
159
+
134
160
  The `find()` method searches for a pattern match in a string starting from a specific index
135
161
 
136
162
  ```js
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v0.4.2
5
+ * @version v1.0.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -324,7 +324,7 @@ class Unicode {
324
324
  // Minimum and maximum runes involved in folding.
325
325
  // Checked during test.
326
326
  static MIN_FOLD = 0x0041;
327
- static MAX_FOLD = 0x1044f;
327
+ static MAX_FOLD = 0x1e943;
328
328
 
329
329
  // is32 uses binary search to test whether rune is in the specified
330
330
  // slice of 32-bit ranges.
@@ -928,6 +928,7 @@ class Matcher {
928
928
  // The group indexes, in [start, end) pairs. Zeroth pair is overall match.
929
929
  this.groups = [];
930
930
  this.namedGroups = re2.namedGroups;
931
+ this.numberOfInstructions = re2.numberOfInstructions();
931
932
  if (input instanceof MatcherInputBase) {
932
933
  this.resetMatcherInput(input);
933
934
  } else if (Array.isArray(input)) {
@@ -1014,6 +1015,20 @@ class Matcher {
1014
1015
  return this.groups[2 * group + 1];
1015
1016
  }
1016
1017
 
1018
+ /**
1019
+ * Returns the program size of this pattern.
1020
+ *
1021
+ * <p>
1022
+ * Similar to the C++ implementation, the program size is a very approximate measure of a regexp's
1023
+ * "cost". Larger numbers are more expensive than smaller numbers.
1024
+ * </p>
1025
+ *
1026
+ * @return the program size of this pattern
1027
+ */
1028
+ programSize() {
1029
+ return this.numberOfInstructions;
1030
+ }
1031
+
1017
1032
  /**
1018
1033
  * Returns the named group of the most recent match, or {@code null} if the group was not matched.
1019
1034
  * @param {string|number} [group=0]
@@ -1654,7 +1669,7 @@ class Regexp {
1654
1669
  // subexpressions, if any. Never null.
1655
1670
  // subs[0] is used as the freelist.
1656
1671
  this.subs = Regexp.emptySubs();
1657
- this.runes = null; // matched runes, for LITERAL, CHAR_CLASS
1672
+ this.runes = []; // matched runes, for LITERAL, CHAR_CLASS
1658
1673
  this.min = 0; // min for REPEAT
1659
1674
  this.max = 0; // max for REPEAT
1660
1675
  this.cap = 0; // capturing index, for CAPTURE
@@ -1664,7 +1679,7 @@ class Regexp {
1664
1679
  reinit() {
1665
1680
  this.flags = 0;
1666
1681
  this.subs = Regexp.emptySubs();
1667
- this.runes = null;
1682
+ this.runes = [];
1668
1683
  this.cap = 0;
1669
1684
  this.min = 0;
1670
1685
  this.max = 0;
@@ -1965,7 +1980,7 @@ class Inst {
1965
1980
  this.arg = 0; // ALT, ALT_MATCH, CAPTURE, EMPTY_WIDTH
1966
1981
  // length==1 => exact match
1967
1982
  // otherwise a list of [lo,hi] pairs. hi is *inclusive*.
1968
- this.runes = null;
1983
+ this.runes = [];
1969
1984
  }
1970
1985
 
1971
1986
  // MatchRune returns true if the instruction matches (and consumes) r.
@@ -2359,7 +2374,7 @@ class Compiler {
2359
2374
  i.runes = runes;
2360
2375
  flags &= RE2Flags.FOLD_CASE;
2361
2376
  if (runes.length !== 1 || Unicode.simpleFold(runes[0]) === runes[0]) {
2362
- flags &= ~RE2Flags.FOLD_CASE;
2377
+ flags &= -2;
2363
2378
  }
2364
2379
  i.arg = flags;
2365
2380
  f.out = f.i << 1;
@@ -2476,7 +2491,7 @@ class Simplify {
2476
2491
  const nsub = Simplify.simplify(sub);
2477
2492
  if (nre === re && nsub !== sub) {
2478
2493
  nre = Regexp.fromRegexp(re);
2479
- nre.runes = null;
2494
+ nre.runes = [];
2480
2495
  nre.subs = re.subs.slice(0, re.subs.length);
2481
2496
  }
2482
2497
  if (nre !== re) {
@@ -2617,7 +2632,7 @@ class CharGroup {
2617
2632
  const code1 = [0x30, 0x39];
2618
2633
  const code2 = [0x9, 0xa, 0xc, 0xd, 0x20, 0x20];
2619
2634
  const code3 = [0x30, 0x39, 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a];
2620
- const PERL_GROUPS = new Map([['\\d', new CharGroup(+1, code1)], ['\\D', new CharGroup(-1, code1)], ['\\s', new CharGroup(+1, code2)], ['\\S', new CharGroup(-1, code2)], ['\\w', new CharGroup(+1, code3)], ['\\W', new CharGroup(-1, code3)]]);
2635
+ const PERL_GROUPS = new Map([['\\d', new CharGroup(1, code1)], ['\\D', new CharGroup(-1, code1)], ['\\s', new CharGroup(1, code2)], ['\\S', new CharGroup(-1, code2)], ['\\w', new CharGroup(1, code3)], ['\\W', new CharGroup(-1, code3)]]);
2621
2636
  const code4 = [0x30, 0x39, 0x41, 0x5a, 0x61, 0x7a];
2622
2637
  const code5 = [0x41, 0x5a, 0x61, 0x7a];
2623
2638
  const code6 = [0x0, 0x7f];
@@ -2632,7 +2647,7 @@ const code14 = [0x9, 0xd, 0x20, 0x20];
2632
2647
  const code15 = [0x41, 0x5a];
2633
2648
  const code16 = [0x30, 0x39, 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a];
2634
2649
  const code17 = [0x30, 0x39, 0x41, 0x46, 0x61, 0x66];
2635
- const POSIX_GROUPS = new Map([['[:alnum:]', new CharGroup(+1, code4)], ['[:^alnum:]', new CharGroup(-1, code4)], ['[:alpha:]', new CharGroup(+1, code5)], ['[:^alpha:]', new CharGroup(-1, code5)], ['[:ascii:]', new CharGroup(+1, code6)], ['[:^ascii:]', new CharGroup(-1, code6)], ['[:blank:]', new CharGroup(+1, code7)], ['[:^blank:]', new CharGroup(-1, code7)], ['[:cntrl:]', new CharGroup(+1, code8)], ['[:^cntrl:]', new CharGroup(-1, code8)], ['[:digit:]', new CharGroup(+1, code9)], ['[:^digit:]', new CharGroup(-1, code9)], ['[:graph:]', new CharGroup(+1, code10)], ['[:^graph:]', new CharGroup(-1, code10)], ['[:lower:]', new CharGroup(+1, code11)], ['[:^lower:]', new CharGroup(-1, code11)], ['[:print:]', new CharGroup(+1, code12)], ['[:^print:]', new CharGroup(-1, code12)], ['[:punct:]', new CharGroup(+1, code13)], ['[:^punct:]', new CharGroup(-1, code13)], ['[:space:]', new CharGroup(+1, code14)], ['[:^space:]', new CharGroup(-1, code14)], ['[:upper:]', new CharGroup(+1, code15)], ['[:^upper:]', new CharGroup(-1, code15)], ['[:word:]', new CharGroup(+1, code16)], ['[:^word:]', new CharGroup(-1, code16)], ['[:xdigit:]', new CharGroup(+1, code17)], ['[:^xdigit:]', new CharGroup(-1, code17)]]);
2650
+ const POSIX_GROUPS = new Map([['[:alnum:]', new CharGroup(1, code4)], ['[:^alnum:]', new CharGroup(-1, code4)], ['[:alpha:]', new CharGroup(1, code5)], ['[:^alpha:]', new CharGroup(-1, code5)], ['[:ascii:]', new CharGroup(1, code6)], ['[:^ascii:]', new CharGroup(-1, code6)], ['[:blank:]', new CharGroup(1, code7)], ['[:^blank:]', new CharGroup(-1, code7)], ['[:cntrl:]', new CharGroup(1, code8)], ['[:^cntrl:]', new CharGroup(-1, code8)], ['[:digit:]', new CharGroup(1, code9)], ['[:^digit:]', new CharGroup(-1, code9)], ['[:graph:]', new CharGroup(1, code10)], ['[:^graph:]', new CharGroup(-1, code10)], ['[:lower:]', new CharGroup(1, code11)], ['[:^lower:]', new CharGroup(-1, code11)], ['[:print:]', new CharGroup(1, code12)], ['[:^print:]', new CharGroup(-1, code12)], ['[:punct:]', new CharGroup(1, code13)], ['[:^punct:]', new CharGroup(-1, code13)], ['[:space:]', new CharGroup(1, code14)], ['[:^space:]', new CharGroup(-1, code14)], ['[:upper:]', new CharGroup(1, code15)], ['[:^upper:]', new CharGroup(-1, code15)], ['[:word:]', new CharGroup(1, code16)], ['[:^word:]', new CharGroup(-1, code16)], ['[:xdigit:]', new CharGroup(1, code17)], ['[:^xdigit:]', new CharGroup(-1, code17)]]);
2636
2651
 
2637
2652
  /**
2638
2653
  * A "builder"-style helper class for manipulating character classes represented as an array of
@@ -2671,7 +2686,7 @@ class CharClass {
2671
2686
  // qsortIntPair() quicksorts pairs of ints in |array| according to lt().
2672
2687
  // Precondition: |left|, |right|, |this.len| must all be even; |this.len > 1|.
2673
2688
  static qsortIntPair(array, left, right) {
2674
- const pivotIndex = ((left + right) / 2 | 0) & ~1;
2689
+ const pivotIndex = ((left + right) / 2 | 0) & -2;
2675
2690
  const pivotFrom = array[pivotIndex];
2676
2691
  const pivotTo = array[pivotIndex + 1];
2677
2692
  let i = left;
@@ -3056,6 +3071,48 @@ class Parser {
3056
3071
  static ERR_MISSING_REPEAT_ARGUMENT = 'missing argument to repetition operator';
3057
3072
  static ERR_TRAILING_BACKSLASH = 'trailing backslash at end of expression';
3058
3073
  static ERR_DUPLICATE_NAMED_CAPTURE = 'duplicate capture group name';
3074
+ static ERR_UNEXPECTED_PAREN = 'unexpected )';
3075
+ static ERR_NESTING_DEPTH = 'expression nests too deeply';
3076
+ static ERR_LARGE = 'expression too large';
3077
+
3078
+ // maxHeight is the maximum height of a regexp parse tree.
3079
+ // It is somewhat arbitrarily chosen, but the idea is to be large enough
3080
+ // that no one will actually hit in real use but at the same time small enough
3081
+ // that recursion on the Regexp tree will not hit the 1GB Go stack limit.
3082
+ // The maximum amount of stack for a single recursive frame is probably
3083
+ // closer to 1kB, so this could potentially be raised, but it seems unlikely
3084
+ // that people have regexps nested even this deeply.
3085
+ // We ran a test on Google's C++ code base and turned up only
3086
+ // a single use case with depth > 100; it had depth 128.
3087
+ // Using depth 1000 should be plenty of margin.
3088
+ // As an optimization, we don't even bother calculating heights
3089
+ // until we've allocated at least maxHeight Regexp structures.
3090
+ static MAX_HEIGHT = 1000;
3091
+
3092
+ // maxSize is the maximum size of a compiled regexp in Insts.
3093
+ // It too is somewhat arbitrarily chosen, but the idea is to be large enough
3094
+ // to allow significant regexps while at the same time small enough that
3095
+ // the compiled form will not take up too much memory.
3096
+ // 128 MB is enough for about 3.3 million Inst structures, which roughly
3097
+ // corresponds to a 3.3 MB regexp.
3098
+ static MAX_SIZE = 3355443; // (128 << 20) / (5 * 8), rounded down (instSize = byte, 2 uint32, slice is 5 64-bit words)
3099
+
3100
+ // maxRunes is the maximum number of runes allowed in a regexp tree
3101
+ // counting the runes in all the nodes.
3102
+ // Ignoring character classes, this.numRunes is always less than the length of the regexp.
3103
+ // Character classes can make it much larger: each \pL adds 1292 runes.
3104
+ // 128 MB is enough for 32M runes, which is over 26k \pL instances.
3105
+ // Note that repetitions do not make copies of the rune slices,
3106
+ // so \pL{1000} is only one rune slice, not 1000.
3107
+ // We could keep a cache of character classes we've seen,
3108
+ // so that all the \pL we see use the same rune list,
3109
+ // but that doesn't remove the problem entirely:
3110
+ // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
3111
+ // And because the Rune slice is exposed directly in the Regexp,
3112
+ // there is not an opportunity to change the representation to allow
3113
+ // partial sharing between different character classes.
3114
+ // So the limit is the best we can do.
3115
+ static MAX_RUNES = 33554432; // (128 << 20) / 4 (runeSize, int32 is 4 bytes)
3059
3116
 
3060
3117
  // RangeTables are represented as int[][], a list of triples (start, end,
3061
3118
  // stride).
@@ -3354,7 +3411,7 @@ class Parser {
3354
3411
  case Codepoint.CODES.get('v'):
3355
3412
  return Codepoint.CODES.get('\v');
3356
3413
  default:
3357
- if (!Utils.isalnum(c)) {
3414
+ if (c <= Unicode.MAX_ASCII && !Utils.isalnum(c)) {
3358
3415
  return c;
3359
3416
  }
3360
3417
  break;
@@ -3388,6 +3445,12 @@ class Parser {
3388
3445
  // Stack of parsed expressions.
3389
3446
  this.stack = [];
3390
3447
  this.free = null;
3448
+ // checks
3449
+ this.numRegexp = 0; // number of regexps allocated
3450
+ this.numRunes = 0; // number of runes in char classes
3451
+ this.repeats = 0; // product of all repetitions seen
3452
+ this.height = null; // regexp height, for height limit check
3453
+ this.size = null; // regexp compiled size, for size limit check
3391
3454
  }
3392
3455
 
3393
3456
  // Allocate a Regexp, from the free list if possible.
@@ -3399,15 +3462,159 @@ class Parser {
3399
3462
  re.op = op;
3400
3463
  } else {
3401
3464
  re = new Regexp(op);
3465
+ this.numRegexp += 1;
3402
3466
  }
3403
3467
  return re;
3404
3468
  }
3405
3469
  reuse(re) {
3470
+ if (this.height !== null && Object.prototype.hasOwnProperty.call(this.height, re)) {
3471
+ delete this.height[re];
3472
+ }
3406
3473
  if (re.subs !== null && re.subs.length > 0) {
3407
3474
  re.subs[0] = this.free;
3408
3475
  }
3409
3476
  this.free = re;
3410
3477
  }
3478
+ checkLimits(re) {
3479
+ if (this.numRunes > Parser.MAX_RUNES) {
3480
+ throw new RE2JSSyntaxException(Parser.ERR_LARGE);
3481
+ }
3482
+ this.checkSize(re);
3483
+ this.checkHeight(re);
3484
+ }
3485
+ checkSize(re) {
3486
+ if (this.size === null) {
3487
+ // We haven't started tracking size yet.
3488
+ // Do a relatively cheap check to see if we need to start.
3489
+ // Maintain the product of all the repeats we've seen
3490
+ // and don't track if the total number of regexp nodes
3491
+ // we've seen times the repeat product is in budget.
3492
+ if (this.repeats === 0) {
3493
+ this.repeats = 1;
3494
+ }
3495
+ if (re.op === Regexp.Op.REPEAT) {
3496
+ let n = re.max;
3497
+ if (n === -1) {
3498
+ n = re.min;
3499
+ }
3500
+ if (n <= 0) {
3501
+ n = 1;
3502
+ }
3503
+ if (n > Parser.MAX_SIZE / this.repeats) {
3504
+ this.repeats = Parser.MAX_SIZE;
3505
+ } else {
3506
+ this.repeats *= n;
3507
+ }
3508
+ }
3509
+ if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
3510
+ return;
3511
+ }
3512
+
3513
+ // We need to start tracking size.
3514
+ // Make the map and belatedly populate it
3515
+ // with info about everything we've constructed so far.
3516
+ this.size = {};
3517
+ for (let reEx of this.stack) {
3518
+ this.checkSize(reEx);
3519
+ }
3520
+ }
3521
+ if (this.calcSize(re, true) > Parser.MAX_SIZE) {
3522
+ throw new RE2JSSyntaxException(Parser.ERR_LARGE);
3523
+ }
3524
+ }
3525
+ calcSize(re, force = false) {
3526
+ if (!force) {
3527
+ if (Object.prototype.hasOwnProperty.call(this.size, re)) {
3528
+ return this.size[re];
3529
+ }
3530
+ }
3531
+ let size = 0;
3532
+ switch (re.op) {
3533
+ case Regexp.Op.LITERAL:
3534
+ {
3535
+ size = re.runes.length;
3536
+ break;
3537
+ }
3538
+ case Regexp.Op.CAPTURE:
3539
+ case Regexp.Op.STAR:
3540
+ {
3541
+ // star can be 1+ or 2+; assume 2 pessimistically
3542
+ size = 2 + this.calcSize(re.subs[0]);
3543
+ break;
3544
+ }
3545
+ case Regexp.Op.PLUS:
3546
+ case Regexp.Op.QUEST:
3547
+ {
3548
+ size = 1 + this.calcSize(re.subs[0]);
3549
+ break;
3550
+ }
3551
+ case Regexp.Op.CONCAT:
3552
+ {
3553
+ for (let sub of re.subs) {
3554
+ size = size + this.calcSize(sub);
3555
+ }
3556
+ break;
3557
+ }
3558
+ case Regexp.Op.ALTERNATE:
3559
+ {
3560
+ for (let sub of re.subs) {
3561
+ size = size + this.calcSize(sub);
3562
+ }
3563
+ if (re.subs.length > 1) {
3564
+ size = size + re.subs.length - 1;
3565
+ }
3566
+ break;
3567
+ }
3568
+ case Regexp.Op.REPEAT:
3569
+ {
3570
+ let sub = this.calcSize(re.subs[0]);
3571
+ if (re.max === -1) {
3572
+ if (re.min === 0) {
3573
+ size = 2 + sub; // x*
3574
+ } else {
3575
+ size = 1 + re.min * sub; // xxx+
3576
+ }
3577
+ break;
3578
+ }
3579
+ // x{2,5} = xx(x(x(x)?)?)?
3580
+ size = re.max * sub + (re.max - re.min);
3581
+ break;
3582
+ }
3583
+ }
3584
+ size = Math.max(1, size);
3585
+ this.size[re] = size;
3586
+ return size;
3587
+ }
3588
+ checkHeight(re) {
3589
+ if (this.numRegexp < Parser.MAX_HEIGHT) {
3590
+ return;
3591
+ }
3592
+ if (this.height === null) {
3593
+ this.height = {};
3594
+ for (let reEx of this.stack) {
3595
+ this.checkHeight(reEx);
3596
+ }
3597
+ }
3598
+ if (this.calcHeight(re, true) > Parser.MAX_HEIGHT) {
3599
+ throw new RE2JSSyntaxException(Parser.ERR_NESTING_DEPTH);
3600
+ }
3601
+ }
3602
+ calcHeight(re, force = false) {
3603
+ if (!force) {
3604
+ if (Object.prototype.hasOwnProperty.call(this.height, re)) {
3605
+ return this.height[re];
3606
+ }
3607
+ }
3608
+ let h = 1;
3609
+ for (let sub of re.subs) {
3610
+ const hsub = this.calcHeight(sub);
3611
+ if (h < 1 + hsub) {
3612
+ h = 1 + hsub;
3613
+ }
3614
+ }
3615
+ this.height[re] = h;
3616
+ return h;
3617
+ }
3411
3618
 
3412
3619
  // Parse stack manipulation.
3413
3620
 
@@ -3428,13 +3635,14 @@ class Parser {
3428
3635
  // push pushes the regexp re onto the parse stack and returns the regexp.
3429
3636
  // Returns null for a CHAR_CLASS that can be merged with the top-of-stack.
3430
3637
  push(re) {
3638
+ this.numRunes += re.runes.length;
3431
3639
  if (re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 2 && re.runes[0] === re.runes[1]) {
3432
- if (this.maybeConcat(re.runes[0], this.flags & ~RE2Flags.FOLD_CASE)) {
3640
+ if (this.maybeConcat(re.runes[0], this.flags & -2)) {
3433
3641
  return null;
3434
3642
  }
3435
3643
  re.op = Regexp.Op.LITERAL;
3436
3644
  re.runes = [re.runes[0]];
3437
- re.flags = this.flags & ~RE2Flags.FOLD_CASE;
3645
+ re.flags = this.flags & -2;
3438
3646
  } else if (re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 4 && re.runes[0] === re.runes[1] && re.runes[2] === re.runes[3] && Unicode.simpleFold(re.runes[0]) === re.runes[2] && Unicode.simpleFold(re.runes[2]) === re.runes[0] || re.op === Regexp.Op.CHAR_CLASS && re.runes.length === 2 && re.runes[0] + 1 === re.runes[1] && Unicode.simpleFold(re.runes[0]) === re.runes[1] && Unicode.simpleFold(re.runes[1]) === re.runes[0]) {
3439
3647
  // Case-insensitive rune like [Aa] or [Δδ].
3440
3648
  if (this.maybeConcat(re.runes[0], this.flags | RE2Flags.FOLD_CASE)) {
@@ -3449,6 +3657,7 @@ class Parser {
3449
3657
  this.maybeConcat(-1, 0);
3450
3658
  }
3451
3659
  this.stack.push(re);
3660
+ this.checkLimits(re);
3452
3661
  return re;
3453
3662
  }
3454
3663
 
@@ -3542,6 +3751,43 @@ class Parser {
3542
3751
  re.flags = flags;
3543
3752
  re.subs = [sub];
3544
3753
  this.stack[n - 1] = re;
3754
+ this.checkLimits(re);
3755
+ if (op === Regexp.Op.REPEAT && (min >= 2 || max >= 2) && !this.repeatIsValid(re, 1000)) {
3756
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(beforePos));
3757
+ }
3758
+ }
3759
+
3760
+ // repeatIsValid reports whether the repetition re is valid.
3761
+ // Valid means that the combination of the top-level repetition
3762
+ // and any inner repetitions does not exceed n copies of the
3763
+ // innermost thing.
3764
+ // This function rewalks the regexp tree and is called for every repetition,
3765
+ // so we have to worry about inducing quadratic behavior in the parser.
3766
+ // We avoid this by only calling repeatIsValid when min or max >= 2.
3767
+ // In that case the depth of any >= 2 nesting can only get to 9 without
3768
+ // triggering a parse error, so each subtree can only be rewalked 9 times.
3769
+ repeatIsValid(re, n) {
3770
+ if (re.op === Regexp.Op.REPEAT) {
3771
+ let m = re.max;
3772
+ if (m === 0) {
3773
+ return true;
3774
+ }
3775
+ if (m < 0) {
3776
+ m = re.min;
3777
+ }
3778
+ if (m > n) {
3779
+ return false;
3780
+ }
3781
+ if (m > 0) {
3782
+ n = Math.trunc(n / m);
3783
+ }
3784
+ }
3785
+ for (let sub of re.subs) {
3786
+ if (!this.repeatIsValid(sub, n)) {
3787
+ return false;
3788
+ }
3789
+ }
3790
+ return true;
3545
3791
  }
3546
3792
 
3547
3793
  // concat replaces the top of the stack (above the topmost '|' or '(') with
@@ -3579,10 +3825,10 @@ class Parser {
3579
3825
  if (re.op === Regexp.Op.CHAR_CLASS) {
3580
3826
  re.runes = new CharClass(re.runes).cleanClass().toArray();
3581
3827
  if (re.runes.length === 2 && re.runes[0] === 0 && re.runes[1] === Unicode.MAX_RUNE) {
3582
- re.runes = null;
3828
+ re.runes = [];
3583
3829
  re.op = Regexp.Op.ANY_CHAR;
3584
3830
  } else if (re.runes.length === 4 && re.runes[0] === 0 && re.runes[1] === Codepoint.CODES.get('\n') - 1 && re.runes[2] === Codepoint.CODES.get('\n') + 1 && re.runes[3] === Unicode.MAX_RUNE) {
3585
- re.runes = null;
3831
+ re.runes = [];
3586
3832
  re.op = Regexp.Op.ANY_CHAR_NOT_NL;
3587
3833
  }
3588
3834
  }
@@ -3717,6 +3963,7 @@ class Parser {
3717
3963
  prefix.runes = str.slice(0, strlen);
3718
3964
  for (let j = start; j < i; j++) {
3719
3965
  array[s + j] = this.removeLeadingString(array[s + j], strlen);
3966
+ this.checkLimits(array[s + j]);
3720
3967
  }
3721
3968
  // Recurse.
3722
3969
  const suffix = this.collapse(array.slice(s + start, s + i), Regexp.Op.ALTERNATE);
@@ -3766,6 +4013,7 @@ class Parser {
3766
4013
  for (let j = start; j < i; j++) {
3767
4014
  const reuse = j !== start; // prefix came from sub[start]
3768
4015
  array[s + j] = this.removeLeadingRegexp(array[s + j], reuse);
4016
+ this.checkLimits(array[s + j]);
3769
4017
  }
3770
4018
  // recurse
3771
4019
  const suffix = this.collapse(array.slice(s + start, s + i), Regexp.Op.ALTERNATE);
@@ -4149,7 +4397,7 @@ class Parser {
4149
4397
  t.skip(2); // "(?"
4150
4398
 
4151
4399
  let flags = this.flags;
4152
- let sign = +1;
4400
+ let sign = 1;
4153
4401
  let sawFlag = false;
4154
4402
  loop: while (t.more()) {
4155
4403
  {
@@ -4160,7 +4408,7 @@ class Parser {
4160
4408
  sawFlag = true;
4161
4409
  break;
4162
4410
  case Codepoint.CODES.get('m'):
4163
- flags &= ~RE2Flags.ONE_LINE;
4411
+ flags &= -17;
4164
4412
  sawFlag = true;
4165
4413
  break;
4166
4414
  case Codepoint.CODES.get('s'):
@@ -4266,12 +4514,12 @@ class Parser {
4266
4514
  this.alternate();
4267
4515
  const n = this.stack.length;
4268
4516
  if (n < 2) {
4269
- throw new RE2JSSyntaxException(Parser.ERR_INTERNAL_ERROR, 'stack underflow');
4517
+ throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
4270
4518
  }
4271
4519
  const re1 = this.pop();
4272
4520
  const re2 = this.pop();
4273
4521
  if (re2.op !== Regexp.Op.LEFT_PAREN) {
4274
- throw new RE2JSSyntaxException(Parser.ERR_MISSING_PAREN, this.wholeRegexp);
4522
+ throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
4275
4523
  }
4276
4524
  // Restore flags at time of paren.
4277
4525
  this.flags = re2.flags;
@@ -4341,7 +4589,7 @@ class Parser {
4341
4589
  }
4342
4590
  t.skip(1); // '\\'
4343
4591
  // Committed to parse or throw exception.
4344
- let sign = +1;
4592
+ let sign = 1;
4345
4593
  let c = t.pop(); // 'p' or 'P'
4346
4594
  if (c === Codepoint.CODES.get('P')) {
4347
4595
  sign = -1;
@@ -4405,7 +4653,7 @@ class Parser {
4405
4653
  const re = this.newRegexp(Regexp.Op.CHAR_CLASS);
4406
4654
  re.flags = this.flags;
4407
4655
  const cc = new CharClass();
4408
- let sign = +1;
4656
+ let sign = 1;
4409
4657
  if (t.more() && t.lookingAt('^')) {
4410
4658
  sign = -1;
4411
4659
  t.skip(1); // '^'
@@ -4948,6 +5196,13 @@ class RE2 {
4948
5196
  return this.numSubexp;
4949
5197
  }
4950
5198
 
5199
+ /**
5200
+ * Returns the number of instructions in this compiled regular expression program.
5201
+ */
5202
+ numberOfInstructions() {
5203
+ return this.prog.numInst();
5204
+ }
5205
+
4951
5206
  // get() returns a machine to use for matching |this|. It uses |this|'s
4952
5207
  // machine cache if possible, to avoid unnecessary allocation.
4953
5208
  get() {
@@ -5580,7 +5835,7 @@ class RE2JS {
5580
5835
  }
5581
5836
  let re2Flags = RE2Flags.PERL;
5582
5837
  if ((flags & RE2JS.DISABLE_UNICODE_GROUPS) !== 0) {
5583
- re2Flags &= ~RE2Flags.UNICODE_GROUPS;
5838
+ re2Flags &= -129;
5584
5839
  }
5585
5840
  const p = new RE2JS(regex, flags);
5586
5841
  // The compiled RE2 regexp.
@@ -5748,6 +6003,20 @@ class RE2JS {
5748
6003
  return this.patternInput;
5749
6004
  }
5750
6005
 
6006
+ /**
6007
+ * Returns the program size of this pattern.
6008
+ *
6009
+ * <p>
6010
+ * Similar to the C++ implementation, the program size is a very approximate measure of a regexp's
6011
+ * "cost". Larger numbers are more expensive than smaller numbers.
6012
+ * </p>
6013
+ *
6014
+ * @return the program size of this pattern
6015
+ */
6016
+ programSize() {
6017
+ return this.re2Input.numberOfInstructions();
6018
+ }
6019
+
5751
6020
  /**
5752
6021
  * Returns the number of capturing groups in this matcher's pattern. Group zero denotes the entire
5753
6022
  * pattern and is excluded from this count.