re2js 2.6.1 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.6.1
5
+ * @version v2.7.1
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -2159,9 +2159,9 @@
2159
2159
  case Inst.NOP:
2160
2160
  return `nop -> ${this.out}`;
2161
2161
  case Inst.LB_WRITE:
2162
- return `lbwrite ${this.lb} -> ${this.out}`;
2162
+ return `lbwrite ${this.arg} -> ${this.out}`;
2163
2163
  case Inst.LB_CHECK:
2164
- return `lbcheck ${this.lb} -> ${this.out}, ${this.arg}`;
2164
+ return `lbcheck ${this.arg} -> ${this.out}`;
2165
2165
  case Inst.RUNE:
2166
2166
  if (this.runes === null) {
2167
2167
  return 'rune <null>';
@@ -2341,30 +2341,35 @@
2341
2341
  }
2342
2342
  this.matched = false;
2343
2343
  this.matchcap.fill(-1);
2344
+
2345
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2346
+ // even if the main pattern search is requested to start mid-string.
2347
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2348
+ let matchStartPos = pos;
2344
2349
  let runq = this.q0;
2345
2350
  let nextq = this.q1;
2346
- let r = input.step(pos);
2351
+ let r = input.step(currentPos);
2347
2352
  let rune = r >> 3;
2348
2353
  let width = r & 7;
2349
2354
  let rune1 = -1;
2350
2355
  let width1 = 0;
2351
2356
  if (r !== MachineInputBase.EOF()) {
2352
- r = input.step(pos + width);
2357
+ r = input.step(currentPos + width);
2353
2358
  rune1 = r >> 3;
2354
2359
  width1 = r & 7;
2355
2360
  }
2356
2361
  let flag;
2357
- if (pos === 0) {
2362
+ if (currentPos === 0) {
2358
2363
  flag = Utils.emptyOpContext(-1, rune);
2359
2364
  } else {
2360
- flag = input.context(pos);
2365
+ flag = input.context(currentPos);
2361
2366
  }
2362
2367
  while (true) {
2363
2368
  if (runq.isEmpty()) {
2364
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
2369
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) {
2365
2370
  break;
2366
2371
  }
2367
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2372
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2368
2373
  break;
2369
2374
  }
2370
2375
  if (this.matched) {
@@ -2374,43 +2379,50 @@
2374
2379
  // Fast-forwarding the string pointer will skip over the positions where
2375
2380
  // the parallel lookbehind automata need to be spawned.
2376
2381
  if (this.prog.numLb === 0 && !(this.re2.prefix.length === 0) && rune1 !== this.re2.prefixRune && input.canCheckPrefix()) {
2377
- const advance = input.index(this.re2, pos);
2382
+ const advance = input.index(this.re2, currentPos);
2378
2383
  if (advance < 0) {
2379
2384
  break;
2380
2385
  }
2381
- pos += advance;
2382
- r = input.step(pos);
2386
+ currentPos += advance;
2387
+ r = input.step(currentPos);
2383
2388
  rune = r >> 3;
2384
2389
  width = r & 7;
2385
- r = input.step(pos + width);
2390
+ r = input.step(currentPos + width);
2386
2391
  rune1 = r >> 3;
2387
2392
  width1 = r & 7;
2388
2393
  }
2389
2394
  }
2390
- if (!this.matched && (pos === 0 || anchor === RE2Flags.UNANCHORED)) {
2391
- if (this.ncap > 0) {
2392
- this.matchcap[0] = pos;
2393
- }
2394
- // Spawn Lookbehind threads BEFORE the main pattern
2395
+
2396
+ // Optimize lookbehind spawning. Because lookbehinds are prefixed with `.*` by the compiler,
2397
+ // they only need to be spawned exactly once at the beginning of the string (currentPos === 0).
2398
+ if (currentPos === 0 && this.prog.numLb > 0) {
2395
2399
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2396
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2400
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2401
+ }
2402
+ }
2403
+ if (!this.matched && (currentPos === 0 || anchor === RE2Flags.UNANCHORED)) {
2404
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2405
+ if (currentPos >= matchStartPos) {
2406
+ if (this.ncap > 0) {
2407
+ this.matchcap[0] = currentPos;
2408
+ }
2409
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2397
2410
  }
2398
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2399
2411
  }
2400
- const nextPos = pos + width;
2412
+ const nextPos = currentPos + width;
2401
2413
  flag = input.context(nextPos);
2402
- this.step(runq, nextq, pos, nextPos, rune, flag, anchor, pos === input.endPos());
2414
+ this.step(runq, nextq, currentPos, nextPos, rune, flag, anchor, currentPos === input.endPos());
2403
2415
  if (width === 0) {
2404
2416
  break;
2405
2417
  }
2406
2418
  if (this.ncap === 0 && this.matched) {
2407
2419
  break;
2408
2420
  }
2409
- pos += width;
2421
+ currentPos += width;
2410
2422
  rune = rune1;
2411
2423
  width = width1;
2412
2424
  if (rune !== -1) {
2413
- r = input.step(pos + width);
2425
+ r = input.step(currentPos + width);
2414
2426
  rune1 = r >> 3;
2415
2427
  width1 = r & 7;
2416
2428
  }
@@ -2427,35 +2439,46 @@
2427
2439
  if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2428
2440
  return [];
2429
2441
  }
2442
+
2443
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2444
+ // even if the main pattern search is requested to start mid-string.
2445
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2446
+ let matchStartPos = pos;
2430
2447
  let runq = this.q0;
2431
2448
  let nextq = this.q1;
2432
- let r = input.step(pos);
2449
+ let r = input.step(currentPos);
2433
2450
  let rune = r >> 3;
2434
2451
  let width = r & 7;
2435
2452
  let rune1 = -1;
2436
2453
  let width1 = 0;
2437
2454
  if (r !== MachineInputBase.EOF()) {
2438
- r = input.step(pos + width);
2455
+ r = input.step(currentPos + width);
2439
2456
  rune1 = r >> 3;
2440
2457
  width1 = r & 7;
2441
2458
  }
2442
- let flag = pos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(pos);
2459
+ let flag = currentPos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(currentPos);
2443
2460
  const matches = new Set();
2444
2461
  while (true) {
2445
2462
  if (runq.isEmpty()) {
2446
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
2447
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2463
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) break;
2464
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2448
2465
  break;
2449
2466
  }
2450
2467
  }
2451
- if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
2452
- // Spawn Lookbehind threads BEFORE the main pattern
2468
+
2469
+ // Optimize lookbehind spawning to exactly once at BOF
2470
+ if (currentPos === 0 && this.prog.numLb > 0) {
2453
2471
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2454
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2472
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2473
+ }
2474
+ }
2475
+ if (currentPos === 0 || anchor === RE2Flags.UNANCHORED) {
2476
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2477
+ if (currentPos >= matchStartPos) {
2478
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2455
2479
  }
2456
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2457
2480
  }
2458
- const nextPos = pos + width;
2481
+ const nextPos = currentPos + width;
2459
2482
  flag = input.context(nextPos);
2460
2483
  for (let j = 0; j < runq.size; j++) {
2461
2484
  let t = runq.denseThreads[j];
@@ -2464,7 +2487,7 @@
2464
2487
  let add = false;
2465
2488
  switch (i.op) {
2466
2489
  case Inst.MATCH:
2467
- if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) break;
2490
+ if (anchor === RE2Flags.ANCHOR_BOTH && currentPos !== input.endPos()) break;
2468
2491
  matches.add(i.arg); // Record the matched Set ID
2469
2492
  break;
2470
2493
  case Inst.RUNE:
@@ -2492,11 +2515,11 @@
2492
2515
  }
2493
2516
  runq.clear();
2494
2517
  if (width === 0) break;
2495
- pos += width;
2518
+ currentPos += width;
2496
2519
  rune = rune1;
2497
2520
  width = width1;
2498
2521
  if (rune !== -1) {
2499
- r = input.step(pos + width);
2522
+ r = input.step(currentPos + width);
2500
2523
  rune1 = r >> 3;
2501
2524
  width1 = r & 7;
2502
2525
  }
@@ -2600,17 +2623,17 @@
2600
2623
  continue;
2601
2624
  }
2602
2625
  case Inst.LB_WRITE:
2603
- this.lbTable[Math.abs(inst.lb)] = pos;
2626
+ this.lbTable[Math.abs(inst.arg)] = pos;
2604
2627
  pc = inst.out;
2605
2628
  continue;
2606
2629
  case Inst.LB_CHECK:
2607
- if (inst.lb > 0) {
2630
+ if (inst.arg > 0) {
2608
2631
  // Positive Lookbehind
2609
- if (this.lbTable[inst.lb] === pos) {
2632
+ if (this.lbTable[inst.arg] === pos) {
2610
2633
  pc = inst.out; // Flattened tail recursion
2611
2634
  continue;
2612
2635
  }
2613
- } else if (this.lbTable[-inst.lb] !== pos) {
2636
+ } else if (this.lbTable[-inst.arg] !== pos) {
2614
2637
  // Negative Lookbehind
2615
2638
  pc = inst.out; // Flattened tail recursion
2616
2639
  continue;
@@ -4690,7 +4713,7 @@
4690
4713
  }
4691
4714
  lookBehind(a, lb) {
4692
4715
  const id = this.newInst(Inst.LB_WRITE);
4693
- this.prog.getInst(id.i).lb = lb;
4716
+ this.prog.getInst(id.i).arg = lb;
4694
4717
 
4695
4718
  // Create the prefix wildcard `.*` for the lookbehind automaton
4696
4719
  const any = this.rune(Compiler.ANY_RUNE(), 0);
@@ -4698,7 +4721,7 @@
4698
4721
  const lbAutomaton = this.cat(dotStar, a);
4699
4722
  this.prog.patch(lbAutomaton.out, id.i);
4700
4723
  const checkId = this.newInst(Inst.LB_CHECK);
4701
- this.prog.getInst(checkId.i).lb = lb;
4724
+ this.prog.getInst(checkId.i).arg = lb;
4702
4725
 
4703
4726
  // Save the starting point of this lookbehind automaton
4704
4727
  this.prog.lbStarts.push(lbAutomaton.i);
@@ -5473,6 +5496,7 @@
5473
5496
  static ERR_UNEXPECTED_PAREN = 'unexpected )';
5474
5497
  static ERR_NESTING_DEPTH = 'expression nests too deeply';
5475
5498
  static ERR_LARGE = 'expression too large';
5499
+ static ERR_INVALID_CAPTURE_IN_LOOKBEHIND = 'invalid capture in lookbehind';
5476
5500
 
5477
5501
  // maxHeight is the maximum height of a regexp parse tree.
5478
5502
  // It is somewhat arbitrarily chosen, but the idea is to be large enough
@@ -5876,6 +5900,18 @@
5876
5900
  }
5877
5901
  return x;
5878
5902
  }
5903
+
5904
+ // Recursively checks whether re, or any node reachable through its subs, has op Regexp.Op.CAPTURE; a null re yields false.
5905
+ static hasCapture(re) {
5906
+ if (re === null) return false;
5907
+ if (re.op === Regexp.Op.CAPTURE) return true;
5908
+ if (re.subs) {
5909
+ for (let sub of re.subs) {
5910
+ if (Parser.hasCapture(sub)) return true;
5911
+ }
5912
+ }
5913
+ return false;
5914
+ }
5879
5915
  constructor(wholeRegexp, flags = 0) {
5880
5916
  this.wholeRegexp = wholeRegexp;
5881
5917
  // Flags control the behavior of the parser and record information about
@@ -6559,7 +6595,7 @@
6559
6595
  case 1:
6560
6596
  // Impossible but handle.
6561
6597
  re.op = Regexp.Op.EMPTY_MATCH;
6562
- re.subs = null;
6598
+ re.subs = Regexp.emptySubs();
6563
6599
  break;
6564
6600
  case 2:
6565
6601
  {
@@ -7010,6 +7046,9 @@
7010
7046
 
7011
7047
  // Handle lookbehinds
7012
7048
  if (re2.lb !== 0) {
7049
+ if (Parser.hasCapture(re1)) {
7050
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CAPTURE_IN_LOOKBEHIND, this.wholeRegexp);
7051
+ }
7013
7052
  if (re2.lb > 0) {
7014
7053
  re2.op = Regexp.Op.PLB;
7015
7054
  } else {