re2js 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.7.0
5
+ * @version v2.8.0
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1415,6 +1415,13 @@
1415
1415
  */
1416
1416
 
1417
1417
  class Matcher {
1418
+ /**
1419
+ * V8 and WebKit have historical hard limits on the number of arguments
1420
+ * that can be passed to a function. We cap replacer arguments to prevent
1421
+ * Call Stack Overflow (DoS) vulnerabilities on patterns with a massive number of capture groups.
1422
+ */
1423
+ static MAX_REPLACER_ARGS = 65535;
1424
+
1418
1425
  /**
1419
1426
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
1420
1427
  * {@link #appendReplacement} as a literal replacement of {@code s}.
@@ -1711,16 +1718,14 @@
1711
1718
  * @private
1712
1719
  */
1713
1720
  genMatch(startByte, anchor) {
1714
- const hasLookbehinds = this.patternInput.re2().prog.numLb > 0;
1715
- const ngroup = hasLookbehinds ? 1 + this.patternGroupCount : 1;
1716
- const res = this.patternInput.re2().matchMachineInput(this.matcherInput, startByte, this.matcherInputLength, anchor, ngroup);
1721
+ const res = this.patternInput.re2().matchMachineInput(this.matcherInput, startByte, this.matcherInputLength, anchor, 1);
1717
1722
  const ok = res[0];
1718
1723
  if (!ok) {
1719
1724
  return false;
1720
1725
  }
1721
1726
  this.groups = res[1];
1722
1727
  this.hasMatch = true;
1723
- this.hasGroups = hasLookbehinds || this.patternGroupCount === 0;
1728
+ this.hasGroups = this.patternGroupCount === 0;
1724
1729
  this.anchorFlag = anchor;
1725
1730
  return true;
1726
1731
  }
@@ -1973,7 +1978,7 @@
1973
1978
  * Returns the input with all matches replaced by {@code replacement}, interpreted as for
1974
1979
  * {@code appendReplacement}.
1975
1980
  *
1976
- * @param {string} replacement - the replacement string
1981
+ * @param {string|Function} replacement - the replacement string or a replacer function
1977
1982
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
1978
1983
  * @returns {string} the input string with the matches replaced
1979
1984
  * @throws IndexOutOfBoundsException if replacement refers to an invalid group and javaMode is true
@@ -1986,7 +1991,7 @@
1986
1991
  * Returns the input with the first match replaced by {@code replacement}, interpreted as for
1987
1992
  * {@code appendReplacement}.
1988
1993
  *
1989
- * @param {string} replacement - the replacement string
1994
+ * @param {string|Function} replacement - the replacement string or a replacer function
1990
1995
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
1991
1996
  * @returns {string} the input string with the first match replaced
1992
1997
  * @throws IndexOutOfBoundsException if replacement refers to an invalid group and javaMode is true
@@ -1997,7 +2002,7 @@
1997
2002
 
1998
2003
  /**
1999
2004
  * Helper: replaceAll/replaceFirst hybrid.
2000
- * @param {string} replacement - the replacement string
2005
+ * @param {string|Function} replacement - the replacement string or a replacer function
2001
2006
  * @param {boolean} [all=true] - replace all matches
2002
2007
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
2003
2008
  * @returns {string}
@@ -2006,8 +2011,21 @@
2006
2011
  replace(replacement, all = true, javaMode = false) {
2007
2012
  let res = '';
2008
2013
  this.reset();
2014
+ const isFunc = typeof replacement === 'function';
2015
+
2016
+ // Cache named groups check to avoid GC thrashing on every match
2017
+ const hasNamedGroups = Object.keys(this.namedGroups).length > 0;
2018
+ let originalInput = null;
2019
+ if (isFunc) {
2020
+ // Prevent V8 Call Stack Overflow (DoS vector) on massive capture group counts
2021
+ if (this.groupCount() >= Matcher.MAX_REPLACER_ARGS) {
2022
+ throw new RE2JSGroupException('Too many capture groups to safely invoke replacer function');
2023
+ }
2024
+ // Resolve the original input reference exactly once outside the hot loop
2025
+ originalInput = this.matcherInput.isUTF8Encoding() ? this.matcherInput.asBytes() : this.matcherInput.asCharSequence();
2026
+ }
2009
2027
  while (this.find()) {
2010
- res += this.appendReplacement(replacement, javaMode);
2028
+ res += isFunc ? this.appendReplacementFunc(replacement, hasNamedGroups, originalInput) : this.appendReplacement(replacement, javaMode);
2011
2029
  if (!all) {
2012
2030
  break;
2013
2031
  }
@@ -2015,6 +2033,66 @@
2015
2033
  res += this.appendTail();
2016
2034
  return res;
2017
2035
  }
2036
+
2037
+ /**
2038
+ * Evaluates a replacer function for the current match and appends the result,
2039
+ * along with any un-matched preceding text, advancing the append position.
2040
+ * @param {Function} replacer - the replacer function
2041
+ * @param {boolean} hasNamedGroups - cached flag if pattern has named groups
2042
+ * @param {string|Uint8Array|number[]} originalInput - the cached original input reference
2043
+ * @returns {string} the evaluated string to append
2044
+ * @private
2045
+ */
2046
+ appendReplacementFunc(replacer, hasNamedGroups, originalInput) {
2047
+ let res = '';
2048
+ const s = this.start();
2049
+ const e = this.end();
2050
+ if (this.appendPos < s) {
2051
+ res += this.substring(this.appendPos, s);
2052
+ }
2053
+ this.appendPos = e;
2054
+ const args = this.buildReplacerArgs(s, hasNamedGroups, originalInput);
2055
+ res += String(replacer(...args));
2056
+ return res;
2057
+ }
2058
+
2059
+ /**
2060
+ * Builds the argument array for the replacer function matching the standard
2061
+ * JS String.prototype.replace(regex, replacer) signature.
2062
+ * @param {number} matchStart - the start index of the match
2063
+ * @param {boolean} hasNamedGroups - cached flag if pattern has named groups
2064
+ * @param {string|Uint8Array|number[]} originalInput - the cached original input reference
2065
+ * @returns {Array} array of arguments
2066
+ * @private
2067
+ */
2068
+ buildReplacerArgs(matchStart, hasNamedGroups, originalInput) {
2069
+ const args = [this.group(0)]; // match
2070
+
2071
+ const numGroups = this.groupCount();
2072
+ // Fast-path capture group extraction
2073
+ for (let i = 1; i <= numGroups; i++) {
2074
+ const start = this.start(i);
2075
+ if (start < 0) {
2076
+ args.push(void 0);
2077
+ } else {
2078
+ args.push(this.substring(start, this.end(i)));
2079
+ }
2080
+ }
2081
+ args.push(matchStart); // offset
2082
+ args.push(originalInput); // original string (cached)
2083
+
2084
+ // Append named groups object if pattern contains them
2085
+ if (hasNamedGroups) {
2086
+ const parsedGroups = this.getNamedGroups();
2087
+ for (const key in parsedGroups) {
2088
+ if (parsedGroups[key] === null) {
2089
+ parsedGroups[key] = void 0;
2090
+ }
2091
+ }
2092
+ args.push(parsedGroups);
2093
+ }
2094
+ return args;
2095
+ }
2018
2096
  }
2019
2097
 
2020
2098
  /**
@@ -2341,30 +2419,35 @@
2341
2419
  }
2342
2420
  this.matched = false;
2343
2421
  this.matchcap.fill(-1);
2422
+
2423
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2424
+ // even if the main pattern search is requested to start mid-string.
2425
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2426
+ let matchStartPos = pos;
2344
2427
  let runq = this.q0;
2345
2428
  let nextq = this.q1;
2346
- let r = input.step(pos);
2429
+ let r = input.step(currentPos);
2347
2430
  let rune = r >> 3;
2348
2431
  let width = r & 7;
2349
2432
  let rune1 = -1;
2350
2433
  let width1 = 0;
2351
2434
  if (r !== MachineInputBase.EOF()) {
2352
- r = input.step(pos + width);
2435
+ r = input.step(currentPos + width);
2353
2436
  rune1 = r >> 3;
2354
2437
  width1 = r & 7;
2355
2438
  }
2356
2439
  let flag;
2357
- if (pos === 0) {
2440
+ if (currentPos === 0) {
2358
2441
  flag = Utils.emptyOpContext(-1, rune);
2359
2442
  } else {
2360
- flag = input.context(pos);
2443
+ flag = input.context(currentPos);
2361
2444
  }
2362
2445
  while (true) {
2363
2446
  if (runq.isEmpty()) {
2364
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
2447
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) {
2365
2448
  break;
2366
2449
  }
2367
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2450
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2368
2451
  break;
2369
2452
  }
2370
2453
  if (this.matched) {
@@ -2374,43 +2457,50 @@
2374
2457
  // Fast-forwarding the string pointer will skip over the positions where
2375
2458
  // the parallel lookbehind automata need to be spawned.
2376
2459
  if (this.prog.numLb === 0 && !(this.re2.prefix.length === 0) && rune1 !== this.re2.prefixRune && input.canCheckPrefix()) {
2377
- const advance = input.index(this.re2, pos);
2460
+ const advance = input.index(this.re2, currentPos);
2378
2461
  if (advance < 0) {
2379
2462
  break;
2380
2463
  }
2381
- pos += advance;
2382
- r = input.step(pos);
2464
+ currentPos += advance;
2465
+ r = input.step(currentPos);
2383
2466
  rune = r >> 3;
2384
2467
  width = r & 7;
2385
- r = input.step(pos + width);
2468
+ r = input.step(currentPos + width);
2386
2469
  rune1 = r >> 3;
2387
2470
  width1 = r & 7;
2388
2471
  }
2389
2472
  }
2390
- if (!this.matched && (pos === 0 || anchor === RE2Flags.UNANCHORED)) {
2391
- if (this.ncap > 0) {
2392
- this.matchcap[0] = pos;
2393
- }
2394
- // Spawn Lookbehind threads BEFORE the main pattern
2473
+
2474
+ // Optimize lookbehind spawning. Because lookbehinds are prefixed with `.*` by the compiler,
2475
+ // they only need to be spawned exactly once at the beginning of the string (currentPos === 0).
2476
+ if (currentPos === 0 && this.prog.numLb > 0) {
2395
2477
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2396
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2478
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2479
+ }
2480
+ }
2481
+ if (!this.matched && (currentPos === 0 || anchor === RE2Flags.UNANCHORED)) {
2482
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2483
+ if (currentPos >= matchStartPos) {
2484
+ if (this.ncap > 0) {
2485
+ this.matchcap[0] = currentPos;
2486
+ }
2487
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2397
2488
  }
2398
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2399
2489
  }
2400
- const nextPos = pos + width;
2490
+ const nextPos = currentPos + width;
2401
2491
  flag = input.context(nextPos);
2402
- this.step(runq, nextq, pos, nextPos, rune, flag, anchor, pos === input.endPos());
2492
+ this.step(runq, nextq, currentPos, nextPos, rune, flag, anchor, currentPos === input.endPos());
2403
2493
  if (width === 0) {
2404
2494
  break;
2405
2495
  }
2406
2496
  if (this.ncap === 0 && this.matched) {
2407
2497
  break;
2408
2498
  }
2409
- pos += width;
2499
+ currentPos += width;
2410
2500
  rune = rune1;
2411
2501
  width = width1;
2412
2502
  if (rune !== -1) {
2413
- r = input.step(pos + width);
2503
+ r = input.step(currentPos + width);
2414
2504
  rune1 = r >> 3;
2415
2505
  width1 = r & 7;
2416
2506
  }
@@ -2427,35 +2517,46 @@
2427
2517
  if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2428
2518
  return [];
2429
2519
  }
2520
+
2521
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2522
+ // even if the main pattern search is requested to start mid-string.
2523
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2524
+ let matchStartPos = pos;
2430
2525
  let runq = this.q0;
2431
2526
  let nextq = this.q1;
2432
- let r = input.step(pos);
2527
+ let r = input.step(currentPos);
2433
2528
  let rune = r >> 3;
2434
2529
  let width = r & 7;
2435
2530
  let rune1 = -1;
2436
2531
  let width1 = 0;
2437
2532
  if (r !== MachineInputBase.EOF()) {
2438
- r = input.step(pos + width);
2533
+ r = input.step(currentPos + width);
2439
2534
  rune1 = r >> 3;
2440
2535
  width1 = r & 7;
2441
2536
  }
2442
- let flag = pos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(pos);
2537
+ let flag = currentPos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(currentPos);
2443
2538
  const matches = new Set();
2444
2539
  while (true) {
2445
2540
  if (runq.isEmpty()) {
2446
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
2447
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2541
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) break;
2542
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2448
2543
  break;
2449
2544
  }
2450
2545
  }
2451
- if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
2452
- // Spawn Lookbehind threads BEFORE the main pattern
2546
+
2547
+ // Optimize lookbehind spawning to exactly once at BOF
2548
+ if (currentPos === 0 && this.prog.numLb > 0) {
2453
2549
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2454
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2550
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2455
2551
  }
2456
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2457
2552
  }
2458
- const nextPos = pos + width;
2553
+ if (currentPos === 0 || anchor === RE2Flags.UNANCHORED) {
2554
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2555
+ if (currentPos >= matchStartPos) {
2556
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2557
+ }
2558
+ }
2559
+ const nextPos = currentPos + width;
2459
2560
  flag = input.context(nextPos);
2460
2561
  for (let j = 0; j < runq.size; j++) {
2461
2562
  let t = runq.denseThreads[j];
@@ -2464,7 +2565,7 @@
2464
2565
  let add = false;
2465
2566
  switch (i.op) {
2466
2567
  case Inst.MATCH:
2467
- if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) break;
2568
+ if (anchor === RE2Flags.ANCHOR_BOTH && currentPos !== input.endPos()) break;
2468
2569
  matches.add(i.arg); // Record the matched Set ID
2469
2570
  break;
2470
2571
  case Inst.RUNE:
@@ -2492,11 +2593,11 @@
2492
2593
  }
2493
2594
  runq.clear();
2494
2595
  if (width === 0) break;
2495
- pos += width;
2596
+ currentPos += width;
2496
2597
  rune = rune1;
2497
2598
  width = width1;
2498
2599
  if (rune !== -1) {
2499
- r = input.step(pos + width);
2600
+ r = input.step(currentPos + width);
2500
2601
  rune1 = r >> 3;
2501
2602
  width1 = r & 7;
2502
2603
  }