re2js 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -513,7 +513,7 @@ RE2JS.compile('(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)')
513
513
  Note that the replacement string can include references to capturing groups from the pattern
514
514
 
515
515
  Parameters:
516
- - `replacement (String)`: The string that replaces the substrings found. Capture groups and special characters in the replacement string have special behavior. For example:
516
+ - `replacement (String | Function)`: The string that replaces the substrings found, or a function invoked to create the new substring. When passing a string, capture groups and special characters have special behavior. For example:
517
517
  - `$&` refers to the entire matched substring
518
518
  - `$1, $2, ...` refer to the corresponding capture groups in the pattern
519
519
  - `$$` inserts a literal `$`
@@ -556,7 +556,42 @@ RE2JS.compile('(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)')
556
556
  .replaceFirst('$10$20') // 'jb0nopqrstuvwxyz123'
557
557
  ```
558
558
 
559
- Function support second argument `javaMode`, which work in the same way, as for `replaceAll` function
559
+ The function supports a second argument, `javaMode`, which works the same way as it does for `replaceAll`.
560
+
561
+ #### Using a Replacer Function
562
+
563
+ For a more modern JavaScript developer experience, RE2JS supports passing a **replacer function** to `replaceAll()` and `replaceFirst()`, perfectly mirroring native `String.prototype.replace(regex, replacer)` behavior while taking advantage of the high-speed linear-time engine.
564
+
565
+ The replacer function is invoked for each match, and its return value is used as the replacement string. The function receives the following arguments:
566
+
567
+ 1. `match`: The matched substring.
568
+ 2. `p1, p2, ...`: The string found by a capture group (if any). Unmatched optional groups evaluate to `undefined`.
569
+ 3. `offset`: The offset of the matched substring within the whole string.
570
+ 4. `string`: The original input string (or byte array).
571
+ 5. `groups`: A dictionary object of named capture groups (if any exist in the pattern).
572
+
573
+ ```js
574
+ import { RE2JS } from 're2js'
575
+
576
+ // Example 1: Dynamic replacements
577
+ const re1 = RE2JS.compile('\\d+');
578
+ const m1 = re1.matcher('Numbers: 1, 2, 3');
579
+
580
+ m1.replaceAll((match) => String(Number(match) * 10));
581
+ // 'Numbers: 10, 20, 30'
582
+
583
+
584
+ // Example 2: Using named capture groups and function signature
585
+ const re2 = RE2JS.compile('(?P<first>\\w+) (?:(?P<middle>\\w+) )?(?P<last>\\w+)');
586
+ const m2 = re2.matcher('Hello World');
587
+
588
+ m2.replaceFirst((match, p1, p2, p3, offset, string, groups) => {
589
+ // 'middle' didn't match, so p2 and groups.middle will be undefined
590
+ return `${groups.last}, ${groups.first}`;
591
+ });
592
+ // 'World, Hello'
593
+
594
+ ```
560
595
 
561
596
  ### Safe Replacements
562
597
 
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.7.0
5
+ * @version v2.8.0
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1411,6 +1411,13 @@ class RE2JSInternalException extends RE2JSException {
1411
1411
  */
1412
1412
 
1413
1413
  class Matcher {
1414
+ /**
1415
+ * V8 and WebKit have historical hard limits on the number of arguments
1416
+ * that can be passed to a function. We cap replacer arguments to prevent
1417
+ * Call Stack Overflow (DoS) vulnerabilities on patterns with massive capture group counts.
1418
+ */
1419
+ static MAX_REPLACER_ARGS = 65535;
1420
+
1414
1421
  /**
1415
1422
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
1416
1423
  * {@link #appendReplacement} as a literal replacement of {@code s}.
@@ -1707,16 +1714,14 @@ class Matcher {
1707
1714
  * @private
1708
1715
  */
1709
1716
  genMatch(startByte, anchor) {
1710
- const hasLookbehinds = this.patternInput.re2().prog.numLb > 0;
1711
- const ngroup = hasLookbehinds ? 1 + this.patternGroupCount : 1;
1712
- const res = this.patternInput.re2().matchMachineInput(this.matcherInput, startByte, this.matcherInputLength, anchor, ngroup);
1717
+ const res = this.patternInput.re2().matchMachineInput(this.matcherInput, startByte, this.matcherInputLength, anchor, 1);
1713
1718
  const ok = res[0];
1714
1719
  if (!ok) {
1715
1720
  return false;
1716
1721
  }
1717
1722
  this.groups = res[1];
1718
1723
  this.hasMatch = true;
1719
- this.hasGroups = hasLookbehinds || this.patternGroupCount === 0;
1724
+ this.hasGroups = this.patternGroupCount === 0;
1720
1725
  this.anchorFlag = anchor;
1721
1726
  return true;
1722
1727
  }
@@ -1969,7 +1974,7 @@ class Matcher {
1969
1974
  * Returns the input with all matches replaced by {@code replacement}, interpreted as for
1970
1975
  * {@code appendReplacement}.
1971
1976
  *
1972
- * @param {string} replacement - the replacement string
1977
+ * @param {string|Function} replacement - the replacement string or a replacer function
1973
1978
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
1974
1979
  * @returns {string} the input string with the matches replaced
1975
1980
  * @throws IndexOutOfBoundsException if replacement refers to an invalid group and javaMode is true
@@ -1982,7 +1987,7 @@ class Matcher {
1982
1987
  * Returns the input with the first match replaced by {@code replacement}, interpreted as for
1983
1988
  * {@code appendReplacement}.
1984
1989
  *
1985
- * @param {string} replacement - the replacement string
1990
+ * @param {string|Function} replacement - the replacement string or a replacer function
1986
1991
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
1987
1992
  * @returns {string} the input string with the first match replaced
1988
1993
  * @throws IndexOutOfBoundsException if replacement refers to an invalid group and javaMode is true
@@ -1993,7 +1998,7 @@ class Matcher {
1993
1998
 
1994
1999
  /**
1995
2000
  * Helper: replaceAll/replaceFirst hybrid.
1996
- * @param {string} replacement - the replacement string
2001
+ * @param {string|Function} replacement - the replacement string or a replacer function
1997
2002
  * @param {boolean} [all=true] - replace all matches
1998
2003
  * @param {boolean} [javaMode=false] - activate java mode (different behaviour for capture groups and special characters)
1999
2004
  * @returns {string}
@@ -2002,8 +2007,21 @@ class Matcher {
2002
2007
  replace(replacement, all = true, javaMode = false) {
2003
2008
  let res = '';
2004
2009
  this.reset();
2010
+ const isFunc = typeof replacement === 'function';
2011
+
2012
+ // Cache named groups check to avoid GC thrashing on every match
2013
+ const hasNamedGroups = Object.keys(this.namedGroups).length > 0;
2014
+ let originalInput = null;
2015
+ if (isFunc) {
2016
+ // Prevent V8 Call Stack Overflow (DoS vector) on massive capture group counts
2017
+ if (this.groupCount() >= Matcher.MAX_REPLACER_ARGS) {
2018
+ throw new RE2JSGroupException('Too many capture groups to safely invoke replacer function');
2019
+ }
2020
+ // Resolve the original input reference exactly once outside the hot loop
2021
+ originalInput = this.matcherInput.isUTF8Encoding() ? this.matcherInput.asBytes() : this.matcherInput.asCharSequence();
2022
+ }
2005
2023
  while (this.find()) {
2006
- res += this.appendReplacement(replacement, javaMode);
2024
+ res += isFunc ? this.appendReplacementFunc(replacement, hasNamedGroups, originalInput) : this.appendReplacement(replacement, javaMode);
2007
2025
  if (!all) {
2008
2026
  break;
2009
2027
  }
@@ -2011,6 +2029,66 @@ class Matcher {
2011
2029
  res += this.appendTail();
2012
2030
  return res;
2013
2031
  }
2032
+
2033
+ /**
2034
+ * Evaluates a replacer function for the current match and appends the result,
2035
+ * along with any un-matched preceding text, advancing the append position.
2036
+ * @param {Function} replacer - the replacer function
2037
+ * @param {boolean} hasNamedGroups - cached flag if pattern has named groups
2038
+ * @param {string|Uint8Array|number[]} originalInput - the cached original input reference
2039
+ * @returns {string} the evaluated string to append
2040
+ * @private
2041
+ */
2042
+ appendReplacementFunc(replacer, hasNamedGroups, originalInput) {
2043
+ let res = '';
2044
+ const s = this.start();
2045
+ const e = this.end();
2046
+ if (this.appendPos < s) {
2047
+ res += this.substring(this.appendPos, s);
2048
+ }
2049
+ this.appendPos = e;
2050
+ const args = this.buildReplacerArgs(s, hasNamedGroups, originalInput);
2051
+ res += String(replacer(...args));
2052
+ return res;
2053
+ }
2054
+
2055
+ /**
2056
+ * Builds the argument array for the replacer function matching the standard
2057
+ * JS String.prototype.replace(regex, replacer) signature.
2058
+ * @param {number} matchStart - the start index of the match
2059
+ * @param {boolean} hasNamedGroups - cached flag if pattern has named groups
2060
+ * @param {string|Uint8Array|number[]} originalInput - the cached original input reference
2061
+ * @returns {Array} array of arguments
2062
+ * @private
2063
+ */
2064
+ buildReplacerArgs(matchStart, hasNamedGroups, originalInput) {
2065
+ const args = [this.group(0)]; // match
2066
+
2067
+ const numGroups = this.groupCount();
2068
+ // Fast-path capture group extraction
2069
+ for (let i = 1; i <= numGroups; i++) {
2070
+ const start = this.start(i);
2071
+ if (start < 0) {
2072
+ args.push(void 0);
2073
+ } else {
2074
+ args.push(this.substring(start, this.end(i)));
2075
+ }
2076
+ }
2077
+ args.push(matchStart); // offset
2078
+ args.push(originalInput); // original string (cached)
2079
+
2080
+ // Append named groups object if pattern contains them
2081
+ if (hasNamedGroups) {
2082
+ const parsedGroups = this.getNamedGroups();
2083
+ for (const key in parsedGroups) {
2084
+ if (parsedGroups[key] === null) {
2085
+ parsedGroups[key] = void 0;
2086
+ }
2087
+ }
2088
+ args.push(parsedGroups);
2089
+ }
2090
+ return args;
2091
+ }
2014
2092
  }
2015
2093
 
2016
2094
  /**
@@ -2337,30 +2415,35 @@ class Machine {
2337
2415
  }
2338
2416
  this.matched = false;
2339
2417
  this.matchcap.fill(-1);
2418
+
2419
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2420
+ // even if the main pattern search is requested to start mid-string.
2421
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2422
+ let matchStartPos = pos;
2340
2423
  let runq = this.q0;
2341
2424
  let nextq = this.q1;
2342
- let r = input.step(pos);
2425
+ let r = input.step(currentPos);
2343
2426
  let rune = r >> 3;
2344
2427
  let width = r & 7;
2345
2428
  let rune1 = -1;
2346
2429
  let width1 = 0;
2347
2430
  if (r !== MachineInputBase.EOF()) {
2348
- r = input.step(pos + width);
2431
+ r = input.step(currentPos + width);
2349
2432
  rune1 = r >> 3;
2350
2433
  width1 = r & 7;
2351
2434
  }
2352
2435
  let flag;
2353
- if (pos === 0) {
2436
+ if (currentPos === 0) {
2354
2437
  flag = Utils.emptyOpContext(-1, rune);
2355
2438
  } else {
2356
- flag = input.context(pos);
2439
+ flag = input.context(currentPos);
2357
2440
  }
2358
2441
  while (true) {
2359
2442
  if (runq.isEmpty()) {
2360
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) {
2443
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) {
2361
2444
  break;
2362
2445
  }
2363
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2446
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2364
2447
  break;
2365
2448
  }
2366
2449
  if (this.matched) {
@@ -2370,43 +2453,50 @@ class Machine {
2370
2453
  // Fast-forwarding the string pointer will skip over the positions where
2371
2454
  // the parallel lookbehind automata need to be spawned.
2372
2455
  if (this.prog.numLb === 0 && !(this.re2.prefix.length === 0) && rune1 !== this.re2.prefixRune && input.canCheckPrefix()) {
2373
- const advance = input.index(this.re2, pos);
2456
+ const advance = input.index(this.re2, currentPos);
2374
2457
  if (advance < 0) {
2375
2458
  break;
2376
2459
  }
2377
- pos += advance;
2378
- r = input.step(pos);
2460
+ currentPos += advance;
2461
+ r = input.step(currentPos);
2379
2462
  rune = r >> 3;
2380
2463
  width = r & 7;
2381
- r = input.step(pos + width);
2464
+ r = input.step(currentPos + width);
2382
2465
  rune1 = r >> 3;
2383
2466
  width1 = r & 7;
2384
2467
  }
2385
2468
  }
2386
- if (!this.matched && (pos === 0 || anchor === RE2Flags.UNANCHORED)) {
2387
- if (this.ncap > 0) {
2388
- this.matchcap[0] = pos;
2389
- }
2390
- // Spawn Lookbehind threads BEFORE the main pattern
2469
+
2470
+ // Optimize lookbehind spawning. Because lookbehinds are prefixed with `.*` by the compiler,
2471
+ // they only need to be spawned exactly once at the beginning of the string (currentPos === 0).
2472
+ if (currentPos === 0 && this.prog.numLb > 0) {
2391
2473
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2392
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2474
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2475
+ }
2476
+ }
2477
+ if (!this.matched && (currentPos === 0 || anchor === RE2Flags.UNANCHORED)) {
2478
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2479
+ if (currentPos >= matchStartPos) {
2480
+ if (this.ncap > 0) {
2481
+ this.matchcap[0] = currentPos;
2482
+ }
2483
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2393
2484
  }
2394
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2395
2485
  }
2396
- const nextPos = pos + width;
2486
+ const nextPos = currentPos + width;
2397
2487
  flag = input.context(nextPos);
2398
- this.step(runq, nextq, pos, nextPos, rune, flag, anchor, pos === input.endPos());
2488
+ this.step(runq, nextq, currentPos, nextPos, rune, flag, anchor, currentPos === input.endPos());
2399
2489
  if (width === 0) {
2400
2490
  break;
2401
2491
  }
2402
2492
  if (this.ncap === 0 && this.matched) {
2403
2493
  break;
2404
2494
  }
2405
- pos += width;
2495
+ currentPos += width;
2406
2496
  rune = rune1;
2407
2497
  width = width1;
2408
2498
  if (rune !== -1) {
2409
- r = input.step(pos + width);
2499
+ r = input.step(currentPos + width);
2410
2500
  rune1 = r >> 3;
2411
2501
  width1 = r & 7;
2412
2502
  }
@@ -2423,35 +2513,46 @@ class Machine {
2423
2513
  if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2424
2514
  return [];
2425
2515
  }
2516
+
2517
+ // Lookbehinds must scan from the beginning of the string to build their state table,
2518
+ // even if the main pattern search is requested to start mid-string.
2519
+ let currentPos = this.prog.numLb > 0 ? 0 : pos;
2520
+ let matchStartPos = pos;
2426
2521
  let runq = this.q0;
2427
2522
  let nextq = this.q1;
2428
- let r = input.step(pos);
2523
+ let r = input.step(currentPos);
2429
2524
  let rune = r >> 3;
2430
2525
  let width = r & 7;
2431
2526
  let rune1 = -1;
2432
2527
  let width1 = 0;
2433
2528
  if (r !== MachineInputBase.EOF()) {
2434
- r = input.step(pos + width);
2529
+ r = input.step(currentPos + width);
2435
2530
  rune1 = r >> 3;
2436
2531
  width1 = r & 7;
2437
2532
  }
2438
- let flag = pos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(pos);
2533
+ let flag = currentPos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(currentPos);
2439
2534
  const matches = new Set();
2440
2535
  while (true) {
2441
2536
  if (runq.isEmpty()) {
2442
- if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && pos !== 0) break;
2443
- if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2537
+ if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) break;
2538
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
2444
2539
  break;
2445
2540
  }
2446
2541
  }
2447
- if (pos === 0 || anchor === RE2Flags.UNANCHORED) {
2448
- // Spawn Lookbehind threads BEFORE the main pattern
2542
+
2543
+ // Optimize lookbehind spawning to exactly once at BOF
2544
+ if (currentPos === 0 && this.prog.numLb > 0) {
2449
2545
  for (let i = 0; i < this.prog.lbStarts.length; i++) {
2450
- this.add(runq, this.prog.lbStarts[i], pos, this.matchcap, flag, null);
2546
+ this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
2451
2547
  }
2452
- this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
2453
2548
  }
2454
- const nextPos = pos + width;
2549
+ if (currentPos === 0 || anchor === RE2Flags.UNANCHORED) {
2550
+ // ONLY spawn the main pattern if we have reached the requested search start boundary
2551
+ if (currentPos >= matchStartPos) {
2552
+ this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
2553
+ }
2554
+ }
2555
+ const nextPos = currentPos + width;
2455
2556
  flag = input.context(nextPos);
2456
2557
  for (let j = 0; j < runq.size; j++) {
2457
2558
  let t = runq.denseThreads[j];
@@ -2460,7 +2561,7 @@ class Machine {
2460
2561
  let add = false;
2461
2562
  switch (i.op) {
2462
2563
  case Inst.MATCH:
2463
- if (anchor === RE2Flags.ANCHOR_BOTH && pos !== input.endPos()) break;
2564
+ if (anchor === RE2Flags.ANCHOR_BOTH && currentPos !== input.endPos()) break;
2464
2565
  matches.add(i.arg); // Record the matched Set ID
2465
2566
  break;
2466
2567
  case Inst.RUNE:
@@ -2488,11 +2589,11 @@ class Machine {
2488
2589
  }
2489
2590
  runq.clear();
2490
2591
  if (width === 0) break;
2491
- pos += width;
2592
+ currentPos += width;
2492
2593
  rune = rune1;
2493
2594
  width = width1;
2494
2595
  if (rune !== -1) {
2495
- r = input.step(pos + width);
2596
+ r = input.step(currentPos + width);
2496
2597
  rune1 = r >> 3;
2497
2598
  width1 = r & 7;
2498
2599
  }