re2js 2.2.2 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -473,6 +473,8 @@ Parameters:
473
473
  - `$1, $2, ...` refer to the corresponding capture groups in the pattern
474
474
  - `$$` inserts a literal `$`
475
475
  - `$<name>` can be used to reference named capture groups
476
+ - `` $` `` inserts the portion of the string that precedes the matched substring
477
+ - `$'` inserts the portion of the string that follows the matched substring
476
478
  - on invalid group - ignore it
477
479
  - `javaMode (Boolean)`: If set to `true`, the replacement follows Java's rules for replacement. Defaults to `false`. When `javaMode = true`, the rules for capture groups and special characters change as follows:
478
480
  - `$0` refers to the entire matched substring
@@ -575,6 +577,10 @@ const unicodeRegexp = RE2JS.translateRegExp('\\u{1F600}') // '\\x{1F600}'
575
577
 
576
578
  RE2JS.matches(unicodeRegexp, '😀') // true
577
579
  RE2JS.matches(unicodeRegexp, '😃') // false
580
+
581
+ // also supports native RegExp objects
582
+ RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
583
+ RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
578
584
  ```
579
585
 
580
586
  ## Performance and Architecture
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.2
5
+ * @version v2.3.0
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -96,7 +96,7 @@ for (let i = 0; i < ASCII_SIZE; i++) {
96
96
  }
97
97
  class Codepoint {
98
98
  // codePointAt(0)
99
- static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
99
+ static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ["'", 39], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['`', 96], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
100
100
 
101
101
  // convert unicode codepoint to upper case codepoint
102
102
  // returns the same codepoint if it cannot be converted (or the codepoint has no upper case variant)
@@ -150,10 +150,6 @@ class UnicodeRangeTable {
150
150
  getStride(index) {
151
151
  return this.isStride1 ? 1 : this.data[index * this.SIZE + 2];
152
152
  }
153
- get(index) {
154
- const i = index * this.SIZE;
155
- return [this.data[i], this.data[i + 1], this.getStride(index)];
156
- }
157
153
  get length() {
158
154
  return this.data.length / this.SIZE;
159
155
  }
@@ -650,6 +646,9 @@ class Utils {
650
646
  static emptyInts() {
651
647
  return [];
652
648
  }
649
+ static isByteArray(input) {
650
+ return Array.isArray(input) || input instanceof Uint8Array;
651
+ }
653
652
 
654
653
  // Returns true iff |c| is an ASCII letter or decimal digit.
655
654
  static isalnum(c) {
@@ -951,7 +950,7 @@ class Utf16MatcherInput extends MatcherInputBase {
951
950
  * @returns {number[]}
952
951
  */
953
952
  asBytes() {
954
- return this.charSequence.toString().split('').map(s => s.codePointAt(0));
953
+ return Utils.stringToUtf8ByteArray(this.charSequence.toString());
955
954
  }
956
955
 
957
956
  /**
@@ -976,7 +975,7 @@ class MatcherInput {
976
975
  * @returns {Utf8MatcherInput}
977
976
  */
978
977
  static utf8(input) {
979
- if (Array.isArray(input)) {
978
+ if (Utils.isByteArray(input)) {
980
979
  return new Utf8MatcherInput(input);
981
980
  }
982
981
  return new Utf8MatcherInput(Utils.stringToUtf8ByteArray(input));
@@ -1108,10 +1107,10 @@ class MachineUTF8Input extends MachineInputBase {
1108
1107
  if (start < this.start) {
1109
1108
  start = this.start;
1110
1109
  }
1111
- r1 = this.step(start) >> 3;
1110
+ r1 = this.step(start - this.start) >> 3;
1112
1111
  }
1113
1112
  }
1114
- const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
1113
+ const r2 = pos < this.end ? this.step(pos - this.start) >> 3 : -1;
1115
1114
  return Utils.emptyOpContext(r1, r2);
1116
1115
  }
1117
1116
 
@@ -1193,14 +1192,17 @@ class MachineUTF16Input extends MachineInputBase {
1193
1192
  index(re2, pos) {
1194
1193
  pos += this.start;
1195
1194
  const i = this.charSequence.indexOf(re2.prefix, pos);
1196
- return i < 0 ? i : i - pos;
1195
+ if (i < 0 || i > this.end - re2.prefix.length) {
1196
+ return -1;
1197
+ }
1198
+ return i - pos;
1197
1199
  }
1198
1200
 
1199
1201
  // Returns a bitmask of EMPTY_* flags.
1200
1202
  context(pos) {
1201
1203
  pos += this.start;
1202
- const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
1203
- const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
1204
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1205
+ const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1204
1206
  return Utils.emptyOpContext(r1, r2);
1205
1207
  }
1206
1208
  prefixLength(re2) {
@@ -1329,6 +1331,11 @@ class RE2JSInternalException extends RE2JSException {
1329
1331
  *
1330
1332
  * @author rsc@google.com (Russ Cox)
1331
1333
  */
1334
+
1335
+ /**
1336
+ * @typedef {import('./index').RE2JS} RE2JS_Pattern
1337
+ */
1338
+
1332
1339
  class Matcher {
1333
1340
  /**
1334
1341
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1366,14 +1373,17 @@ class Matcher {
1366
1373
  }
1367
1374
  /**
1368
1375
  *
1369
- * @param {RE2JS} pattern
1370
- * @param {Utf8MatcherInput|Utf16MatcherInput|number[]|string} input
1376
+ * @param {RE2JS_Pattern} pattern
1377
+ * @param {Uint8Array|number[]|string} input
1371
1378
  */
1372
1379
  constructor(pattern, input) {
1373
1380
  if (pattern === null) {
1374
1381
  throw new Error('pattern is null');
1375
1382
  }
1376
- // The pattern being matched.
1383
+ /**
1384
+ * The pattern being matched.
1385
+ * @type {RE2JS_Pattern}
1386
+ */
1377
1387
  this.patternInput = pattern;
1378
1388
  const re2 = this.patternInput.re2();
1379
1389
  // The number of submatches (groups) in the pattern.
@@ -1388,7 +1398,7 @@ class Matcher {
1388
1398
  this.numberOfInstructions = re2.numberOfInstructions();
1389
1399
  if (input instanceof MatcherInputBase) {
1390
1400
  this.resetMatcherInput(input);
1391
- } else if (Array.isArray(input)) {
1401
+ } else if (Utils.isByteArray(input)) {
1392
1402
  this.resetMatcherInput(MatcherInput.utf8(input));
1393
1403
  } else {
1394
1404
  this.resetMatcherInput(MatcherInput.utf16(input));
@@ -1397,7 +1407,7 @@ class Matcher {
1397
1407
 
1398
1408
  /**
1399
1409
  * Returns the {@code RE2JS} associated with this {@code Matcher}.
1400
- * @returns {RE2JS}
1410
+ * @returns {RE2JS_Pattern}
1401
1411
  */
1402
1412
  pattern() {
1403
1413
  return this.patternInput;
@@ -1427,7 +1437,7 @@ class Matcher {
1427
1437
 
1428
1438
  /**
1429
1439
  * Resets the {@code Matcher} and changes the input.
1430
- * @param {Utf8MatcherInput|Utf16MatcherInput} input
1440
+ * @param {import('./MatcherInput').MatcherInputBase} input
1431
1441
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1432
1442
  */
1433
1443
  resetMatcherInput(input) {
@@ -1492,7 +1502,7 @@ class Matcher {
1492
1502
  /**
1493
1503
  * Returns the named group of the most recent match, or {@code null} if the group was not matched.
1494
1504
  * @param {string|number} [group=0]
1495
- * @returns {?string}
1505
+ * @returns {string|null}
1496
1506
  */
1497
1507
  group(group = 0) {
1498
1508
  if (typeof group === 'string') {
@@ -1550,10 +1560,7 @@ class Matcher {
1550
1560
  if (group === 0 || this.hasGroups) {
1551
1561
  return;
1552
1562
  }
1553
- let end = this.groups[1] + 1;
1554
- if (end > this.matcherInputLength) {
1555
- end = this.matcherInputLength;
1556
- }
1563
+ const end = this.matcherInputLength;
1557
1564
  const res = this.patternInput.re2().matchMachineInput(this.matcherInput, this.groups[0], end, this.anchorFlag, 1 + this.patternGroupCount);
1558
1565
  const ok = res[0];
1559
1566
  if (!ok) {
@@ -1587,7 +1594,7 @@ class Matcher {
1587
1594
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1588
1595
  * is a match, {@code find} sets the match state to describe it.
1589
1596
  *
1590
- * @param {number} [start=null] the input position where the search begins
1597
+ * @param {number|null} [start=null] the input position where the search begins
1591
1598
  * @returns {boolean} if it finds a match
1592
1599
  * @throws IndexOutOfBoundsException if start is not a valid input position
1593
1600
  */
@@ -1749,7 +1756,10 @@ class Matcher {
1749
1756
  throw new RE2JSGroupException("named capture group is missing trailing '}'");
1750
1757
  }
1751
1758
  const groupName = replacement.substring(i + 1, j);
1752
- res += this.group(groupName);
1759
+ const groupVal = this.group(groupName);
1760
+ if (groupVal !== null) {
1761
+ res += groupVal;
1762
+ }
1753
1763
  last = j + 1;
1754
1764
  i = j;
1755
1765
  continue;
@@ -1795,6 +1805,22 @@ class Matcher {
1795
1805
  i++;
1796
1806
  last = i + 1;
1797
1807
  continue;
1808
+ } else if (Codepoint.CODES.get('`') === c) {
1809
+ if (last < i) {
1810
+ res += replacement.substring(last, i);
1811
+ }
1812
+ res += this.substring(0, this.start(0));
1813
+ i++;
1814
+ last = i + 1;
1815
+ continue;
1816
+ } else if (Codepoint.CODES.get("'") === c) {
1817
+ if (last < i) {
1818
+ res += replacement.substring(last, i);
1819
+ }
1820
+ res += this.substring(this.end(0), this.matcherInputLength);
1821
+ i++;
1822
+ last = i + 1;
1823
+ continue;
1798
1824
  } else if (Codepoint.CODES.get('1') <= c && c <= Codepoint.CODES.get('9')) {
1799
1825
  let n = c - Codepoint.CODES.get('0');
1800
1826
  if (last < i) {
@@ -1837,7 +1863,10 @@ class Matcher {
1837
1863
  }
1838
1864
  const groupName = replacement.substring(i + 1, j);
1839
1865
  if (Object.prototype.hasOwnProperty.call(this.namedGroups, groupName)) {
1840
- res += this.group(groupName);
1866
+ const groupVal = this.group(groupName);
1867
+ if (groupVal !== null) {
1868
+ res += groupVal;
1869
+ }
1841
1870
  } else {
1842
1871
  res += `$<${groupName}>`;
1843
1872
  }
@@ -4339,13 +4368,6 @@ class Prog {
4339
4368
  // start every program with a fail instruction, so we'll never want to point
4340
4369
  // at its output link.
4341
4370
 
4342
- next(l) {
4343
- const i = this.inst[l >> 1];
4344
- if ((l & 1) === 0) {
4345
- return i.out;
4346
- }
4347
- return i.arg;
4348
- }
4349
4371
  patch(l, val) {
4350
4372
  let head = l.head;
4351
4373
  while (head !== 0) {
@@ -5675,6 +5697,7 @@ class Parser {
5675
5697
  case Codepoint.CODES.get('6'):
5676
5698
  case Codepoint.CODES.get('7'):
5677
5699
  {
5700
+ // Single non-zero digit is a backreference; not supported
5678
5701
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
5679
5702
  break;
5680
5703
  }
@@ -5682,6 +5705,7 @@ class Parser {
5682
5705
  // eslint-disable-next-line no-fallthrough
5683
5706
  case Codepoint.CODES.get('0'):
5684
5707
  {
5708
+ // Consume up to three octal digits; already have one.
5685
5709
  let r = c - Codepoint.CODES.get('0');
5686
5710
  for (let i = 1; i < 3; i++) {
5687
5711
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
@@ -7462,7 +7486,11 @@ class RE2 {
7462
7486
  */
7463
7487
  matchWithGroup(input, start, end, anchor, ngroup) {
7464
7488
  if (!(input instanceof MatcherInputBase)) {
7465
- input = MatcherInput.utf16(input);
7489
+ if (Utils.isByteArray(input)) {
7490
+ input = MatcherInput.utf8(input);
7491
+ } else {
7492
+ input = MatcherInput.utf16(input);
7493
+ }
7466
7494
  }
7467
7495
  return this.matchMachineInput(input, start, end, anchor, ngroup);
7468
7496
  }
@@ -7917,9 +7945,18 @@ class RE2 {
7917
7945
  }
7918
7946
 
7919
7947
  class RE2Set {
7948
+ /** @type {number} */
7920
7949
  static UNANCHORED = RE2Flags.UNANCHORED;
7950
+ /** @type {number} */
7921
7951
  static ANCHOR_START = RE2Flags.ANCHOR_START;
7952
+ /** @type {number} */
7922
7953
  static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
7954
+
7955
+ /**
7956
+ * Constructs a new RE2Set with the specified anchor mode and flags.
7957
+ * @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
7958
+ * @param {number} [flags=0] - The public flags to apply to all patterns in the set.
7959
+ */
7923
7960
  constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
7924
7961
  this.anchor = anchor;
7925
7962
  this.jsFlags = flags;
@@ -7936,6 +7973,14 @@ class RE2Set {
7936
7973
  this.dfa = null;
7937
7974
  this.dummyRe2 = null;
7938
7975
  }
7976
+
7977
+ /**
7978
+ * Adds a new regular expression pattern to the set.
7979
+ * Patterns cannot be added after the set has been compiled.
7980
+ * @param {string} pattern - The regular expression pattern to add.
7981
+ * @returns {number} The integer index assigned to the added pattern.
7982
+ * @throws {RE2JSCompileException} If patterns are added after compilation.
7983
+ */
7939
7984
  add(pattern) {
7940
7985
  if (this.prog) {
7941
7986
  throw new RE2JSCompileException('Cannot add patterns after compile');
@@ -7954,6 +7999,12 @@ class RE2Set {
7954
7999
  this.regexps.push(Simplify.simplify(re));
7955
8000
  return this.regexps.length - 1;
7956
8001
  }
8002
+
8003
+ /**
8004
+ * Compiles the added patterns into a single state machine.
8005
+ * This is automatically called on the first match if not called explicitly.
8006
+ * @returns {void}
8007
+ */
7957
8008
  compile() {
7958
8009
  if (this.prog) return;
7959
8010
  this.prog = Compiler.compileSet(this.regexps);
@@ -7966,9 +8017,15 @@ class RE2Set {
7966
8017
  longest: false
7967
8018
  };
7968
8019
  }
8020
+
8021
+ /**
8022
+ * Matches the input against the compiled set of regular expressions.
8023
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
8024
+ * @returns {number[]} An array of indices representing the patterns that successfully matched the input.
8025
+ */
7969
8026
  match(input) {
7970
8027
  if (!this.prog) this.compile();
7971
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8028
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
7972
8029
  let internalAnchor = RE2Flags.UNANCHORED;
7973
8030
  if (this.anchor === RE2Set.ANCHOR_START) {
7974
8031
  internalAnchor = RE2Flags.ANCHOR_START;
@@ -7991,13 +8048,19 @@ class RE2Set {
7991
8048
  * Transform JS regex string to RE2 regex string
7992
8049
  */
7993
8050
  class TranslateRegExpString {
7994
- static isUpperCaseAlpha(ch) {
7995
- return 'A' <= ch && ch <= 'Z';
7996
- }
7997
8051
  static isHexadecimal(ch) {
7998
8052
  return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
7999
8053
  }
8000
8054
  static translate(data) {
8055
+ let prefixFlags = '';
8056
+ if (data instanceof RegExp) {
8057
+ if (data.ignoreCase) prefixFlags += 'i';
8058
+ if (data.multiline) prefixFlags += 'm';
8059
+ if (data.dotAll) prefixFlags += 's';
8060
+
8061
+ // execution flags ('g', 'y') are safely ignored here.
8062
+ data = data.source;
8063
+ }
8001
8064
  if (typeof data !== 'string') {
8002
8065
  return data;
8003
8066
  }
@@ -8008,6 +8071,7 @@ class TranslateRegExpString {
8008
8071
  result = '(?:)';
8009
8072
  changed = true;
8010
8073
  }
8074
+ let inCharClass = false;
8011
8075
  let i = 0;
8012
8076
  while (i < size) {
8013
8077
  let ch = data[i];
@@ -8025,54 +8089,160 @@ class TranslateRegExpString {
8025
8089
  {
8026
8090
  if (i + 2 < size) {
8027
8091
  let nextCh = data[i + 2];
8028
- if (TranslateRegExpString.isUpperCaseAlpha(nextCh)) {
8092
+ let code = nextCh.charCodeAt(0);
8093
+ if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
8094
+ let val = code % 32;
8029
8095
  result += '\\x';
8030
- result += (nextCh.charCodeAt(0) - 64 >> 4).toString(16).toUpperCase();
8031
- result += (nextCh.charCodeAt(0) - 64 & 15).toString(16).toUpperCase();
8096
+ result += (val >> 4).toString(16).toUpperCase();
8097
+ result += (val & 15).toString(16).toUpperCase();
8032
8098
  i += 3;
8033
8099
  changed = true;
8034
8100
  continue;
8035
8101
  }
8036
8102
  }
8037
- result += '\\c';
8103
+ result += 'c';
8038
8104
  i += 2;
8105
+ changed = true;
8039
8106
  continue;
8040
8107
  }
8041
8108
  case 'u':
8042
8109
  {
8043
8110
  if (i + 2 < size) {
8044
8111
  let nextCh = data[i + 2];
8045
- if (TranslateRegExpString.isHexadecimal(nextCh)) {
8046
- result += '\\x{' + nextCh;
8047
- i += 3;
8048
- for (let j = 0; j < 3 && i < size; ++i, ++j) {
8049
- nextCh = data[i];
8050
- if (!TranslateRegExpString.isHexadecimal(nextCh)) {
8112
+ if (nextCh === '{') {
8113
+ // Must have a closing brace and at least one valid hex digit inside
8114
+ let j = i + 3;
8115
+ let hasHex = false;
8116
+ let closed = false;
8117
+ while (j < size) {
8118
+ const hexChar = data[j];
8119
+ if (hexChar === '}') {
8120
+ closed = true;
8121
+ break;
8122
+ }
8123
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8051
8124
  break;
8052
8125
  }
8053
- result += nextCh;
8126
+ hasHex = true;
8127
+ j++;
8128
+ }
8129
+ if (closed && hasHex) {
8130
+ result += '\\x';
8131
+ i += 2;
8132
+ changed = true;
8133
+ continue;
8134
+ }
8135
+ } else if (i + 5 < size) {
8136
+ let isHex4 = true;
8137
+ for (let j = 0; j < 4; j++) {
8138
+ if (!TranslateRegExpString.isHexadecimal(data[i + 2 + j])) {
8139
+ isHex4 = false;
8140
+ break;
8141
+ }
8142
+ }
8143
+ if (isHex4) {
8144
+ result += '\\x{' + data.substring(i + 2, i + 6) + '}';
8145
+ i += 6;
8146
+ changed = true;
8147
+ continue;
8054
8148
  }
8055
- result += '}';
8056
- changed = true;
8057
- continue;
8058
- } else if (nextCh === '{') {
8059
- result += '\\x';
8060
- i += 2;
8061
- changed = true;
8062
- continue;
8063
8149
  }
8064
8150
  }
8065
- result += '\\u';
8151
+
8152
+ // Graceful degradation for invalid/unclosed \u sequences
8153
+ result += 'u';
8154
+ i += 2;
8155
+ changed = true;
8156
+ continue;
8157
+ }
8158
+ case 'x':
8159
+ {
8160
+ let isValidHex = false;
8161
+ if (i + 2 < size && data[i + 2] === '{') {
8162
+ // Must have a closing brace and at least one valid hex digit inside
8163
+ let j = i + 3;
8164
+ let hasHex = false;
8165
+ let closed = false;
8166
+ while (j < size) {
8167
+ const hexChar = data[j];
8168
+ if (hexChar === '}') {
8169
+ closed = true;
8170
+ break;
8171
+ }
8172
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8173
+ break;
8174
+ }
8175
+ hasHex = true;
8176
+ j++;
8177
+ }
8178
+ if (closed && hasHex) {
8179
+ isValidHex = true;
8180
+ }
8181
+ } else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
8182
+ isValidHex = true;
8183
+ }
8184
+ if (isValidHex) {
8185
+ result += '\\x';
8186
+ i += 2;
8187
+ } else {
8188
+ result += 'x';
8189
+ i += 2;
8190
+ changed = true;
8191
+ }
8192
+ continue;
8193
+ }
8194
+ // Whitelist of valid RE2/JS alphanumeric escapes
8195
+ case 'n':
8196
+ case 'r':
8197
+ case 't':
8198
+ case 'a':
8199
+ case 'f':
8200
+ case 'v':
8201
+ case 'd':
8202
+ case 'D':
8203
+ case 's':
8204
+ case 'S':
8205
+ case 'w':
8206
+ case 'W':
8207
+ case 'b':
8208
+ case 'B':
8209
+ case 'p':
8210
+ case 'P':
8211
+ case 'A':
8212
+ case 'z':
8213
+ case 'Q':
8214
+ case 'E':
8215
+ case '0':
8216
+ case '1':
8217
+ case '2':
8218
+ case '3':
8219
+ case '4':
8220
+ case '5':
8221
+ case '6':
8222
+ case '7':
8223
+ {
8224
+ result += '\\' + ch;
8066
8225
  i += 2;
8067
8226
  continue;
8068
8227
  }
8069
8228
  default:
8070
8229
  {
8071
- result += '\\';
8072
8230
  let cp = data.codePointAt(i + 1);
8073
- let symSize = Utils.charCount(cp);
8074
- result += data.substring(i + 1, i + 1 + symSize);
8075
- i += symSize + 1;
8231
+ let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
8232
+ if (isAlphaNum) {
8233
+ // Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
8234
+ // Gracefully degrade to the literal character to prevent RE2 syntax crashes
8235
+ let symSize = Utils.charCount(cp);
8236
+ result += data.substring(i + 1, i + 1 + symSize);
8237
+ i += symSize + 1;
8238
+ changed = true;
8239
+ } else {
8240
+ // Escaped symbol (e.g. \., \*, \])
8241
+ result += '\\';
8242
+ let symSize = Utils.charCount(cp);
8243
+ result += data.substring(i + 1, i + 1 + symSize);
8244
+ i += symSize + 1;
8245
+ }
8076
8246
  continue;
8077
8247
  }
8078
8248
  }
@@ -8082,7 +8252,13 @@ class TranslateRegExpString {
8082
8252
  i += 1;
8083
8253
  changed = true;
8084
8254
  continue;
8085
- } else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8255
+ } else if (ch === '[') {
8256
+ // Track entry into a character class (protects syntax inside)
8257
+ inCharClass = true;
8258
+ } else if (ch === ']') {
8259
+ // Track exit of a character class
8260
+ inCharClass = false;
8261
+ } else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8086
8262
  if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
8087
8263
  result += '(?P<';
8088
8264
  i += 3;
@@ -8095,7 +8271,13 @@ class TranslateRegExpString {
8095
8271
  result += data.substring(i, i + symSize);
8096
8272
  i += symSize;
8097
8273
  }
8098
- return changed ? result : data;
8274
+ const finalResult = changed ? result : data;
8275
+
8276
+ // Prepend any extracted inline flags as a leading (?flags) group
8277
+ if (prefixFlags.length > 0) {
8278
+ return `(?${prefixFlags})${finalResult}`;
8279
+ }
8280
+ return finalResult;
8099
8281
  }
8100
8282
  }
8101
8283
 
@@ -8173,7 +8355,7 @@ class RE2JS {
8173
8355
  * RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
8174
8356
  * resulting regex is safe and properly formatted before compilation.
8175
8357
  *
8176
- * @param {string} expr - The regular expression string to be translated.
8358
+ * @param {string|RegExp} expr - The regular expression string or native RegExp object to be translated.
8177
8359
  * @returns {string} - The transformed regular expression string, ready for compilation.
8178
8360
  */
8179
8361
  static translateRegExp(expr) {
@@ -8217,7 +8399,7 @@ class RE2JS {
8217
8399
  * Matches a string against a regular expression.
8218
8400
  *
8219
8401
  * @param {string} regex the regular expression
8220
- * @param {string|number[]} input the input
8402
+ * @param {string|number[]|Uint8Array} input the input
8221
8403
  * @returns {boolean} true if the regular expression matches the entire input
8222
8404
  * @throws RE2JSSyntaxException if the regular expression is malformed
8223
8405
  */
@@ -8284,7 +8466,7 @@ class RE2JS {
8284
8466
  /**
8285
8467
  * Matches a string against a regular expression.
8286
8468
  *
8287
- * @param {string|number[]} input the input
8469
+ * @param {string|number[]|Uint8Array} input the input
8288
8470
  * @returns {boolean} true if the regular expression matches the entire input
8289
8471
  */
8290
8472
  matches(input) {
@@ -8294,11 +8476,11 @@ class RE2JS {
8294
8476
  /**
8295
8477
  * Creates a new {@code Matcher} matching the pattern against the input.
8296
8478
  *
8297
- * @param {string|number[]} input the input string
8479
+ * @param {string|number[]|Uint8Array} input the input string or UTF-8 byte array
8298
8480
  * @returns {Matcher}
8299
8481
  */
8300
8482
  matcher(input) {
8301
- if (Array.isArray(input)) {
8483
+ if (Utils.isByteArray(input)) {
8302
8484
  input = MatcherInput.utf8(input);
8303
8485
  }
8304
8486
  return new Matcher(this, input);
@@ -8310,11 +8492,11 @@ class RE2JS {
8310
8492
  * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
8311
8493
  * and guarantees execution on the high-speed DFA engine whenever possible.
8312
8494
  *
8313
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8495
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8314
8496
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8315
8497
  */
8316
8498
  test(input) {
8317
- if (Array.isArray(input)) {
8499
+ if (Utils.isByteArray(input)) {
8318
8500
  // Reuse the existing UTF-8 fast-path method
8319
8501
  return this.re2Input.matchUTF8(input);
8320
8502
  }
@@ -8329,11 +8511,11 @@ class RE2JS {
8329
8511
  * faster because it does not request capture group data. By requesting 0 capture groups,
8330
8512
  * it securely routes execution through the DFA fast-path.
8331
8513
  *
8332
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8514
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8333
8515
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8334
8516
  */
8335
8517
  testExact(input) {
8336
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8518
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8337
8519
  return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
8338
8520
  }
8339
8521