re2js 2.2.3 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.3
5
+ * @version v2.3.1
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1335,6 +1335,7 @@
1335
1335
  *
1336
1336
  * @author rsc@google.com (Russ Cox)
1337
1337
  */
1338
+
1338
1339
  class Matcher {
1339
1340
  /**
1340
1341
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1373,13 +1374,16 @@
1373
1374
  /**
1374
1375
  *
1375
1376
  * @param {RE2JS} pattern
1376
- * @param {Utf8MatcherInput|Utf16MatcherInput|number[]|string} input
1377
+ * @param {string|number[]|Uint8Array} input
1377
1378
  */
1378
1379
  constructor(pattern, input) {
1379
1380
  if (pattern === null) {
1380
1381
  throw new Error('pattern is null');
1381
1382
  }
1382
- // The pattern being matched.
1383
+ /**
1384
+ * The pattern being matched.
1385
+ * @type {RE2JS}
1386
+ */
1383
1387
  this.patternInput = pattern;
1384
1388
  const re2 = this.patternInput.re2();
1385
1389
  // The number of submatches (groups) in the pattern.
@@ -1433,7 +1437,7 @@
1433
1437
 
1434
1438
  /**
1435
1439
  * Resets the {@code Matcher} and changes the input.
1436
- * @param {Utf8MatcherInput|Utf16MatcherInput} input
1440
+ * @param {MatcherInputBase} input
1437
1441
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1438
1442
  */
1439
1443
  resetMatcherInput(input) {
@@ -1498,7 +1502,7 @@
1498
1502
  /**
1499
1503
  * Returns the group of the most recent match, selected by index or by name, or {@code null} if the group was not matched.
1500
1504
  * @param {string|number} [group=0]
1501
- * @returns {?string}
1505
+ * @returns {string|null}
1502
1506
  */
1503
1507
  group(group = 0) {
1504
1508
  if (typeof group === 'string') {
@@ -1590,7 +1594,7 @@
1590
1594
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1591
1595
  * is a match, {@code find} sets the match state to describe it.
1592
1596
  *
1593
- * @param {number} [start=null] the input position where the search begins
1597
+ * @param {number|null} [start=null] the input position where the search begins
1594
1598
  * @returns {boolean} if it finds a match
1595
1599
  * @throws IndexOutOfBoundsException if start is not a valid input position
1596
1600
  */
@@ -7941,9 +7945,18 @@
7941
7945
  }
7942
7946
 
7943
7947
  class RE2Set {
7948
+ /** @type {number} */
7944
7949
  static UNANCHORED = RE2Flags.UNANCHORED;
7950
+ /** @type {number} */
7945
7951
  static ANCHOR_START = RE2Flags.ANCHOR_START;
7952
+ /** @type {number} */
7946
7953
  static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
7954
+
7955
+ /**
7956
+ * Constructs a new RE2Set with the specified anchor mode and flags.
7957
+ * @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
7958
+ * @param {number} [flags=0] - The public flags to apply to all patterns in the set.
7959
+ */
7947
7960
  constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
7948
7961
  this.anchor = anchor;
7949
7962
  this.jsFlags = flags;
@@ -7960,6 +7973,14 @@
7960
7973
  this.dfa = null;
7961
7974
  this.dummyRe2 = null;
7962
7975
  }
7976
+
7977
+ /**
7978
+ * Adds a new regular expression pattern to the set.
7979
+ * Patterns cannot be added after the set has been compiled.
7980
+ * @param {string} pattern - The regular expression pattern to add.
7981
+ * @returns {number} The integer index assigned to the added pattern.
7982
+ * @throws {RE2JSCompileException} If patterns are added after compilation.
7983
+ */
7963
7984
  add(pattern) {
7964
7985
  if (this.prog) {
7965
7986
  throw new RE2JSCompileException('Cannot add patterns after compile');
@@ -7978,6 +7999,12 @@
7978
7999
  this.regexps.push(Simplify.simplify(re));
7979
8000
  return this.regexps.length - 1;
7980
8001
  }
8002
+
8003
+ /**
8004
+ * Compiles the added patterns into a single state machine.
8005
+ * This is automatically called on the first match if not called explicitly.
8006
+ * @returns {void}
8007
+ */
7981
8008
  compile() {
7982
8009
  if (this.prog) return;
7983
8010
  this.prog = Compiler.compileSet(this.regexps);
@@ -7990,6 +8017,12 @@
7990
8017
  longest: false
7991
8018
  };
7992
8019
  }
8020
+
8021
+ /**
8022
+ * Matches the input against the compiled set of regular expressions.
8023
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
8024
+ * @returns {number[]} An array of indices representing the patterns that successfully matched the input.
8025
+ */
7993
8026
  match(input) {
7994
8027
  if (!this.prog) this.compile();
7995
8028
  const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
@@ -8015,13 +8048,19 @@
8015
8048
  * Transform JS regex string to RE2 regex string
8016
8049
  */
8017
8050
  class TranslateRegExpString {
8018
- static isUpperCaseAlpha(ch) {
8019
- return 'A' <= ch && ch <= 'Z';
8020
- }
8021
8051
  static isHexadecimal(ch) {
8022
8052
  return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
8023
8053
  }
8024
8054
  static translate(data) {
8055
+ let prefixFlags = '';
8056
+ if (data instanceof RegExp) {
8057
+ if (data.ignoreCase) prefixFlags += 'i';
8058
+ if (data.multiline) prefixFlags += 'm';
8059
+ if (data.dotAll) prefixFlags += 's';
8060
+
8061
+ // execution flags ('g', 'y') are safely ignored here.
8062
+ data = data.source;
8063
+ }
8025
8064
  if (typeof data !== 'string') {
8026
8065
  return data;
8027
8066
  }
@@ -8032,6 +8071,7 @@
8032
8071
  result = '(?:)';
8033
8072
  changed = true;
8034
8073
  }
8074
+ let inCharClass = false;
8035
8075
  let i = 0;
8036
8076
  while (i < size) {
8037
8077
  let ch = data[i];
@@ -8070,10 +8110,28 @@
8070
8110
  if (i + 2 < size) {
8071
8111
  let nextCh = data[i + 2];
8072
8112
  if (nextCh === '{') {
8073
- result += '\\x';
8074
- i += 2;
8075
- changed = true;
8076
- continue;
8113
+ // Must have a closing brace and at least one valid hex digit inside
8114
+ let j = i + 3;
8115
+ let hasHex = false;
8116
+ let closed = false;
8117
+ while (j < size) {
8118
+ const hexChar = data[j];
8119
+ if (hexChar === '}') {
8120
+ closed = true;
8121
+ break;
8122
+ }
8123
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8124
+ break;
8125
+ }
8126
+ hasHex = true;
8127
+ j++;
8128
+ }
8129
+ if (closed && hasHex) {
8130
+ result += '\\x';
8131
+ i += 2;
8132
+ changed = true;
8133
+ continue;
8134
+ }
8077
8135
  } else if (i + 5 < size) {
8078
8136
  let isHex4 = true;
8079
8137
  for (let j = 0; j < 4; j++) {
@@ -8090,18 +8148,101 @@
8090
8148
  }
8091
8149
  }
8092
8150
  }
8151
+
8152
+ // Graceful degradation for invalid/unclosed \u sequences
8093
8153
  result += 'u';
8094
8154
  i += 2;
8095
8155
  changed = true;
8096
8156
  continue;
8097
8157
  }
8158
+ case 'x':
8159
+ {
8160
+ let isValidHex = false;
8161
+ if (i + 2 < size && data[i + 2] === '{') {
8162
+ // Must have a closing brace and at least one valid hex digit inside
8163
+ let j = i + 3;
8164
+ let hasHex = false;
8165
+ let closed = false;
8166
+ while (j < size) {
8167
+ const hexChar = data[j];
8168
+ if (hexChar === '}') {
8169
+ closed = true;
8170
+ break;
8171
+ }
8172
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8173
+ break;
8174
+ }
8175
+ hasHex = true;
8176
+ j++;
8177
+ }
8178
+ if (closed && hasHex) {
8179
+ isValidHex = true;
8180
+ }
8181
+ } else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
8182
+ isValidHex = true;
8183
+ }
8184
+ if (isValidHex) {
8185
+ result += '\\x';
8186
+ i += 2;
8187
+ } else {
8188
+ result += 'x';
8189
+ i += 2;
8190
+ changed = true;
8191
+ }
8192
+ continue;
8193
+ }
8194
+ // Whitelist of valid RE2/JS alphanumeric escapes
8195
+ case 'n':
8196
+ case 'r':
8197
+ case 't':
8198
+ case 'a':
8199
+ case 'f':
8200
+ case 'v':
8201
+ case 'd':
8202
+ case 'D':
8203
+ case 's':
8204
+ case 'S':
8205
+ case 'w':
8206
+ case 'W':
8207
+ case 'b':
8208
+ case 'B':
8209
+ case 'p':
8210
+ case 'P':
8211
+ case 'A':
8212
+ case 'z':
8213
+ case 'Q':
8214
+ case 'E':
8215
+ case '0':
8216
+ case '1':
8217
+ case '2':
8218
+ case '3':
8219
+ case '4':
8220
+ case '5':
8221
+ case '6':
8222
+ case '7':
8223
+ {
8224
+ result += '\\' + ch;
8225
+ i += 2;
8226
+ continue;
8227
+ }
8098
8228
  default:
8099
8229
  {
8100
- result += '\\';
8101
8230
  let cp = data.codePointAt(i + 1);
8102
- let symSize = Utils.charCount(cp);
8103
- result += data.substring(i + 1, i + 1 + symSize);
8104
- i += symSize + 1;
8231
+ let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
8232
+ if (isAlphaNum) {
8233
+ // Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
8234
+ // Gracefully degrade to the literal character to prevent RE2 syntax crashes
8235
+ let symSize = Utils.charCount(cp);
8236
+ result += data.substring(i + 1, i + 1 + symSize);
8237
+ i += symSize + 1;
8238
+ changed = true;
8239
+ } else {
8240
+ // Escaped symbol (e.g. \., \*, \])
8241
+ result += '\\';
8242
+ let symSize = Utils.charCount(cp);
8243
+ result += data.substring(i + 1, i + 1 + symSize);
8244
+ i += symSize + 1;
8245
+ }
8105
8246
  continue;
8106
8247
  }
8107
8248
  }
@@ -8111,7 +8252,13 @@
8111
8252
  i += 1;
8112
8253
  changed = true;
8113
8254
  continue;
8114
- } else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8255
+ } else if (ch === '[') {
8256
+ // Track entry into a character class (protects syntax inside)
8257
+ inCharClass = true;
8258
+ } else if (ch === ']') {
8259
+ // Track exit of a character class
8260
+ inCharClass = false;
8261
+ } else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8115
8262
  if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
8116
8263
  result += '(?P<';
8117
8264
  i += 3;
@@ -8124,7 +8271,13 @@
8124
8271
  result += data.substring(i, i + symSize);
8125
8272
  i += symSize;
8126
8273
  }
8127
- return changed ? result : data;
8274
+ const finalResult = changed ? result : data;
8275
+
8276
+ // Prepend any extracted flags as a leading inline-flag group, e.g. (?im)
8277
+ if (prefixFlags.length > 0) {
8278
+ return `(?${prefixFlags})${finalResult}`;
8279
+ }
8280
+ return finalResult;
8128
8281
  }
8129
8282
  }
8130
8283
 
@@ -8202,7 +8355,7 @@
8202
8355
  * RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
8203
8356
  * resulting regex is safe and properly formatted before compilation.
8204
8357
  *
8205
- * @param {string} expr - The regular expression string to be translated.
8358
+ * @param {string|RegExp} expr - The regular expression (a pattern string or a RegExp object) to be translated.
8206
8359
  * @returns {string} - The transformed regular expression string, ready for compilation.
8207
8360
  */
8208
8361
  static translateRegExp(expr) {
@@ -8246,7 +8399,7 @@
8246
8399
  * Matches a string against a regular expression.
8247
8400
  *
8248
8401
  * @param {string} regex the regular expression
8249
- * @param {string|number[]} input the input
8402
+ * @param {string|number[]|Uint8Array} input the input
8250
8403
  * @returns {boolean} true if the regular expression matches the entire input
8251
8404
  * @throws RE2JSSyntaxException if the regular expression is malformed
8252
8405
  */
@@ -8313,7 +8466,7 @@
8313
8466
  /**
8314
8467
  * Matches a string against a regular expression.
8315
8468
  *
8316
- * @param {string|number[]} input the input
8469
+ * @param {string|number[]|Uint8Array} input the input
8317
8470
  * @returns {boolean} true if the regular expression matches the entire input
8318
8471
  */
8319
8472
  matches(input) {
@@ -8323,7 +8476,7 @@
8323
8476
  /**
8324
8477
  * Creates a new {@code Matcher} matching the pattern against the input.
8325
8478
  *
8326
- * @param {string|number[]} input the input string
8479
+ * @param {string|number[]|Uint8Array} input the input string
8327
8480
  * @returns {Matcher}
8328
8481
  */
8329
8482
  matcher(input) {
@@ -8339,7 +8492,7 @@
8339
8492
  * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
8340
8493
  * and guarantees execution on the high-speed DFA engine whenever possible.
8341
8494
  *
8342
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8495
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8343
8496
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8344
8497
  */
8345
8498
  test(input) {
@@ -8358,7 +8511,7 @@
8358
8511
  * faster because it does not request capture group data. By requesting 0 capture groups,
8359
8512
  * it securely routes execution through the DFA fast-path.
8360
8513
  *
8361
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8514
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8362
8515
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8363
8516
  */
8364
8517
  testExact(input) {