re2js 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.3
5
+ * @version v2.3.0
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1335,6 +1335,11 @@
1335
1335
  *
1336
1336
  * @author rsc@google.com (Russ Cox)
1337
1337
  */
1338
+
1339
+ /**
1340
+ * @typedef {import('./index').RE2JS} RE2JS_Pattern
1341
+ */
1342
+
1338
1343
  class Matcher {
1339
1344
  /**
1340
1345
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1372,14 +1377,17 @@
1372
1377
  }
1373
1378
  /**
1374
1379
  *
1375
- * @param {RE2JS} pattern
1376
- * @param {Utf8MatcherInput|Utf16MatcherInput|number[]|string} input
1380
+ * @param {RE2JS_Pattern} pattern
1381
+ * @param {Uint8Array|number[]|string} input
1377
1382
  */
1378
1383
  constructor(pattern, input) {
1379
1384
  if (pattern === null) {
1380
1385
  throw new Error('pattern is null');
1381
1386
  }
1382
- // The pattern being matched.
1387
+ /**
1388
+ * The pattern being matched.
1389
+ * @type {RE2JS_Pattern}
1390
+ */
1383
1391
  this.patternInput = pattern;
1384
1392
  const re2 = this.patternInput.re2();
1385
1393
  // The number of submatches (groups) in the pattern.
@@ -1403,7 +1411,7 @@
1403
1411
 
1404
1412
  /**
1405
1413
  * Returns the {@code RE2JS} associated with this {@code Matcher}.
1406
- * @returns {RE2JS}
1414
+ * @returns {RE2JS_Pattern}
1407
1415
  */
1408
1416
  pattern() {
1409
1417
  return this.patternInput;
@@ -1433,7 +1441,7 @@
1433
1441
 
1434
1442
  /**
1435
1443
  * Resets the {@code Matcher} and changes the input.
1436
- * @param {Utf8MatcherInput|Utf16MatcherInput} input
1444
+ * @param {import('./MatcherInput').MatcherInputBase} input
1437
1445
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1438
1446
  */
1439
1447
  resetMatcherInput(input) {
@@ -1498,7 +1506,7 @@
1498
1506
  /**
1499
1507
  * Returns the named group of the most recent match, or {@code null} if the group was not matched.
1500
1508
  * @param {string|number} [group=0]
1501
- * @returns {?string}
1509
+ * @returns {string|null}
1502
1510
  */
1503
1511
  group(group = 0) {
1504
1512
  if (typeof group === 'string') {
@@ -1590,7 +1598,7 @@
1590
1598
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1591
1599
  * is a match, {@code find} sets the match state to describe it.
1592
1600
  *
1593
- * @param {number} [start=null] the input position where the search begins
1601
+ * @param {number|null} [start=null] the input position where the search begins
1594
1602
  * @returns {boolean} if it finds a match
1595
1603
  * @throws IndexOutOfBoundsException if start is not a valid input position
1596
1604
  */
@@ -7941,9 +7949,18 @@
7941
7949
  }
7942
7950
 
7943
7951
  class RE2Set {
7952
+ /** @type {number} */
7944
7953
  static UNANCHORED = RE2Flags.UNANCHORED;
7954
+ /** @type {number} */
7945
7955
  static ANCHOR_START = RE2Flags.ANCHOR_START;
7956
+ /** @type {number} */
7946
7957
  static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
7958
+
7959
+ /**
7960
+ * Constructs a new RE2Set with the specified anchor mode and flags.
7961
+ * @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
7962
+ * @param {number} [flags=0] - The public flags to apply to all patterns in the set.
7963
+ */
7947
7964
  constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
7948
7965
  this.anchor = anchor;
7949
7966
  this.jsFlags = flags;
@@ -7960,6 +7977,14 @@
7960
7977
  this.dfa = null;
7961
7978
  this.dummyRe2 = null;
7962
7979
  }
7980
+
7981
+ /**
7982
+ * Adds a new regular expression pattern to the set.
7983
+ * Patterns cannot be added after the set has been compiled.
7984
+ * @param {string} pattern - The regular expression pattern to add.
7985
+ * @returns {number} The integer index assigned to the added pattern.
7986
+ * @throws {RE2JSCompileException} If patterns are added after compilation.
7987
+ */
7963
7988
  add(pattern) {
7964
7989
  if (this.prog) {
7965
7990
  throw new RE2JSCompileException('Cannot add patterns after compile');
@@ -7978,6 +8003,12 @@
7978
8003
  this.regexps.push(Simplify.simplify(re));
7979
8004
  return this.regexps.length - 1;
7980
8005
  }
8006
+
8007
+ /**
8008
+ * Compiles the added patterns into a single state machine.
8009
+ * This is automatically called on the first match if not called explicitly.
8010
+ * @returns {void}
8011
+ */
7981
8012
  compile() {
7982
8013
  if (this.prog) return;
7983
8014
  this.prog = Compiler.compileSet(this.regexps);
@@ -7990,6 +8021,12 @@
7990
8021
  longest: false
7991
8022
  };
7992
8023
  }
8024
+
8025
+ /**
8026
+ * Matches the input against the compiled set of regular expressions.
8027
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
8028
+ * @returns {number[]} An array of indices representing the patterns that successfully matched the input.
8029
+ */
7993
8030
  match(input) {
7994
8031
  if (!this.prog) this.compile();
7995
8032
  const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
@@ -8015,13 +8052,19 @@
8015
8052
  * Transform JS regex string to RE2 regex string
8016
8053
  */
8017
8054
  class TranslateRegExpString {
8018
- static isUpperCaseAlpha(ch) {
8019
- return 'A' <= ch && ch <= 'Z';
8020
- }
8021
8055
  static isHexadecimal(ch) {
8022
8056
  return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
8023
8057
  }
8024
8058
  static translate(data) {
8059
+ let prefixFlags = '';
8060
+ if (data instanceof RegExp) {
8061
+ if (data.ignoreCase) prefixFlags += 'i';
8062
+ if (data.multiline) prefixFlags += 'm';
8063
+ if (data.dotAll) prefixFlags += 's';
8064
+
8065
+ // execution flags ('g', 'y') are safely ignored here.
8066
+ data = data.source;
8067
+ }
8025
8068
  if (typeof data !== 'string') {
8026
8069
  return data;
8027
8070
  }
@@ -8032,6 +8075,7 @@
8032
8075
  result = '(?:)';
8033
8076
  changed = true;
8034
8077
  }
8078
+ let inCharClass = false;
8035
8079
  let i = 0;
8036
8080
  while (i < size) {
8037
8081
  let ch = data[i];
@@ -8070,10 +8114,28 @@
8070
8114
  if (i + 2 < size) {
8071
8115
  let nextCh = data[i + 2];
8072
8116
  if (nextCh === '{') {
8073
- result += '\\x';
8074
- i += 2;
8075
- changed = true;
8076
- continue;
8117
+ // Must have a closing brace and at least one valid hex digit inside
8118
+ let j = i + 3;
8119
+ let hasHex = false;
8120
+ let closed = false;
8121
+ while (j < size) {
8122
+ const hexChar = data[j];
8123
+ if (hexChar === '}') {
8124
+ closed = true;
8125
+ break;
8126
+ }
8127
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8128
+ break;
8129
+ }
8130
+ hasHex = true;
8131
+ j++;
8132
+ }
8133
+ if (closed && hasHex) {
8134
+ result += '\\x';
8135
+ i += 2;
8136
+ changed = true;
8137
+ continue;
8138
+ }
8077
8139
  } else if (i + 5 < size) {
8078
8140
  let isHex4 = true;
8079
8141
  for (let j = 0; j < 4; j++) {
@@ -8090,18 +8152,101 @@
8090
8152
  }
8091
8153
  }
8092
8154
  }
8155
+
8156
+ // Graceful degradation for invalid/unclosed \u sequences
8093
8157
  result += 'u';
8094
8158
  i += 2;
8095
8159
  changed = true;
8096
8160
  continue;
8097
8161
  }
8162
+ case 'x':
8163
+ {
8164
+ let isValidHex = false;
8165
+ if (i + 2 < size && data[i + 2] === '{') {
8166
+ // Must have a closing brace and at least one valid hex digit inside
8167
+ let j = i + 3;
8168
+ let hasHex = false;
8169
+ let closed = false;
8170
+ while (j < size) {
8171
+ const hexChar = data[j];
8172
+ if (hexChar === '}') {
8173
+ closed = true;
8174
+ break;
8175
+ }
8176
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8177
+ break;
8178
+ }
8179
+ hasHex = true;
8180
+ j++;
8181
+ }
8182
+ if (closed && hasHex) {
8183
+ isValidHex = true;
8184
+ }
8185
+ } else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
8186
+ isValidHex = true;
8187
+ }
8188
+ if (isValidHex) {
8189
+ result += '\\x';
8190
+ i += 2;
8191
+ } else {
8192
+ result += 'x';
8193
+ i += 2;
8194
+ changed = true;
8195
+ }
8196
+ continue;
8197
+ }
8198
+ // Whitelist of valid RE2/JS alphanumeric escapes
8199
+ case 'n':
8200
+ case 'r':
8201
+ case 't':
8202
+ case 'a':
8203
+ case 'f':
8204
+ case 'v':
8205
+ case 'd':
8206
+ case 'D':
8207
+ case 's':
8208
+ case 'S':
8209
+ case 'w':
8210
+ case 'W':
8211
+ case 'b':
8212
+ case 'B':
8213
+ case 'p':
8214
+ case 'P':
8215
+ case 'A':
8216
+ case 'z':
8217
+ case 'Q':
8218
+ case 'E':
8219
+ case '0':
8220
+ case '1':
8221
+ case '2':
8222
+ case '3':
8223
+ case '4':
8224
+ case '5':
8225
+ case '6':
8226
+ case '7':
8227
+ {
8228
+ result += '\\' + ch;
8229
+ i += 2;
8230
+ continue;
8231
+ }
8098
8232
  default:
8099
8233
  {
8100
- result += '\\';
8101
8234
  let cp = data.codePointAt(i + 1);
8102
- let symSize = Utils.charCount(cp);
8103
- result += data.substring(i + 1, i + 1 + symSize);
8104
- i += symSize + 1;
8235
+ let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
8236
+ if (isAlphaNum) {
8237
+ // Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
8238
+ // Gracefully degrade to the literal character to prevent RE2 syntax crashes
8239
+ let symSize = Utils.charCount(cp);
8240
+ result += data.substring(i + 1, i + 1 + symSize);
8241
+ i += symSize + 1;
8242
+ changed = true;
8243
+ } else {
8244
+ // Escaped symbol (e.g. \., \*, \])
8245
+ result += '\\';
8246
+ let symSize = Utils.charCount(cp);
8247
+ result += data.substring(i + 1, i + 1 + symSize);
8248
+ i += symSize + 1;
8249
+ }
8105
8250
  continue;
8106
8251
  }
8107
8252
  }
@@ -8111,7 +8256,13 @@
8111
8256
  i += 1;
8112
8257
  changed = true;
8113
8258
  continue;
8114
- } else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8259
+ } else if (ch === '[') {
8260
+ // Track entry into a character class (protects syntax inside)
8261
+ inCharClass = true;
8262
+ } else if (ch === ']') {
8263
+ // Track exit of a character class
8264
+ inCharClass = false;
8265
+ } else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8115
8266
  if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
8116
8267
  result += '(?P<';
8117
8268
  i += 3;
@@ -8124,7 +8275,13 @@
8124
8275
  result += data.substring(i, i + symSize);
8125
8276
  i += symSize;
8126
8277
  }
8127
- return changed ? result : data;
8278
+ const finalResult = changed ? result : data;
8279
+
8280
+ // Prepend any extracted flags as an inline flag group, e.g. (?im)
8281
+ if (prefixFlags.length > 0) {
8282
+ return `(?${prefixFlags})${finalResult}`;
8283
+ }
8284
+ return finalResult;
8128
8285
  }
8129
8286
  }
8130
8287
 
@@ -8202,7 +8359,7 @@
8202
8359
  * RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
8203
8360
  * resulting regex is safe and properly formatted before compilation.
8204
8361
  *
8205
- * @param {string} expr - The regular expression string to be translated.
8362
+ * @param {string|RegExp} expr - The regular expression string or RegExp object to be translated.
8206
8363
  * @returns {string} - The transformed regular expression string, ready for compilation.
8207
8364
  */
8208
8365
  static translateRegExp(expr) {
@@ -8246,7 +8403,7 @@
8246
8403
  * Matches a string against a regular expression.
8247
8404
  *
8248
8405
  * @param {string} regex the regular expression
8249
- * @param {string|number[]} input the input
8406
+ * @param {string|number[]|Uint8Array} input the input
8250
8407
  * @returns {boolean} true if the regular expression matches the entire input
8251
8408
  * @throws RE2JSSyntaxException if the regular expression is malformed
8252
8409
  */
@@ -8313,7 +8470,7 @@
8313
8470
  /**
8314
8471
  * Matches a string against a regular expression.
8315
8472
  *
8316
- * @param {string|number[]} input the input
8473
+ * @param {string|number[]|Uint8Array} input the input
8317
8474
  * @returns {boolean} true if the regular expression matches the entire input
8318
8475
  */
8319
8476
  matches(input) {
@@ -8323,7 +8480,7 @@
8323
8480
  /**
8324
8481
  * Creates a new {@code Matcher} matching the pattern against the input.
8325
8482
  *
8326
- * @param {string|number[]} input the input string
8483
+ * @param {string|number[]|Uint8Array} input the input string or UTF-8 byte array
8327
8484
  * @returns {Matcher}
8328
8485
  */
8329
8486
  matcher(input) {
@@ -8339,7 +8496,7 @@
8339
8496
  * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
8340
8497
  * and guarantees execution on the high-speed DFA engine whenever possible.
8341
8498
  *
8342
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8499
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8343
8500
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8344
8501
  */
8345
8502
  test(input) {
@@ -8358,7 +8515,7 @@
8358
8515
  * faster because it does not request capture group data. By requesting 0 capture groups,
8359
8516
  * it securely routes execution through the DFA fast-path.
8360
8517
  *
8361
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8518
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8362
8519
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8363
8520
  */
8364
8521
  testExact(input) {