re2js 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -577,6 +577,10 @@ const unicodeRegexp = RE2JS.translateRegExp('\\u{1F600}') // '\\x{1F600}'
577
577
 
578
578
  RE2JS.matches(unicodeRegexp, '😀') // true
579
579
  RE2JS.matches(unicodeRegexp, '😃') // false
580
+
581
+ // also supports native RegExp objects
582
+ RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
583
+ RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
580
584
  ```
581
585
 
582
586
  ## Performance and Architecture
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.3
5
+ * @version v2.3.0
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1331,6 +1331,11 @@ class RE2JSInternalException extends RE2JSException {
1331
1331
  *
1332
1332
  * @author rsc@google.com (Russ Cox)
1333
1333
  */
1334
+
1335
+ /**
1336
+ * @typedef {import('./index').RE2JS} RE2JS_Pattern
1337
+ */
1338
+
1334
1339
  class Matcher {
1335
1340
  /**
1336
1341
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1368,14 +1373,17 @@ class Matcher {
1368
1373
  }
1369
1374
  /**
1370
1375
  *
1371
- * @param {RE2JS} pattern
1372
- * @param {Utf8MatcherInput|Utf16MatcherInput|number[]|string} input
1376
+ * @param {RE2JS_Pattern} pattern
1377
+ * @param {Uint8Array|number[]|string} input
1373
1378
  */
1374
1379
  constructor(pattern, input) {
1375
1380
  if (pattern === null) {
1376
1381
  throw new Error('pattern is null');
1377
1382
  }
1378
- // The pattern being matched.
1383
+ /**
1384
+ * The pattern being matched.
1385
+ * @type {RE2JS_Pattern}
1386
+ */
1379
1387
  this.patternInput = pattern;
1380
1388
  const re2 = this.patternInput.re2();
1381
1389
  // The number of submatches (groups) in the pattern.
@@ -1399,7 +1407,7 @@ class Matcher {
1399
1407
 
1400
1408
  /**
1401
1409
  * Returns the {@code RE2JS} associated with this {@code Matcher}.
1402
- * @returns {RE2JS}
1410
+ * @returns {RE2JS_Pattern}
1403
1411
  */
1404
1412
  pattern() {
1405
1413
  return this.patternInput;
@@ -1429,7 +1437,7 @@ class Matcher {
1429
1437
 
1430
1438
  /**
1431
1439
  * Resets the {@code Matcher} and changes the input.
1432
- * @param {Utf8MatcherInput|Utf16MatcherInput} input
1440
+ * @param {import('./MatcherInput').MatcherInputBase} input
1433
1441
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1434
1442
  */
1435
1443
  resetMatcherInput(input) {
@@ -1494,7 +1502,7 @@ class Matcher {
1494
1502
  /**
1495
1503
  * Returns the named group of the most recent match, or {@code null} if the group was not matched.
1496
1504
  * @param {string|number} [group=0]
1497
- * @returns {?string}
1505
+ * @returns {string|null}
1498
1506
  */
1499
1507
  group(group = 0) {
1500
1508
  if (typeof group === 'string') {
@@ -1586,7 +1594,7 @@ class Matcher {
1586
1594
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1587
1595
  * is a match, {@code find} sets the match state to describe it.
1588
1596
  *
1589
- * @param {number} [start=null] the input position where the search begins
1597
+ * @param {number|null} [start=null] the input position where the search begins
1590
1598
  * @returns {boolean} if it finds a match
1591
1599
  * @throws IndexOutOfBoundsException if start is not a valid input position
1592
1600
  */
@@ -7937,9 +7945,18 @@ class RE2 {
7937
7945
  }
7938
7946
 
7939
7947
  class RE2Set {
7948
+ /** @type {number} */
7940
7949
  static UNANCHORED = RE2Flags.UNANCHORED;
7950
+ /** @type {number} */
7941
7951
  static ANCHOR_START = RE2Flags.ANCHOR_START;
7952
+ /** @type {number} */
7942
7953
  static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
7954
+
7955
+ /**
7956
+ * Constructs a new RE2Set with the specified anchor mode and flags.
7957
+ * @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
7958
+ * @param {number} [flags=0] - The public flags to apply to all patterns in the set.
7959
+ */
7943
7960
  constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
7944
7961
  this.anchor = anchor;
7945
7962
  this.jsFlags = flags;
@@ -7956,6 +7973,14 @@ class RE2Set {
7956
7973
  this.dfa = null;
7957
7974
  this.dummyRe2 = null;
7958
7975
  }
7976
+
7977
+ /**
7978
+ * Adds a new regular expression pattern to the set.
7979
+ * Patterns cannot be added after the set has been compiled.
7980
+ * @param {string} pattern - The regular expression pattern to add.
7981
+ * @returns {number} The integer index assigned to the added pattern.
7982
+ * @throws {RE2JSCompileException} If patterns are added after compilation.
7983
+ */
7959
7984
  add(pattern) {
7960
7985
  if (this.prog) {
7961
7986
  throw new RE2JSCompileException('Cannot add patterns after compile');
@@ -7974,6 +7999,12 @@ class RE2Set {
7974
7999
  this.regexps.push(Simplify.simplify(re));
7975
8000
  return this.regexps.length - 1;
7976
8001
  }
8002
+
8003
+ /**
8004
+ * Compiles the added patterns into a single state machine.
8005
+ * This is automatically called on the first match if not called explicitly.
8006
+ * @returns {void}
8007
+ */
7977
8008
  compile() {
7978
8009
  if (this.prog) return;
7979
8010
  this.prog = Compiler.compileSet(this.regexps);
@@ -7986,6 +8017,12 @@ class RE2Set {
7986
8017
  longest: false
7987
8018
  };
7988
8019
  }
8020
+
8021
+ /**
8022
+ * Matches the input against the compiled set of regular expressions.
8023
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
8024
+ * @returns {number[]} An array of indices representing the patterns that successfully matched the input.
8025
+ */
7989
8026
  match(input) {
7990
8027
  if (!this.prog) this.compile();
7991
8028
  const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
@@ -8011,13 +8048,19 @@ class RE2Set {
8011
8048
  * Transform JS regex string to RE2 regex string
8012
8049
  */
8013
8050
  class TranslateRegExpString {
8014
- static isUpperCaseAlpha(ch) {
8015
- return 'A' <= ch && ch <= 'Z';
8016
- }
8017
8051
  static isHexadecimal(ch) {
8018
8052
  return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
8019
8053
  }
8020
8054
  static translate(data) {
8055
+ let prefixFlags = '';
8056
+ if (data instanceof RegExp) {
8057
+ if (data.ignoreCase) prefixFlags += 'i';
8058
+ if (data.multiline) prefixFlags += 'm';
8059
+ if (data.dotAll) prefixFlags += 's';
8060
+
8061
+ // execution flags ('g', 'y') are safely ignored here.
8062
+ data = data.source;
8063
+ }
8021
8064
  if (typeof data !== 'string') {
8022
8065
  return data;
8023
8066
  }
@@ -8028,6 +8071,7 @@ class TranslateRegExpString {
8028
8071
  result = '(?:)';
8029
8072
  changed = true;
8030
8073
  }
8074
+ let inCharClass = false;
8031
8075
  let i = 0;
8032
8076
  while (i < size) {
8033
8077
  let ch = data[i];
@@ -8066,10 +8110,28 @@ class TranslateRegExpString {
8066
8110
  if (i + 2 < size) {
8067
8111
  let nextCh = data[i + 2];
8068
8112
  if (nextCh === '{') {
8069
- result += '\\x';
8070
- i += 2;
8071
- changed = true;
8072
- continue;
8113
+ // Must have a closing brace and at least one valid hex digit inside
8114
+ let j = i + 3;
8115
+ let hasHex = false;
8116
+ let closed = false;
8117
+ while (j < size) {
8118
+ const hexChar = data[j];
8119
+ if (hexChar === '}') {
8120
+ closed = true;
8121
+ break;
8122
+ }
8123
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8124
+ break;
8125
+ }
8126
+ hasHex = true;
8127
+ j++;
8128
+ }
8129
+ if (closed && hasHex) {
8130
+ result += '\\x';
8131
+ i += 2;
8132
+ changed = true;
8133
+ continue;
8134
+ }
8073
8135
  } else if (i + 5 < size) {
8074
8136
  let isHex4 = true;
8075
8137
  for (let j = 0; j < 4; j++) {
@@ -8086,18 +8148,101 @@ class TranslateRegExpString {
8086
8148
  }
8087
8149
  }
8088
8150
  }
8151
+
8152
+ // Graceful degradation for invalid/unclosed \u sequences
8089
8153
  result += 'u';
8090
8154
  i += 2;
8091
8155
  changed = true;
8092
8156
  continue;
8093
8157
  }
8158
+ case 'x':
8159
+ {
8160
+ let isValidHex = false;
8161
+ if (i + 2 < size && data[i + 2] === '{') {
8162
+ // Must have a closing brace and at least one valid hex digit inside
8163
+ let j = i + 3;
8164
+ let hasHex = false;
8165
+ let closed = false;
8166
+ while (j < size) {
8167
+ const hexChar = data[j];
8168
+ if (hexChar === '}') {
8169
+ closed = true;
8170
+ break;
8171
+ }
8172
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8173
+ break;
8174
+ }
8175
+ hasHex = true;
8176
+ j++;
8177
+ }
8178
+ if (closed && hasHex) {
8179
+ isValidHex = true;
8180
+ }
8181
+ } else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
8182
+ isValidHex = true;
8183
+ }
8184
+ if (isValidHex) {
8185
+ result += '\\x';
8186
+ i += 2;
8187
+ } else {
8188
+ result += 'x';
8189
+ i += 2;
8190
+ changed = true;
8191
+ }
8192
+ continue;
8193
+ }
8194
+ // Whitelist of valid RE2/JS alphanumeric escapes
8195
+ case 'n':
8196
+ case 'r':
8197
+ case 't':
8198
+ case 'a':
8199
+ case 'f':
8200
+ case 'v':
8201
+ case 'd':
8202
+ case 'D':
8203
+ case 's':
8204
+ case 'S':
8205
+ case 'w':
8206
+ case 'W':
8207
+ case 'b':
8208
+ case 'B':
8209
+ case 'p':
8210
+ case 'P':
8211
+ case 'A':
8212
+ case 'z':
8213
+ case 'Q':
8214
+ case 'E':
8215
+ case '0':
8216
+ case '1':
8217
+ case '2':
8218
+ case '3':
8219
+ case '4':
8220
+ case '5':
8221
+ case '6':
8222
+ case '7':
8223
+ {
8224
+ result += '\\' + ch;
8225
+ i += 2;
8226
+ continue;
8227
+ }
8094
8228
  default:
8095
8229
  {
8096
- result += '\\';
8097
8230
  let cp = data.codePointAt(i + 1);
8098
- let symSize = Utils.charCount(cp);
8099
- result += data.substring(i + 1, i + 1 + symSize);
8100
- i += symSize + 1;
8231
+ let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
8232
+ if (isAlphaNum) {
8233
+ // Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
8234
+ // Gracefully degrade to the literal character to prevent RE2 syntax crashes
8235
+ let symSize = Utils.charCount(cp);
8236
+ result += data.substring(i + 1, i + 1 + symSize);
8237
+ i += symSize + 1;
8238
+ changed = true;
8239
+ } else {
8240
+ // Escaped symbol (e.g. \., \*, \])
8241
+ result += '\\';
8242
+ let symSize = Utils.charCount(cp);
8243
+ result += data.substring(i + 1, i + 1 + symSize);
8244
+ i += symSize + 1;
8245
+ }
8101
8246
  continue;
8102
8247
  }
8103
8248
  }
@@ -8107,7 +8252,13 @@ class TranslateRegExpString {
8107
8252
  i += 1;
8108
8253
  changed = true;
8109
8254
  continue;
8110
- } else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8255
+ } else if (ch === '[') {
8256
+ // Track entry into a character class (protects syntax inside)
8257
+ inCharClass = true;
8258
+ } else if (ch === ']') {
8259
+ // Track exit of a character class
8260
+ inCharClass = false;
8261
+ } else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8111
8262
  if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
8112
8263
  result += '(?P<';
8113
8264
  i += 3;
@@ -8120,7 +8271,13 @@ class TranslateRegExpString {
8120
8271
  result += data.substring(i, i + symSize);
8121
8272
  i += symSize;
8122
8273
  }
8123
- return changed ? result : data;
8274
+ const finalResult = changed ? result : data;
8275
+
8276
+ // Prepend any inline flags extracted from a native RegExp
8277
+ if (prefixFlags.length > 0) {
8278
+ return `(?${prefixFlags})${finalResult}`;
8279
+ }
8280
+ return finalResult;
8124
8281
  }
8125
8282
  }
8126
8283
 
@@ -8198,7 +8355,7 @@ class RE2JS {
8198
8355
  * RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
8199
8356
  * resulting regex is safe and properly formatted before compilation.
8200
8357
  *
8201
- * @param {string} expr - The regular expression string to be translated.
8358
+ * @param {string|RegExp} expr - The regular expression, given as a string or a native RegExp object, to be translated.
8202
8359
  * @returns {string} - The transformed regular expression string, ready for compilation.
8203
8360
  */
8204
8361
  static translateRegExp(expr) {
@@ -8242,7 +8399,7 @@ class RE2JS {
8242
8399
  * Matches a string against a regular expression.
8243
8400
  *
8244
8401
  * @param {string} regex the regular expression
8245
- * @param {string|number[]} input the input
8402
+ * @param {string|number[]|Uint8Array} input the input
8246
8403
  * @returns {boolean} true if the regular expression matches the entire input
8247
8404
  * @throws RE2JSSyntaxException if the regular expression is malformed
8248
8405
  */
@@ -8309,7 +8466,7 @@ class RE2JS {
8309
8466
  /**
8310
8467
  * Matches a string against a regular expression.
8311
8468
  *
8312
- * @param {string|number[]} input the input
8469
+ * @param {string|number[]|Uint8Array} input the input
8313
8470
  * @returns {boolean} true if the regular expression matches the entire input
8314
8471
  */
8315
8472
  matches(input) {
@@ -8319,7 +8476,7 @@ class RE2JS {
8319
8476
  /**
8320
8477
  * Creates a new {@code Matcher} matching the pattern against the input.
8321
8478
  *
8322
- * @param {string|number[]} input the input string
8479
+ * @param {string|number[]|Uint8Array} input the input string
8323
8480
  * @returns {Matcher}
8324
8481
  */
8325
8482
  matcher(input) {
@@ -8335,7 +8492,7 @@ class RE2JS {
8335
8492
  * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
8336
8493
  * and guarantees execution on the high-speed DFA engine whenever possible.
8337
8494
  *
8338
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8495
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8339
8496
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8340
8497
  */
8341
8498
  test(input) {
@@ -8354,7 +8511,7 @@ class RE2JS {
8354
8511
  * faster because it does not request capture group data. By requesting 0 capture groups,
8355
8512
  * it securely routes execution through the DFA fast-path.
8356
8513
  *
8357
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8514
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8358
8515
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8359
8516
  */
8360
8517
  testExact(input) {