re2js 2.2.3 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -562,8 +562,7 @@ console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
562
562
 
563
563
  ### Translating Regular Expressions
564
564
 
565
- The `translateRegExp()` method preprocesses a given regular expression string to ensure compatibility with RE2JS.
566
- It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, and converting named capture groups
565
+ The `translateRegExp()` method preprocesses a given regular expression string or native `RegExp` object to ensure compatibility with RE2JS. It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, converting named capture groups, and mapping the native `i`, `m`, and `s` flags to an inline flag prefix (the execution flags `g` and `y` are ignored).
567
566
 
568
567
  ```js
569
568
  import { RE2JS } from 're2js'
@@ -577,6 +576,14 @@ const unicodeRegexp = RE2JS.translateRegExp('\\u{1F600}') // '\\x{1F600}'
577
576
 
578
577
  RE2JS.matches(unicodeRegexp, '😀') // true
579
578
  RE2JS.matches(unicodeRegexp, '😃') // false
579
+
580
+ // Native RegExp objects are also supported
581
+ const translatedNative = RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
582
+
583
+ const re = RE2JS.compile(translatedNative)
584
+ re.test('FOO') // true
585
+
586
+ RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
580
587
  ```
581
588
 
582
589
  ## Performance and Architecture
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.3
5
+ * @version v2.3.1
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -1331,6 +1331,7 @@ class RE2JSInternalException extends RE2JSException {
1331
1331
  *
1332
1332
  * @author rsc@google.com (Russ Cox)
1333
1333
  */
1334
+
1334
1335
  class Matcher {
1335
1336
  /**
1336
1337
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1369,13 +1370,16 @@ class Matcher {
1369
1370
  /**
1370
1371
  *
1371
1372
  * @param {RE2JS} pattern
1372
- * @param {Utf8MatcherInput|Utf16MatcherInput|number[]|string} input
1373
+ * @param {string|number[]|Uint8Array} input
1373
1374
  */
1374
1375
  constructor(pattern, input) {
1375
1376
  if (pattern === null) {
1376
1377
  throw new Error('pattern is null');
1377
1378
  }
1378
- // The pattern being matched.
1379
+ /**
1380
+ * The pattern being matched.
1381
+ * @type {RE2JS}
1382
+ */
1379
1383
  this.patternInput = pattern;
1380
1384
  const re2 = this.patternInput.re2();
1381
1385
  // The number of submatches (groups) in the pattern.
@@ -1429,7 +1433,7 @@ class Matcher {
1429
1433
 
1430
1434
  /**
1431
1435
  * Resets the {@code Matcher} and changes the input.
1432
- * @param {Utf8MatcherInput|Utf16MatcherInput} input
1436
+ * @param {MatcherInputBase} input
1433
1437
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1434
1438
  */
1435
1439
  resetMatcherInput(input) {
@@ -1494,7 +1498,7 @@ class Matcher {
1494
1498
  /**
1495
1499
  * Returns the named or numbered group of the most recent match, or {@code null} if the group was not matched.
1496
1500
  * @param {string|number} [group=0]
1497
- * @returns {?string}
1501
+ * @returns {string|null}
1498
1502
  */
1499
1503
  group(group = 0) {
1500
1504
  if (typeof group === 'string') {
@@ -1586,7 +1590,7 @@ class Matcher {
1586
1590
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1587
1591
  * is a match, {@code find} sets the match state to describe it.
1588
1592
  *
1589
- * @param {number} [start=null] the input position where the search begins
1593
+ * @param {number|null} [start=null] the input position where the search begins
1590
1594
  * @returns {boolean} if it finds a match
1591
1595
  * @throws IndexOutOfBoundsException if start is not a valid input position
1592
1596
  */
@@ -7937,9 +7941,18 @@ class RE2 {
7937
7941
  }
7938
7942
 
7939
7943
  class RE2Set {
7944
+ /** @type {number} */
7940
7945
  static UNANCHORED = RE2Flags.UNANCHORED;
7946
+ /** @type {number} */
7941
7947
  static ANCHOR_START = RE2Flags.ANCHOR_START;
7948
+ /** @type {number} */
7942
7949
  static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
7950
+
7951
+ /**
7952
+ * Constructs a new RE2Set with the specified anchor mode and flags.
7953
+ * @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
7954
+ * @param {number} [flags=0] - The public flags to apply to all patterns in the set.
7955
+ */
7943
7956
  constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
7944
7957
  this.anchor = anchor;
7945
7958
  this.jsFlags = flags;
@@ -7956,6 +7969,14 @@ class RE2Set {
7956
7969
  this.dfa = null;
7957
7970
  this.dummyRe2 = null;
7958
7971
  }
7972
+
7973
+ /**
7974
+ * Adds a new regular expression pattern to the set.
7975
+ * Patterns cannot be added after the set has been compiled.
7976
+ * @param {string} pattern - The regular expression pattern to add.
7977
+ * @returns {number} The integer index assigned to the added pattern.
7978
+ * @throws {RE2JSCompileException} If patterns are added after compilation.
7979
+ */
7959
7980
  add(pattern) {
7960
7981
  if (this.prog) {
7961
7982
  throw new RE2JSCompileException('Cannot add patterns after compile');
@@ -7974,6 +7995,12 @@ class RE2Set {
7974
7995
  this.regexps.push(Simplify.simplify(re));
7975
7996
  return this.regexps.length - 1;
7976
7997
  }
7998
+
7999
+ /**
8000
+ * Compiles the added patterns into a single state machine.
8001
+ * This is automatically called on the first match if not called explicitly.
8002
+ * @returns {void}
8003
+ */
7977
8004
  compile() {
7978
8005
  if (this.prog) return;
7979
8006
  this.prog = Compiler.compileSet(this.regexps);
@@ -7986,6 +8013,12 @@ class RE2Set {
7986
8013
  longest: false
7987
8014
  };
7988
8015
  }
8016
+
8017
+ /**
8018
+ * Matches the input against the compiled set of regular expressions.
8019
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
8020
+ * @returns {number[]} An array of indices representing the patterns that successfully matched the input.
8021
+ */
7989
8022
  match(input) {
7990
8023
  if (!this.prog) this.compile();
7991
8024
  const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
@@ -8011,13 +8044,19 @@ class RE2Set {
8011
8044
  * Transforms a JS regex string (or native RegExp object) into an RE2 regex string
8012
8045
  */
8013
8046
  class TranslateRegExpString {
8014
- static isUpperCaseAlpha(ch) {
8015
- return 'A' <= ch && ch <= 'Z';
8016
- }
8017
8047
  static isHexadecimal(ch) {
8018
8048
  return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
8019
8049
  }
8020
8050
  static translate(data) {
8051
+ let prefixFlags = '';
8052
+ if (data instanceof RegExp) {
8053
+ if (data.ignoreCase) prefixFlags += 'i';
8054
+ if (data.multiline) prefixFlags += 'm';
8055
+ if (data.dotAll) prefixFlags += 's';
8056
+
8057
+ // execution flags ('g', 'y') are safely ignored here.
8058
+ data = data.source;
8059
+ }
8021
8060
  if (typeof data !== 'string') {
8022
8061
  return data;
8023
8062
  }
@@ -8028,6 +8067,7 @@ class TranslateRegExpString {
8028
8067
  result = '(?:)';
8029
8068
  changed = true;
8030
8069
  }
8070
+ let inCharClass = false;
8031
8071
  let i = 0;
8032
8072
  while (i < size) {
8033
8073
  let ch = data[i];
@@ -8066,10 +8106,28 @@ class TranslateRegExpString {
8066
8106
  if (i + 2 < size) {
8067
8107
  let nextCh = data[i + 2];
8068
8108
  if (nextCh === '{') {
8069
- result += '\\x';
8070
- i += 2;
8071
- changed = true;
8072
- continue;
8109
+ // Must have a closing brace and at least one valid hex digit inside
8110
+ let j = i + 3;
8111
+ let hasHex = false;
8112
+ let closed = false;
8113
+ while (j < size) {
8114
+ const hexChar = data[j];
8115
+ if (hexChar === '}') {
8116
+ closed = true;
8117
+ break;
8118
+ }
8119
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8120
+ break;
8121
+ }
8122
+ hasHex = true;
8123
+ j++;
8124
+ }
8125
+ if (closed && hasHex) {
8126
+ result += '\\x';
8127
+ i += 2;
8128
+ changed = true;
8129
+ continue;
8130
+ }
8073
8131
  } else if (i + 5 < size) {
8074
8132
  let isHex4 = true;
8075
8133
  for (let j = 0; j < 4; j++) {
@@ -8086,18 +8144,101 @@ class TranslateRegExpString {
8086
8144
  }
8087
8145
  }
8088
8146
  }
8147
+
8148
+ // Graceful degradation for invalid/unclosed \u sequences
8089
8149
  result += 'u';
8090
8150
  i += 2;
8091
8151
  changed = true;
8092
8152
  continue;
8093
8153
  }
8154
+ case 'x':
8155
+ {
8156
+ let isValidHex = false;
8157
+ if (i + 2 < size && data[i + 2] === '{') {
8158
+ // Must have a closing brace and at least one valid hex digit inside
8159
+ let j = i + 3;
8160
+ let hasHex = false;
8161
+ let closed = false;
8162
+ while (j < size) {
8163
+ const hexChar = data[j];
8164
+ if (hexChar === '}') {
8165
+ closed = true;
8166
+ break;
8167
+ }
8168
+ if (!TranslateRegExpString.isHexadecimal(hexChar)) {
8169
+ break;
8170
+ }
8171
+ hasHex = true;
8172
+ j++;
8173
+ }
8174
+ if (closed && hasHex) {
8175
+ isValidHex = true;
8176
+ }
8177
+ } else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
8178
+ isValidHex = true;
8179
+ }
8180
+ if (isValidHex) {
8181
+ result += '\\x';
8182
+ i += 2;
8183
+ } else {
8184
+ result += 'x';
8185
+ i += 2;
8186
+ changed = true;
8187
+ }
8188
+ continue;
8189
+ }
8190
+ // Whitelist of valid RE2/JS alphanumeric escapes
8191
+ case 'n':
8192
+ case 'r':
8193
+ case 't':
8194
+ case 'a':
8195
+ case 'f':
8196
+ case 'v':
8197
+ case 'd':
8198
+ case 'D':
8199
+ case 's':
8200
+ case 'S':
8201
+ case 'w':
8202
+ case 'W':
8203
+ case 'b':
8204
+ case 'B':
8205
+ case 'p':
8206
+ case 'P':
8207
+ case 'A':
8208
+ case 'z':
8209
+ case 'Q':
8210
+ case 'E':
8211
+ case '0':
8212
+ case '1':
8213
+ case '2':
8214
+ case '3':
8215
+ case '4':
8216
+ case '5':
8217
+ case '6':
8218
+ case '7':
8219
+ {
8220
+ result += '\\' + ch;
8221
+ i += 2;
8222
+ continue;
8223
+ }
8094
8224
  default:
8095
8225
  {
8096
- result += '\\';
8097
8226
  let cp = data.codePointAt(i + 1);
8098
- let symSize = Utils.charCount(cp);
8099
- result += data.substring(i + 1, i + 1 + symSize);
8100
- i += symSize + 1;
8227
+ let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
8228
+ if (isAlphaNum) {
8229
+ // Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
8230
+ // Gracefully degrade to the literal character to prevent RE2 syntax crashes
8231
+ let symSize = Utils.charCount(cp);
8232
+ result += data.substring(i + 1, i + 1 + symSize);
8233
+ i += symSize + 1;
8234
+ changed = true;
8235
+ } else {
8236
+ // Escaped symbol (e.g. \., \*, \])
8237
+ result += '\\';
8238
+ let symSize = Utils.charCount(cp);
8239
+ result += data.substring(i + 1, i + 1 + symSize);
8240
+ i += symSize + 1;
8241
+ }
8101
8242
  continue;
8102
8243
  }
8103
8244
  }
@@ -8107,7 +8248,13 @@ class TranslateRegExpString {
8107
8248
  i += 1;
8108
8249
  changed = true;
8109
8250
  continue;
8110
- } else if (ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8251
+ } else if (ch === '[') {
8252
+ // Track entry into a character class (protects syntax inside)
8253
+ inCharClass = true;
8254
+ } else if (ch === ']') {
8255
+ // Track exit of a character class
8256
+ inCharClass = false;
8257
+ } else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
8111
8258
  if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
8112
8259
  result += '(?P<';
8113
8260
  i += 3;
@@ -8120,7 +8267,13 @@ class TranslateRegExpString {
8120
8267
  result += data.substring(i, i + symSize);
8121
8268
  i += symSize;
8122
8269
  }
8123
- return changed ? result : data;
8270
+ const finalResult = changed ? result : data;
8271
+
8272
+ // Prepend any extracted inline flags
8273
+ if (prefixFlags.length > 0) {
8274
+ return `(?${prefixFlags})${finalResult}`;
8275
+ }
8276
+ return finalResult;
8124
8277
  }
8125
8278
  }
8126
8279
 
@@ -8198,7 +8351,7 @@ class RE2JS {
8198
8351
  * RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
8199
8352
  * resulting regex is safe and properly formatted before compilation.
8200
8353
  *
8201
- * @param {string} expr - The regular expression string to be translated.
8354
+ * @param {string|RegExp} expr - The regular expression string or native RegExp object to be translated.
8202
8355
  * @returns {string} - The transformed regular expression string, ready for compilation.
8203
8356
  */
8204
8357
  static translateRegExp(expr) {
@@ -8242,7 +8395,7 @@ class RE2JS {
8242
8395
  * Matches a string against a regular expression.
8243
8396
  *
8244
8397
  * @param {string} regex the regular expression
8245
- * @param {string|number[]} input the input
8398
+ * @param {string|number[]|Uint8Array} input the input
8246
8399
  * @returns {boolean} true if the regular expression matches the entire input
8247
8400
  * @throws RE2JSSyntaxException if the regular expression is malformed
8248
8401
  */
@@ -8309,7 +8462,7 @@ class RE2JS {
8309
8462
  /**
8310
8463
  * Matches a string against a regular expression.
8311
8464
  *
8312
- * @param {string|number[]} input the input
8465
+ * @param {string|number[]|Uint8Array} input the input
8313
8466
  * @returns {boolean} true if the regular expression matches the entire input
8314
8467
  */
8315
8468
  matches(input) {
@@ -8319,7 +8472,7 @@ class RE2JS {
8319
8472
  /**
8320
8473
  * Creates a new {@code Matcher} matching the pattern against the input.
8321
8474
  *
8322
- * @param {string|number[]} input the input string
8475
+ * @param {string|number[]|Uint8Array} input the input string or UTF-8 byte array
8323
8476
  * @returns {Matcher}
8324
8477
  */
8325
8478
  matcher(input) {
@@ -8335,7 +8488,7 @@ class RE2JS {
8335
8488
  * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
8336
8489
  * and guarantees execution on the high-speed DFA engine whenever possible.
8337
8490
  *
8338
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8491
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8339
8492
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8340
8493
  */
8341
8494
  test(input) {
@@ -8354,7 +8507,7 @@ class RE2JS {
8354
8507
  * faster because it does not request capture group data. By requesting 0 capture groups,
8355
8508
  * it securely routes execution through the DFA fast-path.
8356
8509
  *
8357
- * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
8510
+ * @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
8358
8511
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8359
8512
  */
8360
8513
  testExact(input) {