re2js 2.2.3 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/build/index.cjs.cjs +178 -25
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts +56 -67
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +178 -25
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +178 -25
- package/build/index.umd.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -562,8 +562,7 @@ console.log(RE2JS.compile('(a+b?)').programSize()); // Outputs: 8
|
|
|
562
562
|
|
|
563
563
|
### Translating Regular Expressions
|
|
564
564
|
|
|
565
|
-
The `translateRegExp()` method preprocesses a given regular expression string to ensure compatibility with RE2JS.
|
|
566
|
-
It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, and converting named capture groups
|
|
565
|
+
The `translateRegExp()` method preprocesses a given regular expression string or native `RegExp` object to ensure compatibility with RE2JS. It applies necessary transformations, such as escaping special characters, adjusting Unicode sequences, converting named capture groups, and mapping the native `i`, `m`, and `s` flags to an inline `(?ims)` prefix. Execution flags such as `g` and `y` are ignored.
|
|
567
566
|
|
|
568
567
|
```js
|
|
569
568
|
import { RE2JS } from 're2js'
|
|
@@ -577,6 +576,14 @@ const unicodeRegexp = RE2JS.translateRegExp('\\u{1F600}') // '\\x{1F600}'
|
|
|
577
576
|
|
|
578
577
|
RE2JS.matches(unicodeRegexp, '😀') // true
|
|
579
578
|
RE2JS.matches(unicodeRegexp, '😃') // false
|
|
579
|
+
|
|
580
|
+
// native RegExp objects are also supported
|
|
581
|
+
const translatedNative = RE2JS.translateRegExp(/foo/ims) // '(?ims)foo'
|
|
582
|
+
|
|
583
|
+
const re = RE2JS.compile(translatedNative)
|
|
584
|
+
re.test('FOO') // true
|
|
585
|
+
|
|
586
|
+
RE2JS.translateRegExp(/bar/giy) // '(?i)bar'
|
|
580
587
|
```
|
|
581
588
|
|
|
582
589
|
## Performance and Architecture
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.2.3
|
|
5
|
+
* @version v2.3.1
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -1331,6 +1331,7 @@ class RE2JSInternalException extends RE2JSException {
|
|
|
1331
1331
|
*
|
|
1332
1332
|
* @author rsc@google.com (Russ Cox)
|
|
1333
1333
|
*/
|
|
1334
|
+
|
|
1334
1335
|
class Matcher {
|
|
1335
1336
|
/**
|
|
1336
1337
|
* Quotes '\' and '$' in {@code s}, so that the returned string could be used in
|
|
@@ -1369,13 +1370,16 @@ class Matcher {
|
|
|
1369
1370
|
/**
|
|
1370
1371
|
*
|
|
1371
1372
|
* @param {RE2JS} pattern
|
|
1372
|
-
* @param {
|
|
1373
|
+
* @param {string|number[]|Uint8Array} input
|
|
1373
1374
|
*/
|
|
1374
1375
|
constructor(pattern, input) {
|
|
1375
1376
|
if (pattern === null) {
|
|
1376
1377
|
throw new Error('pattern is null');
|
|
1377
1378
|
}
|
|
1378
|
-
|
|
1379
|
+
/**
|
|
1380
|
+
* The pattern being matched.
|
|
1381
|
+
* @type {RE2JS}
|
|
1382
|
+
*/
|
|
1379
1383
|
this.patternInput = pattern;
|
|
1380
1384
|
const re2 = this.patternInput.re2();
|
|
1381
1385
|
// The number of submatches (groups) in the pattern.
|
|
@@ -1429,7 +1433,7 @@ class Matcher {
|
|
|
1429
1433
|
|
|
1430
1434
|
/**
|
|
1431
1435
|
* Resets the {@code Matcher} and changes the input.
|
|
1432
|
-
* @param {
|
|
1436
|
+
* @param {MatcherInputBase} input
|
|
1433
1437
|
* @returns {Matcher} the {@code Matcher} itself, for chained method calls
|
|
1434
1438
|
*/
|
|
1435
1439
|
resetMatcherInput(input) {
|
|
@@ -1494,7 +1498,7 @@ class Matcher {
|
|
|
1494
1498
|
/**
|
|
1495
1499
|
* Returns the named group of the most recent match, or {@code null} if the group was not matched.
|
|
1496
1500
|
* @param {string|number} [group=0]
|
|
1497
|
-
* @returns {
|
|
1501
|
+
* @returns {string|null}
|
|
1498
1502
|
*/
|
|
1499
1503
|
group(group = 0) {
|
|
1500
1504
|
if (typeof group === 'string') {
|
|
@@ -1586,7 +1590,7 @@ class Matcher {
|
|
|
1586
1590
|
* Matches the input against the pattern (unanchored), starting at a specified position. If there
|
|
1587
1591
|
* is a match, {@code find} sets the match state to describe it.
|
|
1588
1592
|
*
|
|
1589
|
-
* @param {number} [start=null] the input position where the search begins
|
|
1593
|
+
* @param {number|null} [start=null] the input position where the search begins
|
|
1590
1594
|
* @returns {boolean} if it finds a match
|
|
1591
1595
|
* @throws IndexOutOfBoundsException if start is not a valid input position
|
|
1592
1596
|
*/
|
|
@@ -7937,9 +7941,18 @@ class RE2 {
|
|
|
7937
7941
|
}
|
|
7938
7942
|
|
|
7939
7943
|
class RE2Set {
|
|
7944
|
+
/** @type {number} */
|
|
7940
7945
|
static UNANCHORED = RE2Flags.UNANCHORED;
|
|
7946
|
+
/** @type {number} */
|
|
7941
7947
|
static ANCHOR_START = RE2Flags.ANCHOR_START;
|
|
7948
|
+
/** @type {number} */
|
|
7942
7949
|
static ANCHOR_BOTH = RE2Flags.ANCHOR_BOTH;
|
|
7950
|
+
|
|
7951
|
+
/**
|
|
7952
|
+
* Constructs a new RE2Set with the specified anchor mode and flags.
|
|
7953
|
+
* @param {number} [anchor=RE2Set.UNANCHORED] - The anchoring mode (e.g., RE2Set.UNANCHORED).
|
|
7954
|
+
* @param {number} [flags=0] - The public flags to apply to all patterns in the set.
|
|
7955
|
+
*/
|
|
7943
7956
|
constructor(anchor = RE2Set.UNANCHORED, flags = 0) {
|
|
7944
7957
|
this.anchor = anchor;
|
|
7945
7958
|
this.jsFlags = flags;
|
|
@@ -7956,6 +7969,14 @@ class RE2Set {
|
|
|
7956
7969
|
this.dfa = null;
|
|
7957
7970
|
this.dummyRe2 = null;
|
|
7958
7971
|
}
|
|
7972
|
+
|
|
7973
|
+
/**
|
|
7974
|
+
* Adds a new regular expression pattern to the set.
|
|
7975
|
+
* Patterns cannot be added after the set has been compiled.
|
|
7976
|
+
* @param {string} pattern - The regular expression pattern to add.
|
|
7977
|
+
* @returns {number} The integer index assigned to the added pattern.
|
|
7978
|
+
* @throws {RE2JSCompileException} If patterns are added after compilation.
|
|
7979
|
+
*/
|
|
7959
7980
|
add(pattern) {
|
|
7960
7981
|
if (this.prog) {
|
|
7961
7982
|
throw new RE2JSCompileException('Cannot add patterns after compile');
|
|
@@ -7974,6 +7995,12 @@ class RE2Set {
|
|
|
7974
7995
|
this.regexps.push(Simplify.simplify(re));
|
|
7975
7996
|
return this.regexps.length - 1;
|
|
7976
7997
|
}
|
|
7998
|
+
|
|
7999
|
+
/**
|
|
8000
|
+
* Compiles the added patterns into a single state machine.
|
|
8001
|
+
* This is automatically called on the first match if not called explicitly.
|
|
8002
|
+
* @returns {void}
|
|
8003
|
+
*/
|
|
7977
8004
|
compile() {
|
|
7978
8005
|
if (this.prog) return;
|
|
7979
8006
|
this.prog = Compiler.compileSet(this.regexps);
|
|
@@ -7986,6 +8013,12 @@ class RE2Set {
|
|
|
7986
8013
|
longest: false
|
|
7987
8014
|
};
|
|
7988
8015
|
}
|
|
8016
|
+
|
|
8017
|
+
/**
|
|
8018
|
+
* Matches the input against the compiled set of regular expressions.
|
|
8019
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to match against.
|
|
8020
|
+
* @returns {number[]} An array of indices representing the patterns that successfully matched the input.
|
|
8021
|
+
*/
|
|
7989
8022
|
match(input) {
|
|
7990
8023
|
if (!this.prog) this.compile();
|
|
7991
8024
|
const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
|
|
@@ -8011,13 +8044,19 @@ class RE2Set {
|
|
|
8011
8044
|
* Transform JS regex string to RE2 regex string
|
|
8012
8045
|
*/
|
|
8013
8046
|
class TranslateRegExpString {
|
|
8014
|
-
static isUpperCaseAlpha(ch) {
|
|
8015
|
-
return 'A' <= ch && ch <= 'Z';
|
|
8016
|
-
}
|
|
8017
8047
|
static isHexadecimal(ch) {
|
|
8018
8048
|
return '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'F' || 'a' <= ch && ch <= 'f';
|
|
8019
8049
|
}
|
|
8020
8050
|
static translate(data) {
|
|
8051
|
+
let prefixFlags = '';
|
|
8052
|
+
if (data instanceof RegExp) {
|
|
8053
|
+
if (data.ignoreCase) prefixFlags += 'i';
|
|
8054
|
+
if (data.multiline) prefixFlags += 'm';
|
|
8055
|
+
if (data.dotAll) prefixFlags += 's';
|
|
8056
|
+
|
|
8057
|
+
// execution flags ('g', 'y') are safely ignored here.
|
|
8058
|
+
data = data.source;
|
|
8059
|
+
}
|
|
8021
8060
|
if (typeof data !== 'string') {
|
|
8022
8061
|
return data;
|
|
8023
8062
|
}
|
|
@@ -8028,6 +8067,7 @@ class TranslateRegExpString {
|
|
|
8028
8067
|
result = '(?:)';
|
|
8029
8068
|
changed = true;
|
|
8030
8069
|
}
|
|
8070
|
+
let inCharClass = false;
|
|
8031
8071
|
let i = 0;
|
|
8032
8072
|
while (i < size) {
|
|
8033
8073
|
let ch = data[i];
|
|
@@ -8066,10 +8106,28 @@ class TranslateRegExpString {
|
|
|
8066
8106
|
if (i + 2 < size) {
|
|
8067
8107
|
let nextCh = data[i + 2];
|
|
8068
8108
|
if (nextCh === '{') {
|
|
8069
|
-
|
|
8070
|
-
i
|
|
8071
|
-
|
|
8072
|
-
|
|
8109
|
+
// Must have a closing brace and at least one valid hex digit inside
|
|
8110
|
+
let j = i + 3;
|
|
8111
|
+
let hasHex = false;
|
|
8112
|
+
let closed = false;
|
|
8113
|
+
while (j < size) {
|
|
8114
|
+
const hexChar = data[j];
|
|
8115
|
+
if (hexChar === '}') {
|
|
8116
|
+
closed = true;
|
|
8117
|
+
break;
|
|
8118
|
+
}
|
|
8119
|
+
if (!TranslateRegExpString.isHexadecimal(hexChar)) {
|
|
8120
|
+
break;
|
|
8121
|
+
}
|
|
8122
|
+
hasHex = true;
|
|
8123
|
+
j++;
|
|
8124
|
+
}
|
|
8125
|
+
if (closed && hasHex) {
|
|
8126
|
+
result += '\\x';
|
|
8127
|
+
i += 2;
|
|
8128
|
+
changed = true;
|
|
8129
|
+
continue;
|
|
8130
|
+
}
|
|
8073
8131
|
} else if (i + 5 < size) {
|
|
8074
8132
|
let isHex4 = true;
|
|
8075
8133
|
for (let j = 0; j < 4; j++) {
|
|
@@ -8086,18 +8144,101 @@ class TranslateRegExpString {
|
|
|
8086
8144
|
}
|
|
8087
8145
|
}
|
|
8088
8146
|
}
|
|
8147
|
+
|
|
8148
|
+
// Graceful degradation for invalid/unclosed \u sequences
|
|
8089
8149
|
result += 'u';
|
|
8090
8150
|
i += 2;
|
|
8091
8151
|
changed = true;
|
|
8092
8152
|
continue;
|
|
8093
8153
|
}
|
|
8154
|
+
case 'x':
|
|
8155
|
+
{
|
|
8156
|
+
let isValidHex = false;
|
|
8157
|
+
if (i + 2 < size && data[i + 2] === '{') {
|
|
8158
|
+
// Must have a closing brace and at least one valid hex digit inside
|
|
8159
|
+
let j = i + 3;
|
|
8160
|
+
let hasHex = false;
|
|
8161
|
+
let closed = false;
|
|
8162
|
+
while (j < size) {
|
|
8163
|
+
const hexChar = data[j];
|
|
8164
|
+
if (hexChar === '}') {
|
|
8165
|
+
closed = true;
|
|
8166
|
+
break;
|
|
8167
|
+
}
|
|
8168
|
+
if (!TranslateRegExpString.isHexadecimal(hexChar)) {
|
|
8169
|
+
break;
|
|
8170
|
+
}
|
|
8171
|
+
hasHex = true;
|
|
8172
|
+
j++;
|
|
8173
|
+
}
|
|
8174
|
+
if (closed && hasHex) {
|
|
8175
|
+
isValidHex = true;
|
|
8176
|
+
}
|
|
8177
|
+
} else if (i + 3 < size && TranslateRegExpString.isHexadecimal(data[i + 2]) && TranslateRegExpString.isHexadecimal(data[i + 3])) {
|
|
8178
|
+
isValidHex = true;
|
|
8179
|
+
}
|
|
8180
|
+
if (isValidHex) {
|
|
8181
|
+
result += '\\x';
|
|
8182
|
+
i += 2;
|
|
8183
|
+
} else {
|
|
8184
|
+
result += 'x';
|
|
8185
|
+
i += 2;
|
|
8186
|
+
changed = true;
|
|
8187
|
+
}
|
|
8188
|
+
continue;
|
|
8189
|
+
}
|
|
8190
|
+
// Whitelist of valid RE2/JS alphanumeric escapes
|
|
8191
|
+
case 'n':
|
|
8192
|
+
case 'r':
|
|
8193
|
+
case 't':
|
|
8194
|
+
case 'a':
|
|
8195
|
+
case 'f':
|
|
8196
|
+
case 'v':
|
|
8197
|
+
case 'd':
|
|
8198
|
+
case 'D':
|
|
8199
|
+
case 's':
|
|
8200
|
+
case 'S':
|
|
8201
|
+
case 'w':
|
|
8202
|
+
case 'W':
|
|
8203
|
+
case 'b':
|
|
8204
|
+
case 'B':
|
|
8205
|
+
case 'p':
|
|
8206
|
+
case 'P':
|
|
8207
|
+
case 'A':
|
|
8208
|
+
case 'z':
|
|
8209
|
+
case 'Q':
|
|
8210
|
+
case 'E':
|
|
8211
|
+
case '0':
|
|
8212
|
+
case '1':
|
|
8213
|
+
case '2':
|
|
8214
|
+
case '3':
|
|
8215
|
+
case '4':
|
|
8216
|
+
case '5':
|
|
8217
|
+
case '6':
|
|
8218
|
+
case '7':
|
|
8219
|
+
{
|
|
8220
|
+
result += '\\' + ch;
|
|
8221
|
+
i += 2;
|
|
8222
|
+
continue;
|
|
8223
|
+
}
|
|
8094
8224
|
default:
|
|
8095
8225
|
{
|
|
8096
|
-
result += '\\';
|
|
8097
8226
|
let cp = data.codePointAt(i + 1);
|
|
8098
|
-
let
|
|
8099
|
-
|
|
8100
|
-
|
|
8227
|
+
let isAlphaNum = cp >= 48 && cp <= 57 || cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122;
|
|
8228
|
+
if (isAlphaNum) {
|
|
8229
|
+
// Invalid JS alphanumeric escape sequence (e.g. \8, \9, \e, \K)
|
|
8230
|
+
// Gracefully degrade to the literal character to prevent RE2 syntax crashes
|
|
8231
|
+
let symSize = Utils.charCount(cp);
|
|
8232
|
+
result += data.substring(i + 1, i + 1 + symSize);
|
|
8233
|
+
i += symSize + 1;
|
|
8234
|
+
changed = true;
|
|
8235
|
+
} else {
|
|
8236
|
+
// Escaped symbol (e.g. \., \*, \])
|
|
8237
|
+
result += '\\';
|
|
8238
|
+
let symSize = Utils.charCount(cp);
|
|
8239
|
+
result += data.substring(i + 1, i + 1 + symSize);
|
|
8240
|
+
i += symSize + 1;
|
|
8241
|
+
}
|
|
8101
8242
|
continue;
|
|
8102
8243
|
}
|
|
8103
8244
|
}
|
|
@@ -8107,7 +8248,13 @@ class TranslateRegExpString {
|
|
|
8107
8248
|
i += 1;
|
|
8108
8249
|
changed = true;
|
|
8109
8250
|
continue;
|
|
8110
|
-
} else if (ch === '
|
|
8251
|
+
} else if (ch === '[') {
|
|
8252
|
+
// Track entry into a character class (protects syntax inside)
|
|
8253
|
+
inCharClass = true;
|
|
8254
|
+
} else if (ch === ']') {
|
|
8255
|
+
// Track exit of a character class
|
|
8256
|
+
inCharClass = false;
|
|
8257
|
+
} else if (!inCharClass && ch === '(' && i + 2 < size && data[i + 1] === '?' && data[i + 2] === '<') {
|
|
8111
8258
|
if (i + 3 < size && !'=!>)'.includes(data[i + 3])) {
|
|
8112
8259
|
result += '(?P<';
|
|
8113
8260
|
i += 3;
|
|
@@ -8120,7 +8267,13 @@ class TranslateRegExpString {
|
|
|
8120
8267
|
result += data.substring(i, i + symSize);
|
|
8121
8268
|
i += symSize;
|
|
8122
8269
|
}
|
|
8123
|
-
|
|
8270
|
+
const finalResult = changed ? result : data;
|
|
8271
|
+
|
|
8272
|
+
// Prepend any extracted flags as an inline `(?flags)` group
|
|
8273
|
+
if (prefixFlags.length > 0) {
|
|
8274
|
+
return `(?${prefixFlags})${finalResult}`;
|
|
8275
|
+
}
|
|
8276
|
+
return finalResult;
|
|
8124
8277
|
}
|
|
8125
8278
|
}
|
|
8126
8279
|
|
|
@@ -8198,7 +8351,7 @@ class RE2JS {
|
|
|
8198
8351
|
* RE2JS-compatible syntax, and handling Unicode sequences properly. It ensures that the
|
|
8199
8352
|
* resulting regex is safe and properly formatted before compilation.
|
|
8200
8353
|
*
|
|
8201
|
-
* @param {string} expr - The regular expression string to be translated.
|
|
8354
|
+
* @param {string|RegExp} expr - The regular expression string or native RegExp object to be translated.
|
|
8202
8355
|
* @returns {string} - The transformed regular expression string, ready for compilation.
|
|
8203
8356
|
*/
|
|
8204
8357
|
static translateRegExp(expr) {
|
|
@@ -8242,7 +8395,7 @@ class RE2JS {
|
|
|
8242
8395
|
* Matches a string against a regular expression.
|
|
8243
8396
|
*
|
|
8244
8397
|
* @param {string} regex the regular expression
|
|
8245
|
-
* @param {string|number[]} input the input
|
|
8398
|
+
* @param {string|number[]|Uint8Array} input the input
|
|
8246
8399
|
* @returns {boolean} true if the regular expression matches the entire input
|
|
8247
8400
|
* @throws RE2JSSyntaxException if the regular expression is malformed
|
|
8248
8401
|
*/
|
|
@@ -8309,7 +8462,7 @@ class RE2JS {
|
|
|
8309
8462
|
/**
|
|
8310
8463
|
* Matches a string against a regular expression.
|
|
8311
8464
|
*
|
|
8312
|
-
* @param {string|number[]} input the input
|
|
8465
|
+
* @param {string|number[]|Uint8Array} input the input
|
|
8313
8466
|
* @returns {boolean} true if the regular expression matches the entire input
|
|
8314
8467
|
*/
|
|
8315
8468
|
matches(input) {
|
|
@@ -8319,7 +8472,7 @@ class RE2JS {
|
|
|
8319
8472
|
/**
|
|
8320
8473
|
* Creates a new {@code Matcher} matching the pattern against the input.
|
|
8321
8474
|
*
|
|
8322
|
-
* @param {string|number[]} input the input string
|
|
8475
|
+
* @param {string|number[]|Uint8Array} input the input string
|
|
8323
8476
|
* @returns {Matcher}
|
|
8324
8477
|
*/
|
|
8325
8478
|
matcher(input) {
|
|
@@ -8335,7 +8488,7 @@ class RE2JS {
|
|
|
8335
8488
|
* a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
|
|
8336
8489
|
* and guarantees execution on the high-speed DFA engine whenever possible.
|
|
8337
8490
|
*
|
|
8338
|
-
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
8491
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
|
|
8339
8492
|
* @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
|
|
8340
8493
|
*/
|
|
8341
8494
|
test(input) {
|
|
@@ -8354,7 +8507,7 @@ class RE2JS {
|
|
|
8354
8507
|
* faster because it does not request capture group data. By requesting 0 capture groups,
|
|
8355
8508
|
* it securely routes execution through the DFA fast-path.
|
|
8356
8509
|
*
|
|
8357
|
-
* @param {string|number[]} input - The input string or UTF-8 byte array to test against.
|
|
8510
|
+
* @param {string|number[]|Uint8Array} input - The input string or UTF-8 byte array to test against.
|
|
8358
8511
|
* @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
|
|
8359
8512
|
*/
|
|
8360
8513
|
testExact(input) {
|