re2js 2.6.1 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.cjs.cjs +83 -44
- package/build/index.cjs.cjs.map +1 -1
- package/build/index.esm.d.ts.map +1 -1
- package/build/index.esm.js +83 -44
- package/build/index.esm.js.map +1 -1
- package/build/index.umd.js +83 -44
- package/build/index.umd.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -733,7 +733,7 @@ negative.test('foobar'); // false
|
|
|
733
733
|
|
|
734
734
|
1. **Performance Overhead:** If a regex contains a lookbehind, the engine is forced to safely bypass the ultra-fast Lazy DFA and OnePass engines. It evaluates the lookbehinds using parallel automata running on the NFA (Pike VM). While execution remains mathematically safe and linear $O(n)$, the NFA engine is generally slower than the DFA fast-paths. Use lookbehinds only when necessary.
|
|
735
735
|
2. **Prefix Acceleration is Disabled:** To ensure the parallel tracking automata initialize correctly, high-speed string prefix skipping (e.g., using `indexOf` to jump to a starting literal) is disabled when lookbehinds are present.
|
|
736
|
-
3. **Captureless Guarantee:** To prevent state-explosion vulnerabilities, lookbehinds are strictly evaluated as *captureless*. If you include a capturing group inside a lookbehind (e.g., `(?<=(foo))bar`), the engine will
|
|
736
|
+
3. **Captureless Guarantee:** To prevent state-explosion vulnerabilities and maintain strict safety invariants, lookbehinds are strictly evaluated as *captureless*. If you attempt to include a capturing group inside a lookbehind (e.g., `(?<=(foo))bar`), the engine will proactively throw a `SyntaxError` at compile time. Use non-capturing groups `(?:...)` instead.
|
|
737
737
|
|
|
738
738
|
|
|
739
739
|
## Development
|
package/build/index.cjs.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* re2js
|
|
3
3
|
* RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
|
|
4
4
|
*
|
|
5
|
-
* @version v2.6.1
|
|
5
|
+
* @version v2.7.1
|
|
6
6
|
* @author Oleksii Vasyliev
|
|
7
7
|
* @homepage https://github.com/le0pard/re2js#readme
|
|
8
8
|
* @repository github:le0pard/re2js
|
|
@@ -2155,9 +2155,9 @@ class Inst {
|
|
|
2155
2155
|
case Inst.NOP:
|
|
2156
2156
|
return `nop -> ${this.out}`;
|
|
2157
2157
|
case Inst.LB_WRITE:
|
|
2158
|
-
return `lbwrite ${this.
|
|
2158
|
+
return `lbwrite ${this.arg} -> ${this.out}`;
|
|
2159
2159
|
case Inst.LB_CHECK:
|
|
2160
|
-
return `lbcheck ${this.
|
|
2160
|
+
return `lbcheck ${this.arg} -> ${this.out}`;
|
|
2161
2161
|
case Inst.RUNE:
|
|
2162
2162
|
if (this.runes === null) {
|
|
2163
2163
|
return 'rune <null>';
|
|
@@ -2337,30 +2337,35 @@ class Machine {
|
|
|
2337
2337
|
}
|
|
2338
2338
|
this.matched = false;
|
|
2339
2339
|
this.matchcap.fill(-1);
|
|
2340
|
+
|
|
2341
|
+
// Lookbehinds must scan from the beginning of the string to build their state table,
|
|
2342
|
+
// even if the main pattern search is requested to start mid-string.
|
|
2343
|
+
let currentPos = this.prog.numLb > 0 ? 0 : pos;
|
|
2344
|
+
let matchStartPos = pos;
|
|
2340
2345
|
let runq = this.q0;
|
|
2341
2346
|
let nextq = this.q1;
|
|
2342
|
-
let r = input.step(
|
|
2347
|
+
let r = input.step(currentPos);
|
|
2343
2348
|
let rune = r >> 3;
|
|
2344
2349
|
let width = r & 7;
|
|
2345
2350
|
let rune1 = -1;
|
|
2346
2351
|
let width1 = 0;
|
|
2347
2352
|
if (r !== MachineInputBase.EOF()) {
|
|
2348
|
-
r = input.step(
|
|
2353
|
+
r = input.step(currentPos + width);
|
|
2349
2354
|
rune1 = r >> 3;
|
|
2350
2355
|
width1 = r & 7;
|
|
2351
2356
|
}
|
|
2352
2357
|
let flag;
|
|
2353
|
-
if (
|
|
2358
|
+
if (currentPos === 0) {
|
|
2354
2359
|
flag = Utils.emptyOpContext(-1, rune);
|
|
2355
2360
|
} else {
|
|
2356
|
-
flag = input.context(
|
|
2361
|
+
flag = input.context(currentPos);
|
|
2357
2362
|
}
|
|
2358
2363
|
while (true) {
|
|
2359
2364
|
if (runq.isEmpty()) {
|
|
2360
|
-
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 &&
|
|
2365
|
+
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) {
|
|
2361
2366
|
break;
|
|
2362
2367
|
}
|
|
2363
|
-
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) &&
|
|
2368
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
|
|
2364
2369
|
break;
|
|
2365
2370
|
}
|
|
2366
2371
|
if (this.matched) {
|
|
@@ -2370,43 +2375,50 @@ class Machine {
|
|
|
2370
2375
|
// Fast-forwarding the string pointer will skip over the positions where
|
|
2371
2376
|
// the parallel lookbehind automata need to be spawned.
|
|
2372
2377
|
if (this.prog.numLb === 0 && !(this.re2.prefix.length === 0) && rune1 !== this.re2.prefixRune && input.canCheckPrefix()) {
|
|
2373
|
-
const advance = input.index(this.re2,
|
|
2378
|
+
const advance = input.index(this.re2, currentPos);
|
|
2374
2379
|
if (advance < 0) {
|
|
2375
2380
|
break;
|
|
2376
2381
|
}
|
|
2377
|
-
|
|
2378
|
-
r = input.step(
|
|
2382
|
+
currentPos += advance;
|
|
2383
|
+
r = input.step(currentPos);
|
|
2379
2384
|
rune = r >> 3;
|
|
2380
2385
|
width = r & 7;
|
|
2381
|
-
r = input.step(
|
|
2386
|
+
r = input.step(currentPos + width);
|
|
2382
2387
|
rune1 = r >> 3;
|
|
2383
2388
|
width1 = r & 7;
|
|
2384
2389
|
}
|
|
2385
2390
|
}
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
// Spawn Lookbehind threads BEFORE the main pattern
|
|
2391
|
+
|
|
2392
|
+
// Optimize lookbehind spawning. Because lookbehinds are prefixed with `.*` by the compiler,
|
|
2393
|
+
// they only need to be spawned exactly once at the beginning of the string (currentPos === 0).
|
|
2394
|
+
if (currentPos === 0 && this.prog.numLb > 0) {
|
|
2391
2395
|
for (let i = 0; i < this.prog.lbStarts.length; i++) {
|
|
2392
|
-
this.add(runq, this.prog.lbStarts[i],
|
|
2396
|
+
this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
|
|
2397
|
+
}
|
|
2398
|
+
}
|
|
2399
|
+
if (!this.matched && (currentPos === 0 || anchor === RE2Flags.UNANCHORED)) {
|
|
2400
|
+
// ONLY spawn the main pattern if we have reached the requested search start boundary
|
|
2401
|
+
if (currentPos >= matchStartPos) {
|
|
2402
|
+
if (this.ncap > 0) {
|
|
2403
|
+
this.matchcap[0] = currentPos;
|
|
2404
|
+
}
|
|
2405
|
+
this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
|
|
2393
2406
|
}
|
|
2394
|
-
this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
|
|
2395
2407
|
}
|
|
2396
|
-
const nextPos =
|
|
2408
|
+
const nextPos = currentPos + width;
|
|
2397
2409
|
flag = input.context(nextPos);
|
|
2398
|
-
this.step(runq, nextq,
|
|
2410
|
+
this.step(runq, nextq, currentPos, nextPos, rune, flag, anchor, currentPos === input.endPos());
|
|
2399
2411
|
if (width === 0) {
|
|
2400
2412
|
break;
|
|
2401
2413
|
}
|
|
2402
2414
|
if (this.ncap === 0 && this.matched) {
|
|
2403
2415
|
break;
|
|
2404
2416
|
}
|
|
2405
|
-
|
|
2417
|
+
currentPos += width;
|
|
2406
2418
|
rune = rune1;
|
|
2407
2419
|
width = width1;
|
|
2408
2420
|
if (rune !== -1) {
|
|
2409
|
-
r = input.step(
|
|
2421
|
+
r = input.step(currentPos + width);
|
|
2410
2422
|
rune1 = r >> 3;
|
|
2411
2423
|
width1 = r & 7;
|
|
2412
2424
|
}
|
|
@@ -2423,35 +2435,46 @@ class Machine {
|
|
|
2423
2435
|
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
|
|
2424
2436
|
return [];
|
|
2425
2437
|
}
|
|
2438
|
+
|
|
2439
|
+
// Lookbehinds must scan from the beginning of the string to build their state table,
|
|
2440
|
+
// even if the main pattern search is requested to start mid-string.
|
|
2441
|
+
let currentPos = this.prog.numLb > 0 ? 0 : pos;
|
|
2442
|
+
let matchStartPos = pos;
|
|
2426
2443
|
let runq = this.q0;
|
|
2427
2444
|
let nextq = this.q1;
|
|
2428
|
-
let r = input.step(
|
|
2445
|
+
let r = input.step(currentPos);
|
|
2429
2446
|
let rune = r >> 3;
|
|
2430
2447
|
let width = r & 7;
|
|
2431
2448
|
let rune1 = -1;
|
|
2432
2449
|
let width1 = 0;
|
|
2433
2450
|
if (r !== MachineInputBase.EOF()) {
|
|
2434
|
-
r = input.step(
|
|
2451
|
+
r = input.step(currentPos + width);
|
|
2435
2452
|
rune1 = r >> 3;
|
|
2436
2453
|
width1 = r & 7;
|
|
2437
2454
|
}
|
|
2438
|
-
let flag =
|
|
2455
|
+
let flag = currentPos === 0 ? Utils.emptyOpContext(-1, rune) : input.context(currentPos);
|
|
2439
2456
|
const matches = new Set();
|
|
2440
2457
|
while (true) {
|
|
2441
2458
|
if (runq.isEmpty()) {
|
|
2442
|
-
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 &&
|
|
2443
|
-
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) &&
|
|
2459
|
+
if ((startCond & Utils.EMPTY_BEGIN_TEXT) !== 0 && currentPos !== 0) break;
|
|
2460
|
+
if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && currentPos !== 0) {
|
|
2444
2461
|
break;
|
|
2445
2462
|
}
|
|
2446
2463
|
}
|
|
2447
|
-
|
|
2448
|
-
|
|
2464
|
+
|
|
2465
|
+
// Optimize lookbehind spawning to exactly once at BOF
|
|
2466
|
+
if (currentPos === 0 && this.prog.numLb > 0) {
|
|
2449
2467
|
for (let i = 0; i < this.prog.lbStarts.length; i++) {
|
|
2450
|
-
this.add(runq, this.prog.lbStarts[i],
|
|
2468
|
+
this.add(runq, this.prog.lbStarts[i], currentPos, this.matchcap, flag, null);
|
|
2469
|
+
}
|
|
2470
|
+
}
|
|
2471
|
+
if (currentPos === 0 || anchor === RE2Flags.UNANCHORED) {
|
|
2472
|
+
// ONLY spawn the main pattern if we have reached the requested search start boundary
|
|
2473
|
+
if (currentPos >= matchStartPos) {
|
|
2474
|
+
this.add(runq, this.prog.start, currentPos, this.matchcap, flag, null);
|
|
2451
2475
|
}
|
|
2452
|
-
this.add(runq, this.prog.start, pos, this.matchcap, flag, null);
|
|
2453
2476
|
}
|
|
2454
|
-
const nextPos =
|
|
2477
|
+
const nextPos = currentPos + width;
|
|
2455
2478
|
flag = input.context(nextPos);
|
|
2456
2479
|
for (let j = 0; j < runq.size; j++) {
|
|
2457
2480
|
let t = runq.denseThreads[j];
|
|
@@ -2460,7 +2483,7 @@ class Machine {
|
|
|
2460
2483
|
let add = false;
|
|
2461
2484
|
switch (i.op) {
|
|
2462
2485
|
case Inst.MATCH:
|
|
2463
|
-
if (anchor === RE2Flags.ANCHOR_BOTH &&
|
|
2486
|
+
if (anchor === RE2Flags.ANCHOR_BOTH && currentPos !== input.endPos()) break;
|
|
2464
2487
|
matches.add(i.arg); // Record the matched Set ID
|
|
2465
2488
|
break;
|
|
2466
2489
|
case Inst.RUNE:
|
|
@@ -2488,11 +2511,11 @@ class Machine {
|
|
|
2488
2511
|
}
|
|
2489
2512
|
runq.clear();
|
|
2490
2513
|
if (width === 0) break;
|
|
2491
|
-
|
|
2514
|
+
currentPos += width;
|
|
2492
2515
|
rune = rune1;
|
|
2493
2516
|
width = width1;
|
|
2494
2517
|
if (rune !== -1) {
|
|
2495
|
-
r = input.step(
|
|
2518
|
+
r = input.step(currentPos + width);
|
|
2496
2519
|
rune1 = r >> 3;
|
|
2497
2520
|
width1 = r & 7;
|
|
2498
2521
|
}
|
|
@@ -2596,17 +2619,17 @@ class Machine {
|
|
|
2596
2619
|
continue;
|
|
2597
2620
|
}
|
|
2598
2621
|
case Inst.LB_WRITE:
|
|
2599
|
-
this.lbTable[Math.abs(inst.
|
|
2622
|
+
this.lbTable[Math.abs(inst.arg)] = pos;
|
|
2600
2623
|
pc = inst.out;
|
|
2601
2624
|
continue;
|
|
2602
2625
|
case Inst.LB_CHECK:
|
|
2603
|
-
if (inst.
|
|
2626
|
+
if (inst.arg > 0) {
|
|
2604
2627
|
// Positive Lookbehind
|
|
2605
|
-
if (this.lbTable[inst.
|
|
2628
|
+
if (this.lbTable[inst.arg] === pos) {
|
|
2606
2629
|
pc = inst.out; // Flattened tail recursion
|
|
2607
2630
|
continue;
|
|
2608
2631
|
}
|
|
2609
|
-
} else if (this.lbTable[-inst.
|
|
2632
|
+
} else if (this.lbTable[-inst.arg] !== pos) {
|
|
2610
2633
|
// Negative Lookbehind
|
|
2611
2634
|
pc = inst.out; // Flattened tail recursion
|
|
2612
2635
|
continue;
|
|
@@ -4686,7 +4709,7 @@ class Compiler {
|
|
|
4686
4709
|
}
|
|
4687
4710
|
lookBehind(a, lb) {
|
|
4688
4711
|
const id = this.newInst(Inst.LB_WRITE);
|
|
4689
|
-
this.prog.getInst(id.i).
|
|
4712
|
+
this.prog.getInst(id.i).arg = lb;
|
|
4690
4713
|
|
|
4691
4714
|
// Create the prefix wildcard `.*` for the lookbehind automaton
|
|
4692
4715
|
const any = this.rune(Compiler.ANY_RUNE(), 0);
|
|
@@ -4694,7 +4717,7 @@ class Compiler {
|
|
|
4694
4717
|
const lbAutomaton = this.cat(dotStar, a);
|
|
4695
4718
|
this.prog.patch(lbAutomaton.out, id.i);
|
|
4696
4719
|
const checkId = this.newInst(Inst.LB_CHECK);
|
|
4697
|
-
this.prog.getInst(checkId.i).
|
|
4720
|
+
this.prog.getInst(checkId.i).arg = lb;
|
|
4698
4721
|
|
|
4699
4722
|
// Save the starting point of this lookbehind automaton
|
|
4700
4723
|
this.prog.lbStarts.push(lbAutomaton.i);
|
|
@@ -5469,6 +5492,7 @@ class Parser {
|
|
|
5469
5492
|
static ERR_UNEXPECTED_PAREN = 'unexpected )';
|
|
5470
5493
|
static ERR_NESTING_DEPTH = 'expression nests too deeply';
|
|
5471
5494
|
static ERR_LARGE = 'expression too large';
|
|
5495
|
+
static ERR_INVALID_CAPTURE_IN_LOOKBEHIND = 'invalid capture in lookbehind';
|
|
5472
5496
|
|
|
5473
5497
|
// maxHeight is the maximum height of a regexp parse tree.
|
|
5474
5498
|
// It is somewhat arbitrarily chosen, but the idea is to be large enough
|
|
@@ -5872,6 +5896,18 @@ class Parser {
|
|
|
5872
5896
|
}
|
|
5873
5897
|
return x;
|
|
5874
5898
|
}
|
|
5899
|
+
|
|
5900
|
+
// recursively check for captures
|
|
5901
|
+
static hasCapture(re) {
|
|
5902
|
+
if (re === null) return false;
|
|
5903
|
+
if (re.op === Regexp.Op.CAPTURE) return true;
|
|
5904
|
+
if (re.subs) {
|
|
5905
|
+
for (let sub of re.subs) {
|
|
5906
|
+
if (Parser.hasCapture(sub)) return true;
|
|
5907
|
+
}
|
|
5908
|
+
}
|
|
5909
|
+
return false;
|
|
5910
|
+
}
|
|
5875
5911
|
constructor(wholeRegexp, flags = 0) {
|
|
5876
5912
|
this.wholeRegexp = wholeRegexp;
|
|
5877
5913
|
// Flags control the behavior of the parser and record information about
|
|
@@ -6555,7 +6591,7 @@ class Parser {
|
|
|
6555
6591
|
case 1:
|
|
6556
6592
|
// Impossible but handle.
|
|
6557
6593
|
re.op = Regexp.Op.EMPTY_MATCH;
|
|
6558
|
-
re.subs =
|
|
6594
|
+
re.subs = Regexp.emptySubs();
|
|
6559
6595
|
break;
|
|
6560
6596
|
case 2:
|
|
6561
6597
|
{
|
|
@@ -7006,6 +7042,9 @@ class Parser {
|
|
|
7006
7042
|
|
|
7007
7043
|
// Handle lookbehinds
|
|
7008
7044
|
if (re2.lb !== 0) {
|
|
7045
|
+
if (Parser.hasCapture(re1)) {
|
|
7046
|
+
throw new RE2JSSyntaxException(Parser.ERR_INVALID_CAPTURE_IN_LOOKBEHIND, this.wholeRegexp);
|
|
7047
|
+
}
|
|
7009
7048
|
if (re2.lb > 0) {
|
|
7010
7049
|
re2.op = Regexp.Op.PLB;
|
|
7011
7050
|
} else {
|