re2js 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.2.3
5
+ * @version v1.3.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -923,6 +923,17 @@ class RE2JSFlagsException extends RE2JSException {
923
923
  }
924
924
  }
925
925
 
/**
 * Signals that the lazy DFA refused to grow its state cache any further.
 *
 * Raised by {@code DFA.getState} once the number of cached states passes the
 * DFA's {@code stateLimit}. {@code RE2.executeEngine} catches it, discards the
 * DFA cache and re-runs the search on the NFA.
 */
class RE2JSDfaMemoryException extends RE2JSException {
  /**
   * @param {string} message human-readable description of the failure
   */
  constructor(message) {
    super(message);
    // Set explicitly so the name survives regardless of what the base class assigns.
    this.name = 'RE2JSDfaMemoryException';
  }
}
936
+
926
937
  /**
927
938
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
928
939
  *
@@ -2112,6 +2123,160 @@ class Machine {
2112
2123
  }
2113
2124
  }
2114
2125
 
/**
 * A single state of the lazily built DFA: one set of NFA program counters
 * together with the transitions discovered out of it so far.
 */
class DFAState {
  /**
   * @param {string} id comma-joined list of the NFA program counters (e.g. "1,4,7")
   * @param {ArrayLike<number>} nfaStates program counters that make up this state
   * @param {boolean} isMatch true when the state contains a MATCH instruction
   */
  constructor(id, nfaStates, isMatch) {
    this.id = id;
    this.nfaStates = nfaStates;
    this.isMatch = isMatch;
    // One slot per ASCII code point, null until the transition is computed.
    // DFA.step only consults this table for unanchored searches.
    this.nextAscii = Array.from({ length: Unicode.MAX_ASCII + 1 }, () => null);
    // Every other transition, keyed by the numeric key DFA.step derives from the rune.
    this.nextMap = new Map();
  }
}
/**
 * Lazily constructed DFA over a compiled program.
 *
 * States are sets of NFA program counters and are created on demand while
 * input is consumed. The DFA only answers "is there a match?"; it reports
 * {@code null} whenever it cannot decide (an EMPTY_WIDTH instruction was
 * reached) so that the caller can fall back to the NFA.
 */
class DFA {
  /**
   * @param {Object} prog compiled program; this class reads {@code prog.start}
   *   and calls {@code prog.getInst(pc)}
   */
  constructor(prog) {
    this.prog = prog;
    this.stateCache = new Map(); // id -> DFAState
    this.startState = null;
    // True once the closure of prog.start is known to hit EMPTY_WIDTH. The closure
    // depends only on prog, so there is no point recomputing it on every match().
    this.startBailout = false;
    // Maximum number of cached states; guards against state explosion (like RE2 max_mem).
    this.stateLimit = 10000;
  }

  /**
   * Follows empty transitions (ALT, ALT_MATCH, NOP, CAPTURE) from {@code pcs}
   * and collects every program counter reachable without consuming a rune.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {{pcs: Int32Array, isMatch: boolean}|null} the sorted closure and
   *   whether it contains a MATCH instruction, or null when an EMPTY_WIDTH
   *   instruction is reachable (the DFA does not evaluate those)
   */
  computeClosure(pcs) {
    const closure = new Set();
    const stack = [...pcs];
    let isMatch = false;
    while (stack.length > 0) {
      const pc = stack.pop();
      if (closure.has(pc)) continue;
      closure.add(pc);
      const inst = this.prog.getInst(pc);
      switch (inst.op) {
        case Inst.MATCH:
          isMatch = true;
          break;
        case Inst.ALT:
        case Inst.ALT_MATCH:
          stack.push(inst.out, inst.arg);
          break;
        case Inst.NOP:
        case Inst.CAPTURE:
          stack.push(inst.out);
          break;
        case Inst.EMPTY_WIDTH:
          // Empty-width assertions are left to the NFA.
          return null;
        default:
          // Rune-consuming and other instructions stay in the set but add nothing here.
          break;
      }
    }
    // Typed-array sort is numeric, so equal sets always yield the same id.
    return {
      pcs: Int32Array.from(closure).sort(),
      isMatch
    };
  }

  /**
   * Returns the cached DFA state for the closure of {@code pcs}, creating it if needed.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {DFAState|null} the state, or null when the closure requires the NFA
   * @throws {RE2JSDfaMemoryException} when a new state would exceed {@code stateLimit}
   */
  getState(pcs) {
    const closureResult = this.computeClosure(pcs);
    if (!closureResult) return null; // Bailout to NFA required

    const id = closureResult.pcs.join(',');
    const cached = this.stateCache.get(id);
    if (cached !== undefined) {
      return cached;
    }

    // Refuse to create a state once the cache already holds stateLimit entries.
    if (this.stateCache.size >= this.stateLimit) {
      throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
    }
    const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
    this.stateCache.set(id, state);
    return state;
  }

  /**
   * Computes (or fetches from cache) the state reached from {@code state} on {@code charCode}.
   *
   * @param {DFAState} state current state
   * @param {number} charCode rune being consumed
   * @param {number} anchor one of the RE2Flags anchor values
   * @returns {DFAState|null} next state, or null when the NFA is required
   * @throws {RE2JSDfaMemoryException} propagated from {@code getState}
   */
  step(state, charCode, anchor) {
    const unanchored = anchor === RE2Flags.UNANCHORED;
    // Unanchored ASCII transitions live in a flat array; everything else goes through
    // the map, with anchored keys shifted past MAX_RUNE so they never collide.
    const useAsciiTable = unanchored && charCode <= Unicode.MAX_ASCII;
    const key = charCode + (unanchored ? 0 : Unicode.MAX_RUNE + 1);

    if (useAsciiTable) {
      const next = state.nextAscii[charCode];
      if (next !== null) {
        return next;
      }
    } else if (state.nextMap.has(key)) {
      return state.nextMap.get(key);
    }

    const nextPCs = [];
    for (let i = 0; i < state.nfaStates.length; i++) {
      const inst = this.prog.getInst(state.nfaStates[i]);
      if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
        nextPCs.push(inst.out);
      }
    }
    if (unanchored) {
      // An unanchored search may begin a fresh attempt at every position.
      nextPCs.push(this.prog.start);
    }
    const nextState = this.getState(nextPCs);

    // Cache the result
    if (useAsciiTable) {
      state.nextAscii[charCode] = nextState;
    } else {
      state.nextMap.set(key, nextState);
    }
    return nextState;
  }

  /**
   * Runs the lazy DFA over {@code input} starting at {@code pos}.
   *
   * @param {Object} input machine input; this method calls {@code input.endPos()} and
   *   {@code input.step(i)}, whose result packs the rune in the high bits and the
   *   width in the low three bits
   * @param {number} pos position to start from
   * @param {number} anchor one of the RE2Flags anchor values
   * @returns {boolean|null} true/false for a definite answer, null when the caller
   *   must fall back to the NFA
   * @throws {RE2JSDfaMemoryException} when the state cache limit is hit
   */
  match(input, pos, anchor) {
    if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
      return false;
    }
    if (this.startBailout) return null; // Known fallback, skip recomputing the closure
    if (!this.startState) {
      this.startState = this.getState([this.prog.start]);
      if (!this.startState) {
        this.startBailout = true;
        return null; // Fallback to NFA
      }
    }
    const endPos = input.endPos();
    let currentState = this.startState;
    if (currentState.isMatch) {
      // With ANCHOR_BOTH an empty match only counts when nothing is left to consume.
      if (anchor !== RE2Flags.ANCHOR_BOTH || pos === endPos) return true;
    }
    let i = pos;
    while (i < endPos) {
      const r = input.step(i);
      const rune = r >> 3;
      const width = r & 7;
      currentState = this.step(currentState, rune, anchor);

      // A null state means the DFA cannot continue; signal fallback.
      if (currentState === null) return null;
      if (currentState.isMatch) {
        if (anchor !== RE2Flags.ANCHOR_BOTH || i + width === endPos) return true;
      }

      // Dead end: an anchored search cannot restart, so fail early.
      if (currentState.nfaStates.length === 0 && anchor !== RE2Flags.UNANCHORED) {
        return false;
      }
      i += width;
    }
    return false;
  }
}
2279
+
2115
2280
  /**
2116
2281
  * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
2117
2282
  * corresponds to {@code syntax.regexp} in the Go implementation; Go's {@code regexp} is called
@@ -5244,6 +5409,30 @@ class RE2 {
5244
5409
  this.prefixComplete = false; // true if prefix is the entire regexp
5245
5410
  this.prefixRune = 0; // first rune in prefix
5246
5411
  this.pooled = new AtomicReference(); // Cache of machines for running regexp. Forms a Treiber stack.
5412
+ this.dfa = new DFA(prog); // Initialize the Lazy DFA
5413
+ }
5414
+ executeEngine(input, pos, anchor, ncap) {
5415
+ // If the user wants capturing groups (ncap > 0), the DFA mathematically CANNOT do it.
5416
+ // We must use the NFA.
5417
+ if (ncap > 0) {
5418
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5419
+ }
5420
+ try {
5421
+ const dfaResult = this.dfa.match(input, pos, anchor);
5422
+ if (dfaResult !== null) {
5423
+ // DFA succeeded (returned true or false)
5424
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5425
+ }
5426
+ } catch (e) {
5427
+ if (e instanceof RE2JSDfaMemoryException) {
5428
+ this.dfa = new DFA(this.prog); // flush cache
5429
+ } else {
5430
+ throw e;
5431
+ }
5432
+ }
5433
+
5434
+ // Fallback to NFA
5435
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5247
5436
  }
5248
5437
 
5249
5438
  /**
@@ -5328,10 +5517,10 @@ class RE2 {
5328
5517
  return this.expr;
5329
5518
  }
5330
5519
 
5331
- // doExecute() finds the leftmost match in the input and returns
5520
+ // doExecuteNFA() finds the leftmost match in the input and returns
5332
5521
  // the position of its subexpressions.
5333
5522
  // Derived from exec.go.
5334
- doExecute(input, pos, anchor, ncap) {
5523
+ doExecuteNFA(input, pos, anchor, ncap) {
5335
5524
  let m = this.get();
5336
5525
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
5337
5526
  // the bottom of the stack (i.e., next == null).
@@ -5349,7 +5538,7 @@ class RE2 {
5349
5538
  return cap;
5350
5539
  }
5351
5540
  match(s) {
5352
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5541
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5353
5542
  }
5354
5543
 
5355
5544
  /**
@@ -5377,7 +5566,7 @@ class RE2 {
5377
5566
  return [false, null];
5378
5567
  }
5379
5568
  const machineInput = input.isUTF16Encoding() ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) : MachineInput.fromUTF8(input.asBytes(), 0, end);
5380
- const groupMatch = this.doExecute(machineInput, start, anchor, 2 * ngroup);
5569
+ const groupMatch = this.executeEngine(machineInput, start, anchor, 2 * ngroup);
5381
5570
  if (groupMatch === null) {
5382
5571
  return [false, null];
5383
5572
  }
@@ -5389,7 +5578,7 @@ class RE2 {
5389
5578
  */
5390
5579
  // This is visible for testing.
5391
5580
  matchUTF8(b) {
5392
- return this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5581
+ return this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5393
5582
  }
5394
5583
 
5395
5584
  /**
@@ -5426,7 +5615,7 @@ class RE2 {
5426
5615
  const input = MachineInput.fromUTF16(src);
5427
5616
  let numReplaces = 0;
5428
5617
  while (searchPos <= src.length) {
5429
- const a = this.doExecute(input, searchPos, RE2Flags.UNANCHORED, 2);
5618
+ const a = this.executeEngine(input, searchPos, RE2Flags.UNANCHORED, 2);
5430
5619
  if (a === null || a.length === 0) {
5431
5620
  break;
5432
5621
  }
@@ -5484,7 +5673,7 @@ class RE2 {
5484
5673
  let i = 0;
5485
5674
  let prevMatchEnd = -1;
5486
5675
  while (i < n && pos <= end) {
5487
- const matches = this.doExecute(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5676
+ const matches = this.executeEngine(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5488
5677
  if (matches === null || matches.length === 0) {
5489
5678
  break;
5490
5679
  }
@@ -5555,7 +5744,7 @@ class RE2 {
5555
5744
  */
5556
5745
  // This is visible for testing.
5557
5746
  findUTF8(b) {
5558
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5747
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5559
5748
  if (a === null) {
5560
5749
  return null;
5561
5750
  }
@@ -5570,7 +5759,7 @@ class RE2 {
5570
5759
  */
5571
5760
  // This is visible for testing.
5572
5761
  findUTF8Index(b) {
5573
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5762
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5574
5763
  if (a === null) {
5575
5764
  return null;
5576
5765
  }
@@ -5587,7 +5776,7 @@ class RE2 {
5587
5776
  */
5588
5777
  // This is visible for testing.
5589
5778
  find(s) {
5590
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5779
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5591
5780
  if (a === null) {
5592
5781
  return '';
5593
5782
  }
@@ -5603,7 +5792,7 @@ class RE2 {
5603
5792
  */
5604
5793
  // This is visible for testing.
5605
5794
  findIndex(s) {
5606
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5795
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5607
5796
  }
5608
5797
 
5609
5798
  /**
@@ -5615,7 +5804,7 @@ class RE2 {
5615
5804
  */
5616
5805
  // This is visible for testing.
5617
5806
  findUTF8Submatch(b) {
5618
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5807
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5619
5808
  if (a === null) {
5620
5809
  return null;
5621
5810
  }
@@ -5637,7 +5826,7 @@ class RE2 {
5637
5826
  */
5638
5827
  // This is visible for testing.
5639
5828
  findUTF8SubmatchIndex(b) {
5640
- return this.pad(this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5829
+ return this.pad(this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5641
5830
  }
5642
5831
 
5643
5832
  /**
@@ -5649,7 +5838,7 @@ class RE2 {
5649
5838
  */
5650
5839
  // This is visible for testing.
5651
5840
  findSubmatch(s) {
5652
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5841
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5653
5842
  if (a === null) {
5654
5843
  return null;
5655
5844
  }
@@ -5671,7 +5860,7 @@ class RE2 {
5671
5860
  */
5672
5861
  // This is visible for testing.
5673
5862
  findSubmatchIndex(s) {
5674
- return this.pad(this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5863
+ return this.pad(this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5675
5864
  }
5676
5865
 
5677
5866
  /**
@@ -6244,6 +6433,7 @@ class RE2JS {
6244
6433
  exports.Matcher = Matcher;
6245
6434
  exports.RE2JS = RE2JS;
6246
6435
  exports.RE2JSCompileException = RE2JSCompileException;
6436
+ exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
6247
6437
  exports.RE2JSException = RE2JSException;
6248
6438
  exports.RE2JSFlagsException = RE2JSFlagsException;
6249
6439
  exports.RE2JSGroupException = RE2JSGroupException;