re2js 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.2.3
5
+ * @version v1.3.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -927,6 +927,17 @@
927
927
  }
928
928
  }
929
929
 
930
/**
 * Raised by the lazy DFA when its state cache has reached its configured limit.
 *
 * It extends {@link RE2JSException}, so handlers that catch the base type also
 * catch this one. The only visible thrower is `DFA#getState`.
 */
class RE2JSDfaMemoryException extends RE2JSException {
  /**
   * @param {string} message human-readable description of the failure
   */
  constructor(message) {
    super(message);
    // Set after super() so this value is the one left on the instance.
    this.name = 'RE2JSDfaMemoryException';
  }
}
940
+
930
941
  /**
931
942
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
932
943
  *
@@ -2116,6 +2127,160 @@
2116
2127
  }
2117
2128
  }
2118
2129
 
2130
/**
 * One node of the lazily-built DFA: a set of NFA program counters plus the
 * outgoing transitions that have been computed so far.
 */
class DFAState {
  /**
   * @param {string} id cache key for this state: the NFA pc list joined with commas (e.g. "1,4,7")
   * @param {ArrayLike<number>} nfaStates instruction pcs that make up this state
   * @param {boolean} isMatch true when the pc set contains a MATCH instruction
   */
  constructor(id, nfaStates, isMatch) {
    this.id = id;
    this.nfaStates = nfaStates;
    this.isMatch = isMatch;
    // Direct-indexed transition table for ASCII input in unanchored mode;
    // a null slot means "not computed yet".
    this.nextAscii = Array.from({ length: Unicode.MAX_ASCII + 1 }, () => null);
    // Transitions for every other (rune, anchor) combination, keyed by number.
    this.nextMap = new Map();
  }
}
2139
/**
 * Lazily-constructed DFA over a compiled program.
 *
 * States are built on demand from sets of NFA program counters and memoised in
 * `stateCache`; transitions are memoised on each state. The DFA only answers
 * "does it match?" - it tracks no capture positions. Whenever it meets an
 * EMPTY_WIDTH instruction it gives up and returns `null` so the caller can
 * fall back to another engine.
 */
class DFA {
  /**
   * @param {object} prog compiled program; must expose `start` and `getInst(pc)`
   */
  constructor(prog) {
    this.prog = prog;
    this.stateCache = new Map(); // id -> DFAState
    this.startState = null; // built on first match()
    this.stateLimit = 10000; // maximum number of cached states
  }

  /**
   * Expands a list of pcs with everything reachable without consuming input
   * (ALT, ALT_MATCH, NOP and CAPTURE edges).
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {{pcs: Int32Array, isMatch: boolean}|null} the numerically sorted
   *   closure and whether it contains MATCH, or null if an EMPTY_WIDTH
   *   instruction was reached (the DFA does not evaluate those)
   */
  computeClosure(pcs) {
    const closure = new Set();
    const stack = [...pcs];
    let isMatch = false;
    while (stack.length > 0) {
      const pc = stack.pop();
      if (closure.has(pc)) continue;
      closure.add(pc);
      const inst = this.prog.getInst(pc);
      switch (inst.op) {
        case Inst.MATCH:
          isMatch = true;
          break;
        case Inst.ALT:
        case Inst.ALT_MATCH:
          stack.push(inst.out, inst.arg);
          break;
        case Inst.NOP:
        case Inst.CAPTURE:
          stack.push(inst.out);
          break;
        case Inst.EMPTY_WIDTH:
          // Empty-width assertions are not modelled here; signal a bailout.
          return null;
        default:
          // Rune-consuming (and any other) instructions stay in the set as-is.
          break;
      }
    }
    // Typed-array sort is numeric, so equal sets always produce the same id.
    return {
      pcs: Int32Array.from(closure).sort(),
      isMatch,
    };
  }

  /**
   * Returns the cached DFA state for the closure of `pcs`, creating it if needed.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {DFAState|null} the state, or null when the closure bailed out
   * @throws {RE2JSDfaMemoryException} when a new state is needed but the cache
   *   already holds `stateLimit` states
   */
  getState(pcs) {
    const closureResult = this.computeClosure(pcs);
    if (!closureResult) return null;

    const id = closureResult.pcs.join(',');
    const cached = this.stateCache.get(id);
    if (cached !== undefined) {
      return cached;
    }

    // `>=` keeps the cache at no more than stateLimit entries; a strict `>`
    // would admit one extra state before tripping.
    if (this.stateCache.size >= this.stateLimit) {
      throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
    }
    const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
    this.stateCache.set(id, state);
    return state;
  }

  /**
   * Computes (or fetches from cache) the state reached from `state` on `charCode`.
   *
   * @param {DFAState} state current state
   * @param {number} charCode rune being consumed
   * @param {number} anchor one of the RE2Flags anchor values
   * @returns {DFAState|null} next state, or null when the closure bailed out
   * @throws {RE2JSDfaMemoryException} propagated from getState()
   */
  step(state, charCode, anchor) {
    const unanchored = anchor === RE2Flags.UNANCHORED;
    // ASCII runes in unanchored mode use the flat per-state table; everything
    // else goes through the Map, with anchored keys offset past MAX_RUNE so
    // they never collide with unanchored ones.
    const useAsciiTable = unanchored && charCode <= Unicode.MAX_ASCII;
    const key = unanchored ? charCode : charCode + Unicode.MAX_RUNE + 1;

    if (useAsciiTable) {
      const next = state.nextAscii[charCode];
      if (next !== null) {
        return next;
      }
    } else {
      // A stored null is a remembered bailout and is returned as such.
      const next = state.nextMap.get(key);
      if (next !== undefined) {
        return next;
      }
    }

    const nextPCs = [];
    for (let i = 0; i < state.nfaStates.length; i++) {
      const inst = this.prog.getInst(state.nfaStates[i]);
      if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
        nextPCs.push(inst.out);
      }
    }
    if (unanchored) {
      // Re-seed the start pc so a match may begin at any later position.
      nextPCs.push(this.prog.start);
    }
    const nextState = this.getState(nextPCs);

    if (useAsciiTable) {
      state.nextAscii[charCode] = nextState;
    } else {
      state.nextMap.set(key, nextState);
    }
    return nextState;
  }

  /**
   * Runs the DFA over `input` starting at `pos`.
   *
   * @param {object} input must expose `endPos()` and `step(i)`; `step` returns
   *   `(rune << 3) | width`
   * @param {number} pos starting offset
   * @param {number} anchor one of the RE2Flags anchor values
   * @returns {boolean|null} true/false for match/no match, or null when the
   *   DFA cannot decide and the caller should use another engine
   * @throws {RE2JSDfaMemoryException} propagated from getState()
   */
  match(input, pos, anchor) {
    const anchoredAtStart =
      anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH;
    if (anchoredAtStart && pos !== 0) {
      return false;
    }
    if (!this.startState) {
      this.startState = this.getState([this.prog.start]);
      if (!this.startState) return null;
    }

    const endPos = input.endPos();
    const mustReachEnd = anchor === RE2Flags.ANCHOR_BOTH;
    let currentState = this.startState;
    if (currentState.isMatch && (!mustReachEnd || pos === endPos)) {
      return true;
    }

    let i = pos;
    while (i < endPos) {
      const r = input.step(i);
      const rune = r >> 3;
      const width = r & 7;
      currentState = this.step(currentState, rune, anchor);

      if (currentState === null) return null;
      if (currentState.isMatch && (!mustReachEnd || i + width === endPos)) {
        return true;
      }

      // An empty pc set can never match again; only anchored runs get here
      // because unanchored steps always re-add the start pc.
      if (anchor !== RE2Flags.UNANCHORED && currentState.nfaStates.length === 0) {
        return false;
      }
      i += width;
    }
    return false;
  }
}
2283
+
2119
2284
  /**
2120
2285
  * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
2121
2286
  * corresponds to {@code syntax.regexp} in the Go implementation; Go's {@code regexp} is called
@@ -5248,6 +5413,30 @@
5248
5413
  this.prefixComplete = false; // true if prefix is the entire regexp
5249
5414
  this.prefixRune = 0; // first rune in prefix
5250
5415
  this.pooled = new AtomicReference(); // Cache of machines for running regexp. Forms a Treiber stack.
5416
+ this.dfa = new DFA(prog); // Initialize the Lazy DFA
5417
+ }
5418
+ executeEngine(input, pos, anchor, ncap) {
5419
+ // If the user wants capturing groups (ncap > 0), the DFA mathematically CANNOT do it.
5420
+ // We must use the NFA.
5421
+ if (ncap > 0) {
5422
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5423
+ }
5424
+ try {
5425
+ const dfaResult = this.dfa.match(input, pos, anchor);
5426
+ if (dfaResult !== null) {
5427
+ // DFA succeeded (returned true or false)
5428
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5429
+ }
5430
+ } catch (e) {
5431
+ if (e instanceof RE2JSDfaMemoryException) {
5432
+ this.dfa = new DFA(this.prog); // flush cache
5433
+ } else {
5434
+ throw e;
5435
+ }
5436
+ }
5437
+
5438
+ // Fallback to NFA
5439
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5251
5440
  }
5252
5441
 
5253
5442
  /**
@@ -5332,10 +5521,10 @@
5332
5521
  return this.expr;
5333
5522
  }
5334
5523
 
5335
- // doExecute() finds the leftmost match in the input and returns
5524
+ // doExecuteNFA() finds the leftmost match in the input and returns
5336
5525
  // the position of its subexpressions.
5337
5526
  // Derived from exec.go.
5338
- doExecute(input, pos, anchor, ncap) {
5527
+ doExecuteNFA(input, pos, anchor, ncap) {
5339
5528
  let m = this.get();
5340
5529
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
5341
5530
  // the bottom of the stack (i.e., next == null).
@@ -5353,7 +5542,7 @@
5353
5542
  return cap;
5354
5543
  }
5355
5544
  match(s) {
5356
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5545
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5357
5546
  }
5358
5547
 
5359
5548
  /**
@@ -5381,7 +5570,7 @@
5381
5570
  return [false, null];
5382
5571
  }
5383
5572
  const machineInput = input.isUTF16Encoding() ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) : MachineInput.fromUTF8(input.asBytes(), 0, end);
5384
- const groupMatch = this.doExecute(machineInput, start, anchor, 2 * ngroup);
5573
+ const groupMatch = this.executeEngine(machineInput, start, anchor, 2 * ngroup);
5385
5574
  if (groupMatch === null) {
5386
5575
  return [false, null];
5387
5576
  }
@@ -5393,7 +5582,7 @@
5393
5582
  */
5394
5583
  // This is visible for testing.
5395
5584
  matchUTF8(b) {
5396
- return this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5585
+ return this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5397
5586
  }
5398
5587
 
5399
5588
  /**
@@ -5430,7 +5619,7 @@
5430
5619
  const input = MachineInput.fromUTF16(src);
5431
5620
  let numReplaces = 0;
5432
5621
  while (searchPos <= src.length) {
5433
- const a = this.doExecute(input, searchPos, RE2Flags.UNANCHORED, 2);
5622
+ const a = this.executeEngine(input, searchPos, RE2Flags.UNANCHORED, 2);
5434
5623
  if (a === null || a.length === 0) {
5435
5624
  break;
5436
5625
  }
@@ -5488,7 +5677,7 @@
5488
5677
  let i = 0;
5489
5678
  let prevMatchEnd = -1;
5490
5679
  while (i < n && pos <= end) {
5491
- const matches = this.doExecute(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5680
+ const matches = this.executeEngine(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5492
5681
  if (matches === null || matches.length === 0) {
5493
5682
  break;
5494
5683
  }
@@ -5559,7 +5748,7 @@
5559
5748
  */
5560
5749
  // This is visible for testing.
5561
5750
  findUTF8(b) {
5562
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5751
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5563
5752
  if (a === null) {
5564
5753
  return null;
5565
5754
  }
@@ -5574,7 +5763,7 @@
5574
5763
  */
5575
5764
  // This is visible for testing.
5576
5765
  findUTF8Index(b) {
5577
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5766
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5578
5767
  if (a === null) {
5579
5768
  return null;
5580
5769
  }
@@ -5591,7 +5780,7 @@
5591
5780
  */
5592
5781
  // This is visible for testing.
5593
5782
  find(s) {
5594
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5783
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5595
5784
  if (a === null) {
5596
5785
  return '';
5597
5786
  }
@@ -5607,7 +5796,7 @@
5607
5796
  */
5608
5797
  // This is visible for testing.
5609
5798
  findIndex(s) {
5610
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5799
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5611
5800
  }
5612
5801
 
5613
5802
  /**
@@ -5619,7 +5808,7 @@
5619
5808
  */
5620
5809
  // This is visible for testing.
5621
5810
  findUTF8Submatch(b) {
5622
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5811
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5623
5812
  if (a === null) {
5624
5813
  return null;
5625
5814
  }
@@ -5641,7 +5830,7 @@
5641
5830
  */
5642
5831
  // This is visible for testing.
5643
5832
  findUTF8SubmatchIndex(b) {
5644
- return this.pad(this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5833
+ return this.pad(this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5645
5834
  }
5646
5835
 
5647
5836
  /**
@@ -5653,7 +5842,7 @@
5653
5842
  */
5654
5843
  // This is visible for testing.
5655
5844
  findSubmatch(s) {
5656
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5845
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5657
5846
  if (a === null) {
5658
5847
  return null;
5659
5848
  }
@@ -5675,7 +5864,7 @@
5675
5864
  */
5676
5865
  // This is visible for testing.
5677
5866
  findSubmatchIndex(s) {
5678
- return this.pad(this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5867
+ return this.pad(this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5679
5868
  }
5680
5869
 
5681
5870
  /**
@@ -6248,6 +6437,7 @@
6248
6437
  exports.Matcher = Matcher;
6249
6438
  exports.RE2JS = RE2JS;
6250
6439
  exports.RE2JSCompileException = RE2JSCompileException;
6440
+ exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
6251
6441
  exports.RE2JSException = RE2JSException;
6252
6442
  exports.RE2JSFlagsException = RE2JSFlagsException;
6253
6443
  exports.RE2JSGroupException = RE2JSGroupException;