re2js 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.2.2
5
+ * @version v1.3.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -842,6 +842,7 @@ class MatcherInput {
842
842
  }
843
843
 
844
844
  class RE2JSException extends Error {
845
+ /** @param {string} message */
845
846
  constructor(message) {
846
847
  super(message);
847
848
  this.name = 'RE2JSException';
@@ -852,6 +853,10 @@ class RE2JSException extends Error {
852
853
  * An exception thrown by the parser if the pattern was invalid.
853
854
  */
854
855
  class RE2JSSyntaxException extends RE2JSException {
856
+ /**
857
+ * @param {string} error
858
+ * @param {string|null} [input=null]
859
+ */
855
860
  constructor(error, input = null) {
856
861
  let message = `error parsing regexp: ${error}`;
857
862
  if (input) {
@@ -860,12 +865,15 @@ class RE2JSSyntaxException extends RE2JSException {
860
865
  super(message);
861
866
  this.name = 'RE2JSSyntaxException';
862
867
  this.message = message;
868
+ /** @type {string} */
863
869
  this.error = error;
870
+ /** @type {string|null} */
864
871
  this.input = input;
865
872
  }
866
873
 
867
874
  /**
868
875
  * Retrieves the description of the error.
876
+ * @returns {string}
869
877
  */
870
878
  getDescription() {
871
879
  return this.error;
@@ -873,6 +881,7 @@ class RE2JSSyntaxException extends RE2JSException {
873
881
 
874
882
  /**
875
883
  * Retrieves the erroneous regular-expression pattern.
884
+ * @returns {string|null}
876
885
  */
877
886
  getPattern() {
878
887
  return this.input;
@@ -883,6 +892,7 @@ class RE2JSSyntaxException extends RE2JSException {
883
892
  * An exception thrown by the compiler
884
893
  */
885
894
  class RE2JSCompileException extends RE2JSException {
895
+ /** @param {string} message */
886
896
  constructor(message) {
887
897
  super(message);
888
898
  this.name = 'RE2JSCompileException';
@@ -893,6 +903,7 @@ class RE2JSCompileException extends RE2JSException {
893
903
  * An exception thrown by using groups
894
904
  */
895
905
  class RE2JSGroupException extends RE2JSException {
906
+ /** @param {string} message */
896
907
  constructor(message) {
897
908
  super(message);
898
909
  this.name = 'RE2JSGroupException';
@@ -903,12 +914,24 @@ class RE2JSGroupException extends RE2JSException {
903
914
  * An exception thrown by flags
904
915
  */
905
916
  class RE2JSFlagsException extends RE2JSException {
917
+ /** @param {string} message */
906
918
  constructor(message) {
907
919
  super(message);
908
920
  this.name = 'RE2JSFlagsException';
909
921
  }
910
922
  }
911
923
 
924
/**
 * Signals that the lazy DFA refused to create another state because its
 * state cache has grown past the configured limit.
 *
 * It is a regular {@link RE2JSException}; only the `name` differs, so it can
 * be told apart with `instanceof` or by inspecting `name`.
 */
class RE2JSDfaMemoryException extends RE2JSException {
  /**
   * @param {string} message human-readable description of the failure
   */
  constructor(message) {
    super(message);
    // Override the inherited name so stack traces identify this subclass.
    this.name = 'RE2JSDfaMemoryException';
  }
}
934
+
912
935
  /**
913
936
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
914
937
  *
@@ -964,10 +987,14 @@ class Matcher {
964
987
  this.patternInput = pattern;
965
988
  const re2 = this.patternInput.re2();
966
989
  // The number of submatches (groups) in the pattern.
990
+ /** @type {number} */
967
991
  this.patternGroupCount = re2.numberOfCapturingGroups();
968
992
  // The group indexes, in [start, end) pairs. Zeroth pair is overall match.
993
+ /** @type {number[]} */
969
994
  this.groups = [];
995
+ /** @type {Record<string, number>} */
970
996
  this.namedGroups = re2.namedGroups;
997
+ /** @type {number} */
971
998
  this.numberOfInstructions = re2.numberOfInstructions();
972
999
  if (input instanceof MatcherInputBase) {
973
1000
  this.resetMatcherInput(input);
@@ -993,8 +1020,10 @@ class Matcher {
993
1020
  */
994
1021
  reset() {
995
1022
  // The input length in UTF16 codes.
1023
+ /** @type {number} */
996
1024
  this.matcherInputLength = this.matcherInput.length();
997
1025
  // The append position: where the next append should start.
1026
+ /** @type {number} */
998
1027
  this.appendPos = 0;
999
1028
  // Is there a current match?
1000
1029
  this.hasMatch = false;
@@ -1008,6 +1037,7 @@ class Matcher {
1008
1037
 
1009
1038
  /**
1010
1039
  * Resets the {@code Matcher} and changes the input.
1040
+ * @param {Utf8MatcherInput|Utf16MatcherInput} input
1011
1041
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1012
1042
  */
1013
1043
  resetMatcherInput(input) {
@@ -1150,7 +1180,7 @@ class Matcher {
1150
1180
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1151
1181
  * is a match, {@code find} sets the match state to describe it.
1152
1182
  *
1153
- * @param {string|number} [start=null] the input position where the search begins
1183
+ * @param {number} [start=null] the input position where the search begins
1154
1184
  * @returns {boolean} if it finds a match
1155
1185
  * @throws IndexOutOfBoundsException if start is not a valid input position
1156
1186
  */
@@ -2091,6 +2121,160 @@ class Machine {
2091
2121
  }
2092
2122
  }
2093
2123
 
2124
/**
 * A single state of the lazily built DFA.
 *
 * A state is identified by the set of NFA program counters it stands for and
 * carries two transition caches that are filled in on demand.
 */
class DFAState {
  /**
   * @param {string} id stringified NFA state list (e.g. "1,4,7")
   * @param {ArrayLike<number>} nfaStates instruction PCs that make up this state
   * @param {boolean} isMatch whether this state is accepting
   */
  constructor(id, nfaStates, isMatch) {
    const asciiSlots = Unicode.MAX_ASCII + 1;

    this.id = id;
    this.nfaStates = nfaStates;
    this.isMatch = isMatch;
    // Flat table indexed by code point for the unanchored ASCII fast path;
    // a null slot means "transition not computed yet".
    this.nextAscii = Array.from({ length: asciiSlots }, () => null);
    // Transition cache for everything that does not go through the table.
    this.nextMap = new Map();
  }
}
2133
/**
 * Lazily constructed DFA used as a fast path for capture-free matching.
 *
 * Every DFA state is the epsilon-closure of a set of NFA program counters.
 * States and transitions are created on demand while input is scanned and are
 * memoised afterwards. Programs that reach an EMPTY_WIDTH instruction are not
 * handled here: the affected call yields null so the caller can use the NFA.
 */
class DFA {
  /**
   * @param {*} prog compiled program; read through `getInst(pc)` and `start`
   */
  constructor(prog) {
    this.prog = prog;
    this.stateCache = new Map(); // id -> DFAState
    this.startState = null; // built on first use by match()
    // Maximum number of cached states; guards against state explosion.
    this.stateLimit = 10000;
  }

  /**
   * Follows epsilon (non-consuming) transitions from the given PCs.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {{pcs: Int32Array, isMatch: boolean}|null} the numerically sorted
   *   closure and whether it contains a MATCH instruction, or null when an
   *   EMPTY_WIDTH instruction is reached (bail out to the NFA)
   */
  computeClosure(pcs) {
    const closure = new Set();
    const stack = [...pcs];
    let isMatch = false;
    while (stack.length > 0) {
      const pc = stack.pop();
      if (closure.has(pc)) continue;
      closure.add(pc);
      const inst = this.prog.getInst(pc);
      switch (inst.op) {
        case Inst.MATCH:
          isMatch = true;
          break;
        case Inst.ALT:
        case Inst.ALT_MATCH:
          stack.push(inst.out);
          stack.push(inst.arg);
          break;
        case Inst.NOP:
        case Inst.CAPTURE:
          stack.push(inst.out);
          break;
        // Empty-width assertions are not modelled by this DFA.
        case Inst.EMPTY_WIDTH:
          return null;
      }
    }
    // Typed-array sort is numeric, which gives every PC set one canonical id.
    const sortedPCs = Int32Array.from(closure).sort();
    return { pcs: sortedPCs, isMatch };
  }

  /**
   * Returns the cached DFA state for the closure of `pcs`, creating it if needed.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {DFAState|null} the state, or null when the closure bailed out
   * @throws {RE2JSDfaMemoryException} if a new state would exceed `stateLimit`
   */
  getState(pcs) {
    const closureResult = this.computeClosure(pcs);
    if (!closureResult) return null; // Bailout to NFA required

    const id = closureResult.pcs.join(',');
    const cached = this.stateCache.get(id);
    if (cached !== undefined) {
      return cached;
    }

    // Refuse to grow once the cache already holds `stateLimit` states
    // (`>` here would allow one state more than the limit).
    if (this.stateCache.size >= this.stateLimit) {
      throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
    }
    const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
    this.stateCache.set(id, state);
    return state;
  }

  /**
   * Computes (or fetches from cache) the state reached from `state` on `charCode`.
   *
   * @param {DFAState} state current state
   * @param {number} charCode code point being consumed
   * @param {number} anchor one of the RE2Flags anchor constants
   * @returns {DFAState|null} next state, or null when the NFA must take over
   * @throws {RE2JSDfaMemoryException} propagated from getState()
   */
  step(state, charCode, anchor) {
    const unanchored = anchor === RE2Flags.UNANCHORED;
    // The flat table only covers unanchored ASCII transitions. The lower bound
    // keeps a negative value from indexing outside the table.
    const useAsciiTable = unanchored && charCode >= 0 && charCode <= Unicode.MAX_ASCII;
    // Anchored transitions are shifted past the rune range so they never
    // collide with unanchored transitions for the same code point.
    const key = charCode + (unanchored ? 0 : Unicode.MAX_RUNE + 1);

    if (useAsciiTable) {
      const next = state.nextAscii[charCode];
      if (next !== null) {
        return next;
      }
    } else if (state.nextMap.has(key)) {
      return state.nextMap.get(key);
    }

    const nextPCs = [];
    for (let i = 0; i < state.nfaStates.length; i++) {
      const inst = this.prog.getInst(state.nfaStates[i]);
      if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
        nextPCs.push(inst.out);
      }
    }
    // An unanchored search may begin a fresh attempt at every position.
    if (unanchored) {
      nextPCs.push(this.prog.start);
    }
    const nextState = this.getState(nextPCs);

    // Cache the result
    if (useAsciiTable) {
      state.nextAscii[charCode] = nextState;
    } else {
      state.nextMap.set(key, nextState);
    }
    return nextState;
  }

  /**
   * Runs the lazy DFA over `input` starting at `pos`.
   *
   * @param {*} input machine input exposing `endPos()` and `step(i)`
   * @param {number} pos position at which scanning starts
   * @param {number} anchor one of the RE2Flags anchor constants
   * @returns {boolean|null} true on match, false on no match, null when the
   *   DFA cannot decide and the caller should fall back to the NFA
   * @throws {RE2JSDfaMemoryException} if the state cache limit is hit
   */
  match(input, pos, anchor) {
    const anchoredAtStart = anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH;
    if (anchoredAtStart && pos !== 0) {
      return false;
    }
    if (!this.startState) {
      this.startState = this.getState([this.prog.start]);
      if (!this.startState) return null; // Fallback to NFA
    }

    const endPos = input.endPos();
    // With ANCHOR_BOTH an accepting state only counts at the end of input.
    const mustReachEnd = anchor === RE2Flags.ANCHOR_BOTH;
    let currentState = this.startState;
    if (currentState.isMatch && (!mustReachEnd || pos === endPos)) {
      return true;
    }

    let i = pos;
    while (i < endPos) {
      // step() packs the code point in the high bits and its width in the low 3 bits.
      const r = input.step(i);
      const rune = r >> 3;
      const width = r & 7;
      currentState = this.step(currentState, rune, anchor);

      // Bailout: let the caller fall back to the NFA.
      if (currentState === null) return null;
      if (currentState.isMatch && (!mustReachEnd || i + width === endPos)) {
        return true;
      }

      // An empty state is a dead end; an anchored search cannot recover from it.
      if (currentState.nfaStates.length === 0 && anchor !== RE2Flags.UNANCHORED) {
        return false;
      }
      i += width;
    }
    return false;
  }
}
2277
+
2094
2278
  /**
2095
2279
  * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
2096
2280
  * corresponds to {@code syntax.regexp} in the Go implementation; Go's {@code regexp} is called
@@ -5223,6 +5407,30 @@ class RE2 {
5223
5407
  this.prefixComplete = false; // true if prefix is the entire regexp
5224
5408
  this.prefixRune = 0; // first rune in prefix
5225
5409
  this.pooled = new AtomicReference(); // Cache of machines for running regexp. Forms a Treiber stack.
5410
+ this.dfa = new DFA(prog); // Initialize the Lazy DFA
5411
+ }
5412
+ executeEngine(input, pos, anchor, ncap) {
5413
+ // If the user wants capturing groups (ncap > 0), the DFA mathematically CANNOT do it.
5414
+ // We must use the NFA.
5415
+ if (ncap > 0) {
5416
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5417
+ }
5418
+ try {
5419
+ const dfaResult = this.dfa.match(input, pos, anchor);
5420
+ if (dfaResult !== null) {
5421
+ // DFA succeeded (returned true or false)
5422
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5423
+ }
5424
+ } catch (e) {
5425
+ if (e instanceof RE2JSDfaMemoryException) {
5426
+ this.dfa = new DFA(this.prog); // flush cache
5427
+ } else {
5428
+ throw e;
5429
+ }
5430
+ }
5431
+
5432
+ // Fallback to NFA
5433
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5226
5434
  }
5227
5435
 
5228
5436
  /**
@@ -5307,10 +5515,10 @@ class RE2 {
5307
5515
  return this.expr;
5308
5516
  }
5309
5517
 
5310
- // doExecute() finds the leftmost match in the input and returns
5518
+ // doExecuteNFA() finds the leftmost match in the input and returns
5311
5519
  // the position of its subexpressions.
5312
5520
  // Derived from exec.go.
5313
- doExecute(input, pos, anchor, ncap) {
5521
+ doExecuteNFA(input, pos, anchor, ncap) {
5314
5522
  let m = this.get();
5315
5523
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
5316
5524
  // the bottom of the stack (i.e., next == null).
@@ -5328,7 +5536,7 @@ class RE2 {
5328
5536
  return cap;
5329
5537
  }
5330
5538
  match(s) {
5331
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5539
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5332
5540
  }
5333
5541
 
5334
5542
  /**
@@ -5356,7 +5564,7 @@ class RE2 {
5356
5564
  return [false, null];
5357
5565
  }
5358
5566
  const machineInput = input.isUTF16Encoding() ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) : MachineInput.fromUTF8(input.asBytes(), 0, end);
5359
- const groupMatch = this.doExecute(machineInput, start, anchor, 2 * ngroup);
5567
+ const groupMatch = this.executeEngine(machineInput, start, anchor, 2 * ngroup);
5360
5568
  if (groupMatch === null) {
5361
5569
  return [false, null];
5362
5570
  }
@@ -5368,7 +5576,7 @@ class RE2 {
5368
5576
  */
5369
5577
  // This is visible for testing.
5370
5578
  matchUTF8(b) {
5371
- return this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5579
+ return this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5372
5580
  }
5373
5581
 
5374
5582
  /**
@@ -5405,7 +5613,7 @@ class RE2 {
5405
5613
  const input = MachineInput.fromUTF16(src);
5406
5614
  let numReplaces = 0;
5407
5615
  while (searchPos <= src.length) {
5408
- const a = this.doExecute(input, searchPos, RE2Flags.UNANCHORED, 2);
5616
+ const a = this.executeEngine(input, searchPos, RE2Flags.UNANCHORED, 2);
5409
5617
  if (a === null || a.length === 0) {
5410
5618
  break;
5411
5619
  }
@@ -5463,7 +5671,7 @@ class RE2 {
5463
5671
  let i = 0;
5464
5672
  let prevMatchEnd = -1;
5465
5673
  while (i < n && pos <= end) {
5466
- const matches = this.doExecute(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5674
+ const matches = this.executeEngine(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5467
5675
  if (matches === null || matches.length === 0) {
5468
5676
  break;
5469
5677
  }
@@ -5534,7 +5742,7 @@ class RE2 {
5534
5742
  */
5535
5743
  // This is visible for testing.
5536
5744
  findUTF8(b) {
5537
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5745
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5538
5746
  if (a === null) {
5539
5747
  return null;
5540
5748
  }
@@ -5549,7 +5757,7 @@ class RE2 {
5549
5757
  */
5550
5758
  // This is visible for testing.
5551
5759
  findUTF8Index(b) {
5552
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5760
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5553
5761
  if (a === null) {
5554
5762
  return null;
5555
5763
  }
@@ -5566,7 +5774,7 @@ class RE2 {
5566
5774
  */
5567
5775
  // This is visible for testing.
5568
5776
  find(s) {
5569
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5777
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5570
5778
  if (a === null) {
5571
5779
  return '';
5572
5780
  }
@@ -5582,7 +5790,7 @@ class RE2 {
5582
5790
  */
5583
5791
  // This is visible for testing.
5584
5792
  findIndex(s) {
5585
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5793
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5586
5794
  }
5587
5795
 
5588
5796
  /**
@@ -5594,7 +5802,7 @@ class RE2 {
5594
5802
  */
5595
5803
  // This is visible for testing.
5596
5804
  findUTF8Submatch(b) {
5597
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5805
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5598
5806
  if (a === null) {
5599
5807
  return null;
5600
5808
  }
@@ -5616,7 +5824,7 @@ class RE2 {
5616
5824
  */
5617
5825
  // This is visible for testing.
5618
5826
  findUTF8SubmatchIndex(b) {
5619
- return this.pad(this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5827
+ return this.pad(this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5620
5828
  }
5621
5829
 
5622
5830
  /**
@@ -5628,7 +5836,7 @@ class RE2 {
5628
5836
  */
5629
5837
  // This is visible for testing.
5630
5838
  findSubmatch(s) {
5631
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5839
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5632
5840
  if (a === null) {
5633
5841
  return null;
5634
5842
  }
@@ -5650,7 +5858,7 @@ class RE2 {
5650
5858
  */
5651
5859
  // This is visible for testing.
5652
5860
  findSubmatchIndex(s) {
5653
- return this.pad(this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5861
+ return this.pad(this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5654
5862
  }
5655
5863
 
5656
5864
  /**
@@ -6198,7 +6406,7 @@ class RE2JS {
6198
6406
  /**
6199
6407
  * Return a map of the capturing groups in this matcher's pattern, where key is the name and value
6200
6408
  * is the index of the group in the pattern.
6201
- * @returns {*}
6409
+ * @returns {Record<string, number>}
6202
6410
  */
6203
6411
  namedGroups() {
6204
6412
  return this.re2Input.namedGroups;
@@ -6220,5 +6428,5 @@ class RE2JS {
6220
6428
  }
6221
6429
  }
6222
6430
 
6223
- export { RE2JS, RE2JSCompileException, RE2JSException, RE2JSFlagsException, RE2JSGroupException, RE2JSSyntaxException };
6431
+ export { Matcher, RE2JS, RE2JSCompileException, RE2JSDfaMemoryException, RE2JSException, RE2JSFlagsException, RE2JSGroupException, RE2JSSyntaxException };
6224
6432
  //# sourceMappingURL=index.esm.js.map