re2js 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.2.2
5
+ * @version v1.3.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -844,6 +844,7 @@ class MatcherInput {
844
844
  }
845
845
 
846
846
  class RE2JSException extends Error {
847
+ /** @param {string} message */
847
848
  constructor(message) {
848
849
  super(message);
849
850
  this.name = 'RE2JSException';
@@ -854,6 +855,10 @@ class RE2JSException extends Error {
854
855
  * An exception thrown by the parser if the pattern was invalid.
855
856
  */
856
857
  class RE2JSSyntaxException extends RE2JSException {
858
+ /**
859
+ * @param {string} error
860
+ * @param {string|null} [input=null]
861
+ */
857
862
  constructor(error, input = null) {
858
863
  let message = `error parsing regexp: ${error}`;
859
864
  if (input) {
@@ -862,12 +867,15 @@ class RE2JSSyntaxException extends RE2JSException {
862
867
  super(message);
863
868
  this.name = 'RE2JSSyntaxException';
864
869
  this.message = message;
870
+ /** @type {string} */
865
871
  this.error = error;
872
+ /** @type {string|null} */
866
873
  this.input = input;
867
874
  }
868
875
 
869
876
  /**
870
877
  * Retrieves the description of the error.
878
+ * @returns {string}
871
879
  */
872
880
  getDescription() {
873
881
  return this.error;
@@ -875,6 +883,7 @@ class RE2JSSyntaxException extends RE2JSException {
875
883
 
876
884
  /**
877
885
  * Retrieves the erroneous regular-expression pattern.
886
+ * @returns {string|null}
878
887
  */
879
888
  getPattern() {
880
889
  return this.input;
@@ -885,6 +894,7 @@ class RE2JSSyntaxException extends RE2JSException {
885
894
  * An exception thrown by the compiler
886
895
  */
887
896
  class RE2JSCompileException extends RE2JSException {
897
+ /** @param {string} message */
888
898
  constructor(message) {
889
899
  super(message);
890
900
  this.name = 'RE2JSCompileException';
@@ -895,6 +905,7 @@ class RE2JSCompileException extends RE2JSException {
895
905
  * An exception thrown by using groups
896
906
  */
897
907
  class RE2JSGroupException extends RE2JSException {
908
+ /** @param {string} message */
898
909
  constructor(message) {
899
910
  super(message);
900
911
  this.name = 'RE2JSGroupException';
@@ -905,12 +916,24 @@ class RE2JSGroupException extends RE2JSException {
905
916
  * An exception thrown by flags
906
917
  */
907
918
  class RE2JSFlagsException extends RE2JSException {
919
+ /** @param {string} message */
908
920
  constructor(message) {
909
921
  super(message);
910
922
  this.name = 'RE2JSFlagsException';
911
923
  }
912
924
  }
913
925
 
926
/**
 * An exception thrown by DFA
 *
 * Raised from the lazy DFA's state construction when its state cache has
 * grown past the configured state limit. The RE2 engine entry point catches
 * this type specifically, replaces the DFA with a fresh instance (dropping the
 * cached states) and falls back to the NFA, so it signals "DFA gave up" rather
 * than an invalid pattern or input.
 */
class RE2JSDfaMemoryException extends RE2JSException {
  /** @param {string} message human-readable description of the failure */
  constructor(message) {
    super(message);
    // Override the inherited name so the concrete type shows up in stack traces.
    this.name = 'RE2JSDfaMemoryException';
  }
}
936
+
914
937
  /**
915
938
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
916
939
  *
@@ -966,10 +989,14 @@ class Matcher {
966
989
  this.patternInput = pattern;
967
990
  const re2 = this.patternInput.re2();
968
991
  // The number of submatches (groups) in the pattern.
992
+ /** @type {number} */
969
993
  this.patternGroupCount = re2.numberOfCapturingGroups();
970
994
  // The group indexes, in [start, end) pairs. Zeroth pair is overall match.
995
+ /** @type {number[]} */
971
996
  this.groups = [];
997
+ /** @type {Record<string, number>} */
972
998
  this.namedGroups = re2.namedGroups;
999
+ /** @type {number} */
973
1000
  this.numberOfInstructions = re2.numberOfInstructions();
974
1001
  if (input instanceof MatcherInputBase) {
975
1002
  this.resetMatcherInput(input);
@@ -995,8 +1022,10 @@ class Matcher {
995
1022
  */
996
1023
  reset() {
997
1024
  // The input length in UTF16 codes.
1025
+ /** @type {number} */
998
1026
  this.matcherInputLength = this.matcherInput.length();
999
1027
  // The append position: where the next append should start.
1028
+ /** @type {number} */
1000
1029
  this.appendPos = 0;
1001
1030
  // Is there a current match?
1002
1031
  this.hasMatch = false;
@@ -1010,6 +1039,7 @@ class Matcher {
1010
1039
 
1011
1040
  /**
1012
1041
  * Resets the {@code Matcher} and changes the input.
1042
+ * @param {Utf8MatcherInput|Utf16MatcherInput} input
1013
1043
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1014
1044
  */
1015
1045
  resetMatcherInput(input) {
@@ -1152,7 +1182,7 @@ class Matcher {
1152
1182
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1153
1183
  * is a match, {@code find} sets the match state to describe it.
1154
1184
  *
1155
- * @param {string|number} [start=null] the input position where the search begins
1185
+ * @param {number} [start=null] the input position where the search begins
1156
1186
  * @returns {boolean} if it finds a match
1157
1187
  * @throws IndexOutOfBoundsException if start is not a valid input position
1158
1188
  */
@@ -2093,6 +2123,160 @@ class Machine {
2093
2123
  }
2094
2124
  }
2095
2125
 
2126
/**
 * One state of the lazily built DFA: a set of NFA program counters together
 * with the outgoing transitions that have been computed so far.
 */
class DFAState {
  /**
   * @param {string} id comma-joined, sorted NFA pcs (e.g. "1,4,7"); used as the cache key
   * @param {Int32Array} nfaStates sorted instruction pcs that make up this state
   * @param {boolean} isMatch true if the pc set contains a MATCH instruction
   */
  constructor(id, nfaStates, isMatch) {
    this.id = id;
    this.nfaStates = nfaStates;
    this.isMatch = isMatch;
    // Flat table for unanchored ASCII transitions. Entries are tri-state:
    //   undefined -> transition not computed yet
    //   null      -> transition computed, DFA must bail out to the NFA
    //   DFAState  -> target state
    this.nextAscii = new Array(Unicode.MAX_ASCII + 1).fill(undefined);
    // Transitions for everything else (non-ASCII runes and anchored searches),
    // keyed by an integer; a stored null means "bail out".
    this.nextMap = new Map();
  }
}

/**
 * Lazy DFA over a compiled program. States are created on demand while
 * matching and memoised in {@code stateCache}. It only answers "does it
 * match?"; every method that returns {@code null} is asking the caller to fall
 * back to the NFA.
 */
class DFA {
  /**
   * @param {Object} prog compiled program; must expose {@code start} and {@code getInst(pc)}
   * @param {number} [stateLimit=10000] maximum number of cached DFA states
   */
  constructor(prog, stateLimit = 10000) {
    this.prog = prog;
    this.stateCache = new Map(); // id -> DFAState
    this.startState = null;
    // Remembers that the start closure needs the NFA, so the closure is not
    // recomputed on every match() call.
    this.startUnsupported = false;
    this.stateLimit = stateLimit;
  }

  /**
   * Follows empty transitions from {@code pcs} and collects every reachable pc.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {{pcs: Int32Array, isMatch: boolean}|null} the sorted closure, or
   *   null when an EMPTY_WIDTH instruction is reached (not handled by the DFA)
   */
  computeClosure(pcs) {
    const closure = new Set();
    const stack = [...pcs];
    let isMatch = false;
    while (stack.length > 0) {
      const pc = stack.pop();
      if (closure.has(pc)) continue;
      closure.add(pc);
      const inst = this.prog.getInst(pc);
      switch (inst.op) {
        case Inst.MATCH:
          isMatch = true;
          break;
        case Inst.ALT:
        case Inst.ALT_MATCH:
          stack.push(inst.out);
          stack.push(inst.arg);
          break;
        case Inst.NOP:
        case Inst.CAPTURE:
          stack.push(inst.out);
          break;
        case Inst.EMPTY_WIDTH:
          // Empty-width assertions are not modelled here; give up on the DFA.
          return null;
        default:
          // Rune-consuming (and any other) instructions stay in the set as-is.
          break;
      }
    }
    // Typed arrays sort numerically, so the joined id is canonical for a pc set.
    return { pcs: Int32Array.from(closure).sort(), isMatch };
  }

  /**
   * Returns the cached DFA state for the closure of {@code pcs}, creating it if needed.
   *
   * @param {Iterable<number>} pcs seed program counters
   * @returns {DFAState|null} the state, or null if the closure requires the NFA
   * @throws {RE2JSDfaMemoryException} if a new state would exceed {@code stateLimit}
   */
  getState(pcs) {
    const closureResult = this.computeClosure(pcs);
    if (closureResult === null) return null;

    const id = closureResult.pcs.join(',');
    const cached = this.stateCache.get(id);
    if (cached !== undefined) return cached;

    // `>=` so that the cache never holds more than stateLimit states.
    if (this.stateCache.size >= this.stateLimit) {
      throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
    }
    const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
    this.stateCache.set(id, state);
    return state;
  }

  /**
   * Computes (or looks up) the state reached from {@code state} on {@code charCode}.
   *
   * @param {DFAState} state current state
   * @param {number} charCode rune being consumed
   * @param {number} anchor one of the RE2Flags anchor constants
   * @returns {DFAState|null} next state, or null if the DFA must bail out
   * @throws {RE2JSDfaMemoryException} propagated from {@link DFA#getState}
   */
  step(state, charCode, anchor) {
    const unanchored = anchor === RE2Flags.UNANCHORED;
    const useAsciiTable = unanchored && charCode >= 0 && charCode <= Unicode.MAX_ASCII;
    // Anchored transitions are shifted past the rune range so they never
    // collide with unanchored ones in the shared map.
    const mapKey = charCode + (unanchored ? 0 : Unicode.MAX_RUNE + 1);

    if (useAsciiTable) {
      const cached = state.nextAscii[charCode];
      // A cached null (bailout) is returned too, instead of being recomputed.
      if (cached !== undefined) return cached;
    } else if (state.nextMap.has(mapKey)) {
      return state.nextMap.get(mapKey);
    }

    const nextPCs = [];
    for (let i = 0; i < state.nfaStates.length; i++) {
      const inst = this.prog.getInst(state.nfaStates[i]);
      if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
        nextPCs.push(inst.out);
      }
    }
    if (unanchored) {
      // An unanchored search may also begin a new attempt at every position.
      nextPCs.push(this.prog.start);
    }
    const nextState = this.getState(nextPCs);

    if (useAsciiTable) {
      state.nextAscii[charCode] = nextState;
    } else {
      state.nextMap.set(mapKey, nextState);
    }
    return nextState;
  }

  /**
   * Runs the lazy DFA over {@code input} starting at {@code pos}.
   *
   * @param {Object} input machine input exposing {@code endPos()} and {@code step(i)}
   * @param {number} pos position to start matching at
   * @param {number} anchor one of the RE2Flags anchor constants
   * @returns {boolean|null} true/false for a definite answer, null to request NFA fallback
   * @throws {RE2JSDfaMemoryException} if the state cache limit is hit
   */
  match(input, pos, anchor) {
    if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
      return false;
    }
    if (this.startState === null) {
      if (this.startUnsupported) return null;
      const start = this.getState([this.prog.start]);
      if (start === null) {
        this.startUnsupported = true;
        return null;
      }
      this.startState = start;
    }

    const endPos = input.endPos();
    let currentState = this.startState;
    if (currentState.isMatch && (anchor !== RE2Flags.ANCHOR_BOTH || pos === endPos)) {
      return true;
    }

    let i = pos;
    while (i < endPos) {
      // input.step packs the rune in the high bits and its width in the low 3 bits.
      const r = input.step(i);
      const rune = r >> 3;
      const width = r & 7;
      currentState = this.step(currentState, rune, anchor);
      if (currentState === null) return null;

      if (currentState.isMatch && (anchor !== RE2Flags.ANCHOR_BOTH || i + width === endPos)) {
        return true;
      }
      // An empty pc set cannot recover when the search is anchored.
      if (currentState.nfaStates.length === 0 && anchor !== RE2Flags.UNANCHORED) {
        return false;
      }
      i += width;
    }
    return false;
  }
}
2279
+
2096
2280
  /**
2097
2281
  * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
2098
2282
  * corresponds to {@code syntax.regexp} in the Go implementation; Go's {@code regexp} is called
@@ -5225,6 +5409,30 @@ class RE2 {
5225
5409
  this.prefixComplete = false; // true if prefix is the entire regexp
5226
5410
  this.prefixRune = 0; // first rune in prefix
5227
5411
  this.pooled = new AtomicReference(); // Cache of machines for running regexp. Forms a Treiber stack.
5412
+ this.dfa = new DFA(prog); // Initialize the Lazy DFA
5413
+ }
5414
+ executeEngine(input, pos, anchor, ncap) {
5415
+ // If the user wants capturing groups (ncap > 0), the DFA mathematically CANNOT do it.
5416
+ // We must use the NFA.
5417
+ if (ncap > 0) {
5418
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5419
+ }
5420
+ try {
5421
+ const dfaResult = this.dfa.match(input, pos, anchor);
5422
+ if (dfaResult !== null) {
5423
+ // DFA succeeded (returned true or false)
5424
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5425
+ }
5426
+ } catch (e) {
5427
+ if (e instanceof RE2JSDfaMemoryException) {
5428
+ this.dfa = new DFA(this.prog); // flush cache
5429
+ } else {
5430
+ throw e;
5431
+ }
5432
+ }
5433
+
5434
+ // Fallback to NFA
5435
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5228
5436
  }
5229
5437
 
5230
5438
  /**
@@ -5309,10 +5517,10 @@ class RE2 {
5309
5517
  return this.expr;
5310
5518
  }
5311
5519
 
5312
- // doExecute() finds the leftmost match in the input and returns
5520
+ // doExecuteNFA() finds the leftmost match in the input and returns
5313
5521
  // the position of its subexpressions.
5314
5522
  // Derived from exec.go.
5315
- doExecute(input, pos, anchor, ncap) {
5523
+ doExecuteNFA(input, pos, anchor, ncap) {
5316
5524
  let m = this.get();
5317
5525
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
5318
5526
  // the bottom of the stack (i.e., next == null).
@@ -5330,7 +5538,7 @@ class RE2 {
5330
5538
  return cap;
5331
5539
  }
5332
5540
  match(s) {
5333
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5541
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5334
5542
  }
5335
5543
 
5336
5544
  /**
@@ -5358,7 +5566,7 @@ class RE2 {
5358
5566
  return [false, null];
5359
5567
  }
5360
5568
  const machineInput = input.isUTF16Encoding() ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) : MachineInput.fromUTF8(input.asBytes(), 0, end);
5361
- const groupMatch = this.doExecute(machineInput, start, anchor, 2 * ngroup);
5569
+ const groupMatch = this.executeEngine(machineInput, start, anchor, 2 * ngroup);
5362
5570
  if (groupMatch === null) {
5363
5571
  return [false, null];
5364
5572
  }
@@ -5370,7 +5578,7 @@ class RE2 {
5370
5578
  */
5371
5579
  // This is visible for testing.
5372
5580
  matchUTF8(b) {
5373
- return this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5581
+ return this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5374
5582
  }
5375
5583
 
5376
5584
  /**
@@ -5407,7 +5615,7 @@ class RE2 {
5407
5615
  const input = MachineInput.fromUTF16(src);
5408
5616
  let numReplaces = 0;
5409
5617
  while (searchPos <= src.length) {
5410
- const a = this.doExecute(input, searchPos, RE2Flags.UNANCHORED, 2);
5618
+ const a = this.executeEngine(input, searchPos, RE2Flags.UNANCHORED, 2);
5411
5619
  if (a === null || a.length === 0) {
5412
5620
  break;
5413
5621
  }
@@ -5465,7 +5673,7 @@ class RE2 {
5465
5673
  let i = 0;
5466
5674
  let prevMatchEnd = -1;
5467
5675
  while (i < n && pos <= end) {
5468
- const matches = this.doExecute(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5676
+ const matches = this.executeEngine(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5469
5677
  if (matches === null || matches.length === 0) {
5470
5678
  break;
5471
5679
  }
@@ -5536,7 +5744,7 @@ class RE2 {
5536
5744
  */
5537
5745
  // This is visible for testing.
5538
5746
  findUTF8(b) {
5539
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5747
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5540
5748
  if (a === null) {
5541
5749
  return null;
5542
5750
  }
@@ -5551,7 +5759,7 @@ class RE2 {
5551
5759
  */
5552
5760
  // This is visible for testing.
5553
5761
  findUTF8Index(b) {
5554
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5762
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5555
5763
  if (a === null) {
5556
5764
  return null;
5557
5765
  }
@@ -5568,7 +5776,7 @@ class RE2 {
5568
5776
  */
5569
5777
  // This is visible for testing.
5570
5778
  find(s) {
5571
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5779
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5572
5780
  if (a === null) {
5573
5781
  return '';
5574
5782
  }
@@ -5584,7 +5792,7 @@ class RE2 {
5584
5792
  */
5585
5793
  // This is visible for testing.
5586
5794
  findIndex(s) {
5587
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5795
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5588
5796
  }
5589
5797
 
5590
5798
  /**
@@ -5596,7 +5804,7 @@ class RE2 {
5596
5804
  */
5597
5805
  // This is visible for testing.
5598
5806
  findUTF8Submatch(b) {
5599
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5807
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5600
5808
  if (a === null) {
5601
5809
  return null;
5602
5810
  }
@@ -5618,7 +5826,7 @@ class RE2 {
5618
5826
  */
5619
5827
  // This is visible for testing.
5620
5828
  findUTF8SubmatchIndex(b) {
5621
- return this.pad(this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5829
+ return this.pad(this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5622
5830
  }
5623
5831
 
5624
5832
  /**
@@ -5630,7 +5838,7 @@ class RE2 {
5630
5838
  */
5631
5839
  // This is visible for testing.
5632
5840
  findSubmatch(s) {
5633
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5841
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5634
5842
  if (a === null) {
5635
5843
  return null;
5636
5844
  }
@@ -5652,7 +5860,7 @@ class RE2 {
5652
5860
  */
5653
5861
  // This is visible for testing.
5654
5862
  findSubmatchIndex(s) {
5655
- return this.pad(this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5863
+ return this.pad(this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5656
5864
  }
5657
5865
 
5658
5866
  /**
@@ -6200,7 +6408,7 @@ class RE2JS {
6200
6408
  /**
6201
6409
  * Return a map of the capturing groups in this matcher's pattern, where key is the name and value
6202
6410
  * is the index of the group in the pattern.
6203
- * @returns {*}
6411
+ * @returns {Record<string, number>}
6204
6412
  */
6205
6413
  namedGroups() {
6206
6414
  return this.re2Input.namedGroups;
@@ -6222,8 +6430,10 @@ class RE2JS {
6222
6430
  }
6223
6431
  }
6224
6432
 
6433
+ exports.Matcher = Matcher;
6225
6434
  exports.RE2JS = RE2JS;
6226
6435
  exports.RE2JSCompileException = RE2JSCompileException;
6436
+ exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
6227
6437
  exports.RE2JSException = RE2JSException;
6228
6438
  exports.RE2JSFlagsException = RE2JSFlagsException;
6229
6439
  exports.RE2JSGroupException = RE2JSGroupException;