re2js 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.2.2
5
+ * @version v1.3.0
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -848,6 +848,7 @@
848
848
  }
849
849
 
850
850
  class RE2JSException extends Error {
851
+ /** @param {string} message */
851
852
  constructor(message) {
852
853
  super(message);
853
854
  this.name = 'RE2JSException';
@@ -858,6 +859,10 @@
858
859
  * An exception thrown by the parser if the pattern was invalid.
859
860
  */
860
861
  class RE2JSSyntaxException extends RE2JSException {
862
+ /**
863
+ * @param {string} error
864
+ * @param {string|null} [input=null]
865
+ */
861
866
  constructor(error, input = null) {
862
867
  let message = `error parsing regexp: ${error}`;
863
868
  if (input) {
@@ -866,12 +871,15 @@
866
871
  super(message);
867
872
  this.name = 'RE2JSSyntaxException';
868
873
  this.message = message;
874
+ /** @type {string} */
869
875
  this.error = error;
876
+ /** @type {string|null} */
870
877
  this.input = input;
871
878
  }
872
879
 
873
880
  /**
874
881
  * Retrieves the description of the error.
882
+ * @returns {string}
875
883
  */
876
884
  getDescription() {
877
885
  return this.error;
@@ -879,6 +887,7 @@
879
887
 
880
888
  /**
881
889
  * Retrieves the erroneous regular-expression pattern.
890
+ * @returns {string|null}
882
891
  */
883
892
  getPattern() {
884
893
  return this.input;
@@ -889,6 +898,7 @@
889
898
  * An exception thrown by the compiler
890
899
  */
891
900
  class RE2JSCompileException extends RE2JSException {
901
+ /** @param {string} message */
892
902
  constructor(message) {
893
903
  super(message);
894
904
  this.name = 'RE2JSCompileException';
@@ -899,6 +909,7 @@
899
909
  * An exception thrown by using groups
900
910
  */
901
911
  class RE2JSGroupException extends RE2JSException {
912
+ /** @param {string} message */
902
913
  constructor(message) {
903
914
  super(message);
904
915
  this.name = 'RE2JSGroupException';
@@ -909,12 +920,24 @@
909
920
  * An exception thrown by flags
910
921
  */
911
922
  class RE2JSFlagsException extends RE2JSException {
923
+ /** @param {string} message */
912
924
  constructor(message) {
913
925
  super(message);
914
926
  this.name = 'RE2JSFlagsException';
915
927
  }
916
928
  }
917
929
 
930
+ /**
931
+ * An exception thrown by DFA
932
+ */
933
+ class RE2JSDfaMemoryException extends RE2JSException {
934
+ /** @param {string} message */
935
+ constructor(message) {
936
+ super(message);
937
+ this.name = 'RE2JSDfaMemoryException';
938
+ }
939
+ }
940
+
918
941
  /**
919
942
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
920
943
  *
@@ -970,10 +993,14 @@
970
993
  this.patternInput = pattern;
971
994
  const re2 = this.patternInput.re2();
972
995
  // The number of submatches (groups) in the pattern.
996
+ /** @type {number} */
973
997
  this.patternGroupCount = re2.numberOfCapturingGroups();
974
998
  // The group indexes, in [start, end) pairs. Zeroth pair is overall match.
999
+ /** @type {number[]} */
975
1000
  this.groups = [];
1001
+ /** @type {Record<string, number>} */
976
1002
  this.namedGroups = re2.namedGroups;
1003
+ /** @type {number} */
977
1004
  this.numberOfInstructions = re2.numberOfInstructions();
978
1005
  if (input instanceof MatcherInputBase) {
979
1006
  this.resetMatcherInput(input);
@@ -999,8 +1026,10 @@
999
1026
  */
1000
1027
  reset() {
1001
1028
  // The input length in UTF16 codes.
1029
+ /** @type {number} */
1002
1030
  this.matcherInputLength = this.matcherInput.length();
1003
1031
  // The append position: where the next append should start.
1032
+ /** @type {number} */
1004
1033
  this.appendPos = 0;
1005
1034
  // Is there a current match?
1006
1035
  this.hasMatch = false;
@@ -1014,6 +1043,7 @@
1014
1043
 
1015
1044
  /**
1016
1045
  * Resets the {@code Matcher} and changes the input.
1046
+ * @param {Utf8MatcherInput|Utf16MatcherInput} input
1017
1047
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1018
1048
  */
1019
1049
  resetMatcherInput(input) {
@@ -1156,7 +1186,7 @@
1156
1186
  * Matches the input against the pattern (unanchored), starting at a specified position. If there
1157
1187
  * is a match, {@code find} sets the match state to describe it.
1158
1188
  *
1159
- * @param {string|number} [start=null] the input position where the search begins
1189
+ * @param {number} [start=null] the input position where the search begins
1160
1190
  * @returns {boolean} if it finds a match
1161
1191
  * @throws IndexOutOfBoundsException if start is not a valid input position
1162
1192
  */
@@ -2097,6 +2127,160 @@
2097
2127
  }
2098
2128
  }
2099
2129
 
2130
+ class DFAState {
2131
+ constructor(id, nfaStates, isMatch) {
2132
+ this.id = id; // Stringified NFA state list (e.g., "1,4,7")
2133
+ this.nfaStates = nfaStates; // Array of Instruction PCs
2134
+ this.isMatch = isMatch; // Boolean
2135
+ this.nextAscii = new Array(Unicode.MAX_ASCII + 1).fill(null); // Flat array for blisteringly fast ASCII lookups (unanchored)
2136
+ this.nextMap = new Map(); // Cache of Char -> DFAState
2137
+ }
2138
+ }
2139
+ class DFA {
2140
+ constructor(prog) {
2141
+ this.prog = prog;
2142
+ this.stateCache = new Map(); // id -> DFAState
2143
+ this.startState = null;
2144
+ this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection), like RE2 max_mem
2145
+ }
2146
+
2147
+ // Follows epsilon (empty) transitions to find all reachable states without consuming a char
2148
+ computeClosure(pcs) {
2149
+ const closure = new Set();
2150
+ const stack = [...pcs];
2151
+ let isMatch = false;
2152
+ while (stack.length > 0) {
2153
+ const pc = stack.pop();
2154
+ if (closure.has(pc)) continue;
2155
+ closure.add(pc);
2156
+ const inst = this.prog.getInst(pc);
2157
+ switch (inst.op) {
2158
+ case Inst.MATCH:
2159
+ isMatch = true;
2160
+ break;
2161
+ case Inst.ALT:
2162
+ case Inst.ALT_MATCH:
2163
+ stack.push(inst.out);
2164
+ stack.push(inst.arg);
2165
+ break;
2166
+ case Inst.NOP:
2167
+ case Inst.CAPTURE:
2168
+ stack.push(inst.out);
2169
+ break;
2170
+ // Bailing out on complex empty-width assertions to keep DFA fast.
2171
+ // Engine will seamlessly fall back to the NFA.
2172
+ case Inst.EMPTY_WIDTH:
2173
+ return null;
2174
+ }
2175
+ }
2176
+ const sortedPCs = Int32Array.from(closure).sort();
2177
+ return {
2178
+ pcs: sortedPCs,
2179
+ isMatch
2180
+ };
2181
+ }
2182
+
2183
+ // Get or create a DFA state from a list of NFA PCs
2184
+ getState(pcs) {
2185
+ const closureResult = this.computeClosure(pcs);
2186
+ if (!closureResult) return null; // Bailout to NFA required
2187
+
2188
+ const id = closureResult.pcs.join(',');
2189
+ if (this.stateCache.has(id)) {
2190
+ return this.stateCache.get(id);
2191
+ }
2192
+
2193
+ // Safety: prevent memory exhaustion from state explosion
2194
+ if (this.stateCache.size > this.stateLimit) {
2195
+ throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
2196
+ }
2197
+ const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
2198
+ this.stateCache.set(id, state);
2199
+ return state;
2200
+ }
2201
+
2202
+ // Compute the next DFA state given a current state and a character
2203
+ step(state, charCode, anchor) {
2204
+ // OPTIMIZATION: ASCII Fast-Path
2205
+ if (anchor === RE2Flags.UNANCHORED && charCode <= Unicode.MAX_ASCII) {
2206
+ const next = state.nextAscii[charCode];
2207
+ if (next !== null) {
2208
+ return next;
2209
+ }
2210
+ } else {
2211
+ const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
2212
+ if (state.nextMap.has(key)) {
2213
+ return state.nextMap.get(key);
2214
+ }
2215
+ }
2216
+ const nextPCs = [];
2217
+ for (let i = 0; i < state.nfaStates.length; i++) {
2218
+ const pc = state.nfaStates[i];
2219
+ const inst = this.prog.getInst(pc);
2220
+ if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
2221
+ nextPCs.push(inst.out);
2222
+ }
2223
+ }
2224
+ if (anchor === RE2Flags.UNANCHORED) {
2225
+ nextPCs.push(this.prog.start);
2226
+ }
2227
+ const nextState = this.getState(nextPCs);
2228
+
2229
+ // Cache the result
2230
+ if (anchor === RE2Flags.UNANCHORED && charCode <= Unicode.MAX_ASCII) {
2231
+ state.nextAscii[charCode] = nextState;
2232
+ } else {
2233
+ const key = charCode + (anchor === RE2Flags.UNANCHORED ? 0 : Unicode.MAX_RUNE + 1);
2234
+ state.nextMap.set(key, nextState);
2235
+ }
2236
+ return nextState;
2237
+ }
2238
+
2239
+ // The hot loop: Execute the Lazy DFA
2240
+ match(input, pos, anchor) {
2241
+ if ((anchor === RE2Flags.ANCHOR_START || anchor === RE2Flags.ANCHOR_BOTH) && pos !== 0) {
2242
+ return false;
2243
+ }
2244
+ if (!this.startState) {
2245
+ this.startState = this.getState([this.prog.start]);
2246
+ if (!this.startState) return null; // Fallback to NFA
2247
+ }
2248
+ let endPos = input.endPos();
2249
+ let currentState = this.startState;
2250
+ if (currentState.isMatch) {
2251
+ if (anchor === RE2Flags.ANCHOR_BOTH) {
2252
+ if (pos === endPos) return true;
2253
+ } else {
2254
+ return true;
2255
+ }
2256
+ }
2257
+ let i = pos;
2258
+ while (i < endPos) {
2259
+ const r = input.step(i);
2260
+ const rune = r >> 3;
2261
+ const width = r & 7;
2262
+ currentState = this.step(currentState, rune, anchor);
2263
+
2264
+ // If we hit an unrecoverable DFA error or bailout, signal fallback
2265
+ if (currentState === null) return null;
2266
+ if (currentState.isMatch) {
2267
+ if (anchor === RE2Flags.ANCHOR_BOTH) {
2268
+ if (i + width === endPos) return true;
2269
+ } else {
2270
+ return true;
2271
+ }
2272
+ }
2273
+
2274
+ // If we hit a dead end, and anchored, fail early
2275
+ if (currentState.nfaStates.length === 0) {
2276
+ if (anchor !== RE2Flags.UNANCHORED) return false;
2277
+ }
2278
+ i += width;
2279
+ }
2280
+ return false;
2281
+ }
2282
+ }
2283
+
2100
2284
  /**
2101
2285
  * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this
2102
2286
  * corresponds to {@code syntax.regexp} in the Go implementation; Go's {@code regexp} is called
@@ -5229,6 +5413,30 @@
5229
5413
  this.prefixComplete = false; // true if prefix is the entire regexp
5230
5414
  this.prefixRune = 0; // first rune in prefix
5231
5415
  this.pooled = new AtomicReference(); // Cache of machines for running regexp. Forms a Treiber stack.
5416
+ this.dfa = new DFA(prog); // Initialize the Lazy DFA
5417
+ }
5418
+ executeEngine(input, pos, anchor, ncap) {
5419
+ // If the user wants capturing groups (ncap > 0), the DFA mathematically CANNOT do it.
5420
+ // We must use the NFA.
5421
+ if (ncap > 0) {
5422
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5423
+ }
5424
+ try {
5425
+ const dfaResult = this.dfa.match(input, pos, anchor);
5426
+ if (dfaResult !== null) {
5427
+ // DFA succeeded (returned true or false)
5428
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5429
+ }
5430
+ } catch (e) {
5431
+ if (e instanceof RE2JSDfaMemoryException) {
5432
+ this.dfa = new DFA(this.prog); // flush cache
5433
+ } else {
5434
+ throw e;
5435
+ }
5436
+ }
5437
+
5438
+ // Fallback to NFA
5439
+ return this.doExecuteNFA(input, pos, anchor, ncap);
5232
5440
  }
5233
5441
 
5234
5442
  /**
@@ -5313,10 +5521,10 @@
5313
5521
  return this.expr;
5314
5522
  }
5315
5523
 
5316
- // doExecute() finds the leftmost match in the input and returns
5524
+ // doExecuteNFA() finds the leftmost match in the input and returns
5317
5525
  // the position of its subexpressions.
5318
5526
  // Derived from exec.go.
5319
- doExecute(input, pos, anchor, ncap) {
5527
+ doExecuteNFA(input, pos, anchor, ncap) {
5320
5528
  let m = this.get();
5321
5529
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
5322
5530
  // the bottom of the stack (i.e., next == null).
@@ -5334,7 +5542,7 @@
5334
5542
  return cap;
5335
5543
  }
5336
5544
  match(s) {
5337
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5545
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 0) !== null;
5338
5546
  }
5339
5547
 
5340
5548
  /**
@@ -5362,7 +5570,7 @@
5362
5570
  return [false, null];
5363
5571
  }
5364
5572
  const machineInput = input.isUTF16Encoding() ? MachineInput.fromUTF16(input.asCharSequence(), 0, end) : MachineInput.fromUTF8(input.asBytes(), 0, end);
5365
- const groupMatch = this.doExecute(machineInput, start, anchor, 2 * ngroup);
5573
+ const groupMatch = this.executeEngine(machineInput, start, anchor, 2 * ngroup);
5366
5574
  if (groupMatch === null) {
5367
5575
  return [false, null];
5368
5576
  }
@@ -5374,7 +5582,7 @@
5374
5582
  */
5375
5583
  // This is visible for testing.
5376
5584
  matchUTF8(b) {
5377
- return this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5585
+ return this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 0) !== null;
5378
5586
  }
5379
5587
 
5380
5588
  /**
@@ -5411,7 +5619,7 @@
5411
5619
  const input = MachineInput.fromUTF16(src);
5412
5620
  let numReplaces = 0;
5413
5621
  while (searchPos <= src.length) {
5414
- const a = this.doExecute(input, searchPos, RE2Flags.UNANCHORED, 2);
5622
+ const a = this.executeEngine(input, searchPos, RE2Flags.UNANCHORED, 2);
5415
5623
  if (a === null || a.length === 0) {
5416
5624
  break;
5417
5625
  }
@@ -5469,7 +5677,7 @@
5469
5677
  let i = 0;
5470
5678
  let prevMatchEnd = -1;
5471
5679
  while (i < n && pos <= end) {
5472
- const matches = this.doExecute(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5680
+ const matches = this.executeEngine(input, pos, RE2Flags.UNANCHORED, this.prog.numCap);
5473
5681
  if (matches === null || matches.length === 0) {
5474
5682
  break;
5475
5683
  }
@@ -5540,7 +5748,7 @@
5540
5748
  */
5541
5749
  // This is visible for testing.
5542
5750
  findUTF8(b) {
5543
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5751
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5544
5752
  if (a === null) {
5545
5753
  return null;
5546
5754
  }
@@ -5555,7 +5763,7 @@
5555
5763
  */
5556
5764
  // This is visible for testing.
5557
5765
  findUTF8Index(b) {
5558
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5766
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, 2);
5559
5767
  if (a === null) {
5560
5768
  return null;
5561
5769
  }
@@ -5572,7 +5780,7 @@
5572
5780
  */
5573
5781
  // This is visible for testing.
5574
5782
  find(s) {
5575
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5783
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5576
5784
  if (a === null) {
5577
5785
  return '';
5578
5786
  }
@@ -5588,7 +5796,7 @@
5588
5796
  */
5589
5797
  // This is visible for testing.
5590
5798
  findIndex(s) {
5591
- return this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5799
+ return this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, 2);
5592
5800
  }
5593
5801
 
5594
5802
  /**
@@ -5600,7 +5808,7 @@
5600
5808
  */
5601
5809
  // This is visible for testing.
5602
5810
  findUTF8Submatch(b) {
5603
- const a = this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5811
+ const a = this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5604
5812
  if (a === null) {
5605
5813
  return null;
5606
5814
  }
@@ -5622,7 +5830,7 @@
5622
5830
  */
5623
5831
  // This is visible for testing.
5624
5832
  findUTF8SubmatchIndex(b) {
5625
- return this.pad(this.doExecute(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5833
+ return this.pad(this.executeEngine(MachineInput.fromUTF8(b), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5626
5834
  }
5627
5835
 
5628
5836
  /**
@@ -5634,7 +5842,7 @@
5634
5842
  */
5635
5843
  // This is visible for testing.
5636
5844
  findSubmatch(s) {
5637
- const a = this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5845
+ const a = this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap);
5638
5846
  if (a === null) {
5639
5847
  return null;
5640
5848
  }
@@ -5656,7 +5864,7 @@
5656
5864
  */
5657
5865
  // This is visible for testing.
5658
5866
  findSubmatchIndex(s) {
5659
- return this.pad(this.doExecute(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5867
+ return this.pad(this.executeEngine(MachineInput.fromUTF16(s), 0, RE2Flags.UNANCHORED, this.prog.numCap));
5660
5868
  }
5661
5869
 
5662
5870
  /**
@@ -6204,7 +6412,7 @@
6204
6412
  /**
6205
6413
  * Return a map of the capturing groups in this matcher's pattern, where key is the name and value
6206
6414
  * is the index of the group in the pattern.
6207
- * @returns {*}
6415
+ * @returns {Record<string, number>}
6208
6416
  */
6209
6417
  namedGroups() {
6210
6418
  return this.re2Input.namedGroups;
@@ -6226,8 +6434,10 @@
6226
6434
  }
6227
6435
  }
6228
6436
 
6437
+ exports.Matcher = Matcher;
6229
6438
  exports.RE2JS = RE2JS;
6230
6439
  exports.RE2JSCompileException = RE2JSCompileException;
6440
+ exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
6231
6441
  exports.RE2JSException = RE2JSException;
6232
6442
  exports.RE2JSFlagsException = RE2JSFlagsException;
6233
6443
  exports.RE2JSGroupException = RE2JSGroupException;