re2js 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v1.3.0
5
+ * @version v1.3.1
6
6
  * @author Alexey Vasiliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -847,6 +847,181 @@
847
847
  }
848
848
  }
849
849
 
850
+ /**
851
+ * MachineInput abstracts different representations of the input text supplied to the Machine. It
852
+ * provides one-character lookahead.
853
+ */
854
+ class MachineInputBase {
855
+ static EOF() {
856
+ return -1 << 3;
857
+ }
858
+
859
+ // can we look ahead without losing info?
860
+ canCheckPrefix() {
861
+ return true;
862
+ }
863
+
864
+ // Returns the end position in the same units as step().
865
+ endPos() {
866
+ return this.end;
867
+ }
868
+ }
869
+
870
+ // An implementation of MachineInput for UTF-8 byte arrays.
871
+ // |pos| and |width| are byte indices.
872
+ class MachineUTF8Input extends MachineInputBase {
873
+ constructor(bytes, start = 0, end = bytes.length) {
874
+ super();
875
+ this.bytes = bytes;
876
+ this.start = start;
877
+ this.end = end;
878
+ }
879
+
880
+ // Returns the rune at the specified index; the units are
881
+ // unspecified, but could be UTF-8 byte, UTF-16 char, or rune
882
+ // indices. Returns the width (in the same units) of the rune in
883
+ // the lower 3 bits, and the rune (Unicode code point) in the high
884
+ // bits. Never negative, except for EOF which is represented as -1
885
+ // << 3 | 0.
886
+ step(i) {
887
+ i += this.start;
888
+ if (i >= this.end) {
889
+ return MachineInputBase.EOF();
890
+ }
891
+ let x = this.bytes[i++] & 255;
892
+ if ((x & 128) === 0) {
893
+ return x << 3 | 1;
894
+ } else if ((x & 224) === 192) {
895
+ x = x & 31;
896
+ if (i >= this.end) {
897
+ return MachineInputBase.EOF();
898
+ }
899
+ x = x << 6 | this.bytes[i++] & 63;
900
+ return x << 3 | 2;
901
+ } else if ((x & 240) === 224) {
902
+ x = x & 15;
903
+ if (i + 1 >= this.end) {
904
+ return MachineInputBase.EOF();
905
+ }
906
+ x = x << 6 | this.bytes[i++] & 63;
907
+ x = x << 6 | this.bytes[i++] & 63;
908
+ return x << 3 | 3;
909
+ } else {
910
+ x = x & 7;
911
+ if (i + 2 >= this.end) {
912
+ return MachineInputBase.EOF();
913
+ }
914
+ x = x << 6 | this.bytes[i++] & 63;
915
+ x = x << 6 | this.bytes[i++] & 63;
916
+ x = x << 6 | this.bytes[i++] & 63;
917
+ return x << 3 | 4;
918
+ }
919
+ }
920
+
921
+ // Returns the index relative to |pos| at which |re2.prefix| is found
922
+ // in this input stream, or a negative value if not found.
923
+ index(re2, pos) {
924
+ pos += this.start;
925
+ const i = this.indexOf(this.bytes, re2.prefixUTF8, pos);
926
+ return i < 0 ? i : i - pos;
927
+ }
928
+
929
+ // Returns a bitmask of EMPTY_* flags.
930
+ context(pos) {
931
+ pos += this.start;
932
+ let r1 = -1;
933
+ if (pos > this.start && pos <= this.end) {
934
+ let start = pos - 1;
935
+ r1 = this.bytes[start--];
936
+ if (r1 >= 128) {
937
+ let lim = pos - 4;
938
+ if (lim < this.start) {
939
+ lim = this.start;
940
+ }
941
+ while (start >= lim && (this.bytes[start] & 192) === 128) {
942
+ start--;
943
+ }
944
+ if (start < this.start) {
945
+ start = this.start;
946
+ }
947
+ r1 = this.step(start) >> 3;
948
+ }
949
+ }
950
+ const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
951
+ return Utils.emptyOpContext(r1, r2);
952
+ }
953
+
954
+ // Returns the index of the first occurrence of array |target| within
955
+ // array |source| after |fromIndex|, or -1 if not found.
956
+ indexOf(source, target, fromIndex = 0) {
957
+ let targetLength = target.length;
958
+ if (targetLength === 0) {
959
+ return -1;
960
+ }
961
+ let sourceLength = source.length;
962
+ for (let i = fromIndex; i <= sourceLength - targetLength; i++) {
963
+ for (let j = 0; j < targetLength; j++) {
964
+ if (source[i + j] !== target[j]) {
965
+ break;
966
+ } else if (j === targetLength - 1) {
967
+ return i;
968
+ }
969
+ }
970
+ }
971
+ return -1;
972
+ }
973
+ }
974
+
975
+ // |pos| and |width| are in JS "char" units.
976
+ class MachineUTF16Input extends MachineInputBase {
977
+ constructor(charSequence, start = 0, end = charSequence.length) {
978
+ super();
979
+ this.charSequence = charSequence;
980
+ this.start = start;
981
+ this.end = end;
982
+ }
983
+
984
+ // Returns the rune at the specified index; the units are
985
+ // unspecified, but could be UTF-8 byte, UTF-16 char, or rune
986
+ // indices. Returns the width (in the same units) of the rune in
987
+ // the lower 3 bits, and the rune (Unicode code point) in the high
988
+ // bits. Never negative, except for EOF which is represented as -1
989
+ // << 3 | 0.
990
+ step(pos) {
991
+ pos += this.start;
992
+ if (pos < this.end) {
993
+ const rune = this.charSequence.codePointAt(pos);
994
+ return rune << 3 | Utils.charCount(rune);
995
+ } else {
996
+ return MachineInputBase.EOF();
997
+ }
998
+ }
999
+
1000
+ // Returns the index relative to |pos| at which |re2.prefix| is found
1001
+ // in this input stream, or a negative value if not found.
1002
+ index(re2, pos) {
1003
+ pos += this.start;
1004
+ const i = this.charSequence.indexOf(re2.prefix, pos);
1005
+ return i < 0 ? i : i - pos;
1006
+ }
1007
+
1008
+ // Returns a bitmask of EMPTY_* flags.
1009
+ context(pos) {
1010
+ pos += this.start;
1011
+ const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
1012
+ const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
1013
+ return Utils.emptyOpContext(r1, r2);
1014
+ }
1015
+ }
1016
+ class MachineInput {
1017
+ static fromUTF8(bytes, start = 0, end = bytes.length) {
1018
+ return new MachineUTF8Input(bytes, start, end);
1019
+ }
1020
+ static fromUTF16(charSequence, start = 0, end = charSequence.length) {
1021
+ return new MachineUTF16Input(charSequence, start, end);
1022
+ }
1023
+ }
1024
+
850
1025
  class RE2JSException extends Error {
851
1026
  /** @param {string} message */
852
1027
  constructor(message) {
@@ -927,17 +1102,6 @@
927
1102
  }
928
1103
  }
929
1104
 
930
- /**
931
- * An exception thrown by DFA
932
- */
933
- class RE2JSDfaMemoryException extends RE2JSException {
934
- /** @param {string} message */
935
- constructor(message) {
936
- super(message);
937
- this.name = 'RE2JSDfaMemoryException';
938
- }
939
- }
940
-
941
1105
  /**
942
1106
  * A stateful iterator that interprets a regex {@code RE2JS} on a specific input.
943
1107
  *
@@ -1494,181 +1658,6 @@
1494
1658
  }
1495
1659
  }
1496
1660
 
1497
- /**
1498
- * MachineInput abstracts different representations of the input text supplied to the Machine. It
1499
- * provides one-character lookahead.
1500
- */
1501
- class MachineInputBase {
1502
- static EOF() {
1503
- return -1 << 3;
1504
- }
1505
-
1506
- // can we look ahead without losing info?
1507
- canCheckPrefix() {
1508
- return true;
1509
- }
1510
-
1511
- // Returns the end position in the same units as step().
1512
- endPos() {
1513
- return this.end;
1514
- }
1515
- }
1516
-
1517
- // An implementation of MachineInput for UTF-8 byte arrays.
1518
- // |pos| and |width| are byte indices.
1519
- class MachineUTF8Input extends MachineInputBase {
1520
- constructor(bytes, start = 0, end = bytes.length) {
1521
- super();
1522
- this.bytes = bytes;
1523
- this.start = start;
1524
- this.end = end;
1525
- }
1526
-
1527
- // Returns the rune at the specified index; the units are
1528
- // unspecified, but could be UTF-8 byte, UTF-16 char, or rune
1529
- // indices. Returns the width (in the same units) of the rune in
1530
- // the lower 3 bits, and the rune (Unicode code point) in the high
1531
- // bits. Never negative, except for EOF which is represented as -1
1532
- // << 3 | 0.
1533
- step(i) {
1534
- i += this.start;
1535
- if (i >= this.end) {
1536
- return MachineInputBase.EOF();
1537
- }
1538
- let x = this.bytes[i++] & 255;
1539
- if ((x & 128) === 0) {
1540
- return x << 3 | 1;
1541
- } else if ((x & 224) === 192) {
1542
- x = x & 31;
1543
- if (i >= this.end) {
1544
- return MachineInputBase.EOF();
1545
- }
1546
- x = x << 6 | this.bytes[i++] & 63;
1547
- return x << 3 | 2;
1548
- } else if ((x & 240) === 224) {
1549
- x = x & 15;
1550
- if (i + 1 >= this.end) {
1551
- return MachineInputBase.EOF();
1552
- }
1553
- x = x << 6 | this.bytes[i++] & 63;
1554
- x = x << 6 | this.bytes[i++] & 63;
1555
- return x << 3 | 3;
1556
- } else {
1557
- x = x & 7;
1558
- if (i + 2 >= this.end) {
1559
- return MachineInputBase.EOF();
1560
- }
1561
- x = x << 6 | this.bytes[i++] & 63;
1562
- x = x << 6 | this.bytes[i++] & 63;
1563
- x = x << 6 | this.bytes[i++] & 63;
1564
- return x << 3 | 4;
1565
- }
1566
- }
1567
-
1568
- // Returns the index relative to |pos| at which |re2.prefix| is found
1569
- // in this input stream, or a negative value if not found.
1570
- index(re2, pos) {
1571
- pos += this.start;
1572
- const i = this.indexOf(this.bytes, re2.prefixUTF8, pos);
1573
- return i < 0 ? i : i - pos;
1574
- }
1575
-
1576
- // Returns a bitmask of EMPTY_* flags.
1577
- context(pos) {
1578
- pos += this.start;
1579
- let r1 = -1;
1580
- if (pos > this.start && pos <= this.end) {
1581
- let start = pos - 1;
1582
- r1 = this.bytes[start--];
1583
- if (r1 >= 128) {
1584
- let lim = pos - 4;
1585
- if (lim < this.start) {
1586
- lim = this.start;
1587
- }
1588
- while (start >= lim && (this.bytes[start] & 192) === 128) {
1589
- start--;
1590
- }
1591
- if (start < this.start) {
1592
- start = this.start;
1593
- }
1594
- r1 = this.step(start) >> 3;
1595
- }
1596
- }
1597
- const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
1598
- return Utils.emptyOpContext(r1, r2);
1599
- }
1600
-
1601
- // Returns the index of the first occurrence of array |target| within
1602
- // array |source| after |fromIndex|, or -1 if not found.
1603
- indexOf(source, target, fromIndex = 0) {
1604
- let targetLength = target.length;
1605
- if (targetLength === 0) {
1606
- return -1;
1607
- }
1608
- let sourceLength = source.length;
1609
- for (let i = fromIndex; i <= sourceLength - targetLength; i++) {
1610
- for (let j = 0; j < targetLength; j++) {
1611
- if (source[i + j] !== target[j]) {
1612
- break;
1613
- } else if (j === targetLength - 1) {
1614
- return i;
1615
- }
1616
- }
1617
- }
1618
- return -1;
1619
- }
1620
- }
1621
-
1622
- // |pos| and |width| are in JS "char" units.
1623
- class MachineUTF16Input extends MachineInputBase {
1624
- constructor(charSequence, start = 0, end = charSequence.length) {
1625
- super();
1626
- this.charSequence = charSequence;
1627
- this.start = start;
1628
- this.end = end;
1629
- }
1630
-
1631
- // Returns the rune at the specified index; the units are
1632
- // unspecified, but could be UTF-8 byte, UTF-16 char, or rune
1633
- // indices. Returns the width (in the same units) of the rune in
1634
- // the lower 3 bits, and the rune (Unicode code point) in the high
1635
- // bits. Never negative, except for EOF which is represented as -1
1636
- // << 3 | 0.
1637
- step(pos) {
1638
- pos += this.start;
1639
- if (pos < this.end) {
1640
- const rune = this.charSequence.codePointAt(pos);
1641
- return rune << 3 | Utils.charCount(rune);
1642
- } else {
1643
- return MachineInputBase.EOF();
1644
- }
1645
- }
1646
-
1647
- // Returns the index relative to |pos| at which |re2.prefix| is found
1648
- // in this input stream, or a negative value if not found.
1649
- index(re2, pos) {
1650
- pos += this.start;
1651
- const i = this.charSequence.indexOf(re2.prefix, pos);
1652
- return i < 0 ? i : i - pos;
1653
- }
1654
-
1655
- // Returns a bitmask of EMPTY_* flags.
1656
- context(pos) {
1657
- pos += this.start;
1658
- const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
1659
- const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
1660
- return Utils.emptyOpContext(r1, r2);
1661
- }
1662
- }
1663
- class MachineInput {
1664
- static fromUTF8(bytes, start = 0, end = bytes.length) {
1665
- return new MachineUTF8Input(bytes, start, end);
1666
- }
1667
- static fromUTF16(charSequence, start = 0, end = charSequence.length) {
1668
- return new MachineUTF16Input(charSequence, start, end);
1669
- }
1670
- }
1671
-
1672
1661
  /**
1673
1662
  * A single instruction in the regular expression virtual machine.
1674
1663
  *
@@ -2127,21 +2116,40 @@
2127
2116
  }
2128
2117
  }
2129
2118
 
2119
+ // FNV-1a 32-bit hash for an array of integers.
2120
+ // Extremely fast, allocates no memory, and produces good distribution.
2121
+ const hashPCs = pcs => {
2122
+ let h = -2128831035; // 0x811c9dc5 (32-bit signed offset basis)
2123
+ for (let i = 0; i < pcs.length; i++) {
2124
+ h ^= pcs[i];
2125
+ h = Math.imul(h, 16777619); // 0x01000193 (FNV prime)
2126
+ }
2127
+ return h;
2128
+ };
2129
+
2130
+ // Zero-allocation array comparison for hash collision resolution
2131
+ const arraysEqual = (a, b) => {
2132
+ if (a.length !== b.length) return false;
2133
+ for (let i = 0; i < a.length; i++) {
2134
+ if (a[i] !== b[i]) return false;
2135
+ }
2136
+ return true;
2137
+ };
2130
2138
  class DFAState {
2131
- constructor(id, nfaStates, isMatch) {
2132
- this.id = id; // Stringified NFA state list (e.g., "1,4,7")
2133
- this.nfaStates = nfaStates; // Array of Instruction PCs
2139
+ constructor(nfaStates, isMatch) {
2140
+ this.nfaStates = nfaStates; // Int32Array of Instruction PCs
2134
2141
  this.isMatch = isMatch; // Boolean
2135
- this.nextAscii = new Array(Unicode.MAX_ASCII + 1).fill(null); // Flat array for blisteringly fast ASCII lookups (unanchored)
2142
+ this.nextAscii = new Array(Unicode.MAX_ASCII + 1).fill(null); // Flat array for blisteringly fast ASCII lookups
2136
2143
  this.nextMap = new Map(); // Cache of Char -> DFAState
2137
2144
  }
2138
2145
  }
2139
2146
  class DFA {
2140
2147
  constructor(prog) {
2141
2148
  this.prog = prog;
2142
- this.stateCache = new Map(); // id -> DFAState
2149
+ this.stateCache = new Map(); // hash(number) -> DFAState[]
2150
+ this.stateCount = 0; // Tracks total states for memory limits
2143
2151
  this.startState = null;
2144
- this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection), like RE2 max_mem
2152
+ this.stateLimit = 10000; // Prevent memory explosion (ReDoS protection)
2145
2153
  }
2146
2154
 
2147
2155
  // Follows epsilon (empty) transitions to find all reachable states without consuming a char
@@ -2185,17 +2193,37 @@
2185
2193
  const closureResult = this.computeClosure(pcs);
2186
2194
  if (!closureResult) return null; // Bailout to NFA required
2187
2195
 
2188
- const id = closureResult.pcs.join(',');
2189
- if (this.stateCache.has(id)) {
2190
- return this.stateCache.get(id);
2196
+ const sortedPCs = closureResult.pcs;
2197
+ const hash = hashPCs(sortedPCs);
2198
+
2199
+ // Lookup hash bucket
2200
+ let bucket = this.stateCache.get(hash);
2201
+ if (bucket) {
2202
+ // Resolve potential hash collisions
2203
+ for (let i = 0; i < bucket.length; i++) {
2204
+ const state = bucket[i];
2205
+ if (arraysEqual(state.nfaStates, sortedPCs)) {
2206
+ return state;
2207
+ }
2208
+ }
2209
+ } else {
2210
+ bucket = [];
2211
+ this.stateCache.set(hash, bucket);
2191
2212
  }
2192
2213
 
2193
2214
  // Safety: prevent memory exhaustion from state explosion
2194
- if (this.stateCache.size > this.stateLimit) {
2195
- throw new RE2JSDfaMemoryException('dfa error: Out of memory exception');
2215
+ // We flush the cache and return null, which seamlessly routes execution to the NFA
2216
+ if (this.stateCount >= this.stateLimit) {
2217
+ this.stateCache.clear();
2218
+ this.stateCount = 0;
2219
+ this.startState = null;
2220
+ return null;
2196
2221
  }
2197
- const state = new DFAState(id, closureResult.pcs, closureResult.isMatch);
2198
- this.stateCache.set(id, state);
2222
+
2223
+ // State not found, create it and add to bucket
2224
+ const state = new DFAState(sortedPCs, closureResult.isMatch);
2225
+ bucket.push(state);
2226
+ this.stateCount++;
2199
2227
  return state;
2200
2228
  }
2201
2229
 
@@ -2259,6 +2287,11 @@
2259
2287
  const r = input.step(i);
2260
2288
  const rune = r >> 3;
2261
2289
  const width = r & 7;
2290
+
2291
+ // prevent infinite loop on EOF
2292
+ if (width === 0) {
2293
+ break;
2294
+ }
2262
2295
  currentState = this.step(currentState, rune, anchor);
2263
2296
 
2264
2297
  // If we hit an unrecoverable DFA error or bailout, signal fallback
@@ -5421,18 +5454,10 @@
5421
5454
  if (ncap > 0) {
5422
5455
  return this.doExecuteNFA(input, pos, anchor, ncap);
5423
5456
  }
5424
- try {
5425
- const dfaResult = this.dfa.match(input, pos, anchor);
5426
- if (dfaResult !== null) {
5427
- // DFA succeeded (returned true or false)
5428
- return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5429
- }
5430
- } catch (e) {
5431
- if (e instanceof RE2JSDfaMemoryException) {
5432
- this.dfa = new DFA(this.prog); // flush cache
5433
- } else {
5434
- throw e;
5435
- }
5457
+ const dfaResult = this.dfa.match(input, pos, anchor);
5458
+ if (dfaResult !== null) {
5459
+ // DFA succeeded (returned true or false)
5460
+ return dfaResult ? [] : null; // Return empty array to signify "matched but no captures"
5436
5461
  }
5437
5462
 
5438
5463
  // Fallback to NFA
@@ -6316,6 +6341,39 @@
6316
6341
  return new Matcher(this, input);
6317
6342
  }
6318
6343
 
6344
+ /**
6345
+ * Tests whether the regular expression matches any part of the input string.
6346
+ * Performance Note: This method is highly optimized. Because it only returns
6347
+ * a boolean and does not extract capture groups, it bypasses the `Matcher` overhead
6348
+ * and guarantees execution on the high-speed DFA engine whenever possible.
6349
+ *
6350
+ * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
6351
+ * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
6352
+ */
6353
+ test(input) {
6354
+ if (Array.isArray(input)) {
6355
+ // Reuse the existing UTF-8 fast-path method
6356
+ return this.re2Input.matchUTF8(input);
6357
+ }
6358
+
6359
+ // Reuse the existing UTF-16 fast-path method
6360
+ return this.re2Input.match(input);
6361
+ }
6362
+
6363
+ /**
6364
+ * Tests whether the regular expression matches the ENTIRE input string.
6365
+ * * **Performance Note:** This operates identically to `.matches()`, but is significantly
6366
+ * faster because it does not request capture group data. By requesting 0 capture groups,
6367
+ * it securely routes execution through the DFA fast-path.
6368
+ *
6369
+ * @param {string|number[]} input - The input string or UTF-8 byte array to test against.
6370
+ * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
6371
+ */
6372
+ testExact(input) {
6373
+ const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
6374
+ return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
6375
+ }
6376
+
6319
6377
  /**
6320
6378
  * Splits input around instances of the regular expression. It returns an array giving the strings
6321
6379
  * that occur before, between, and after instances of the regular expression.
@@ -6437,7 +6495,6 @@
6437
6495
  exports.Matcher = Matcher;
6438
6496
  exports.RE2JS = RE2JS;
6439
6497
  exports.RE2JSCompileException = RE2JSCompileException;
6440
- exports.RE2JSDfaMemoryException = RE2JSDfaMemoryException;
6441
6498
  exports.RE2JSException = RE2JSException;
6442
6499
  exports.RE2JSFlagsException = RE2JSFlagsException;
6443
6500
  exports.RE2JSGroupException = RE2JSGroupException;