re2js 2.2.2 → 2.2.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -473,6 +473,8 @@ Parameters:
473
473
  - `$1, $2, ...` refer to the corresponding capture groups in the pattern
474
474
  - `$$` inserts a literal `$`
475
475
  - `$<name>` can be used to reference named capture groups
476
+ - `` $` `` inserts the portion of the string that precedes the matched substring
477
+ - `$'` inserts the portion of the string that follows the matched substring
476
478
  - on invalid group - ignore it
477
479
  - `javaMode (Boolean)`: If set to `true`, the replacement follows Java's rules for replacement. Defaults to `false`. If `javaMode = true`, changed rules for capture groups and special characters:
478
480
  - `$0` refers to the entire matched substring
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.2
5
+ * @version v2.2.3
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -96,7 +96,7 @@ for (let i = 0; i < ASCII_SIZE; i++) {
96
96
  }
97
97
  class Codepoint {
98
98
  // codePointAt(0)
99
- static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
99
+ static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ["'", 39], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['`', 96], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
100
100
 
101
101
  // convert unicode codepoint to upper case codepoint
102
102
  // return same codepoint, if cannot do it (or codepoint not have upper variation)
@@ -150,10 +150,6 @@ class UnicodeRangeTable {
150
150
  getStride(index) {
151
151
  return this.isStride1 ? 1 : this.data[index * this.SIZE + 2];
152
152
  }
153
- get(index) {
154
- const i = index * this.SIZE;
155
- return [this.data[i], this.data[i + 1], this.getStride(index)];
156
- }
157
153
  get length() {
158
154
  return this.data.length / this.SIZE;
159
155
  }
@@ -650,6 +646,9 @@ class Utils {
650
646
  static emptyInts() {
651
647
  return [];
652
648
  }
649
+ static isByteArray(input) {
650
+ return Array.isArray(input) || input instanceof Uint8Array;
651
+ }
653
652
 
654
653
  // Returns true iff |c| is an ASCII letter or decimal digit.
655
654
  static isalnum(c) {
@@ -951,7 +950,7 @@ class Utf16MatcherInput extends MatcherInputBase {
951
950
  * @returns {number[]}
952
951
  */
953
952
  asBytes() {
954
- return this.charSequence.toString().split('').map(s => s.codePointAt(0));
953
+ return Utils.stringToUtf8ByteArray(this.charSequence.toString());
955
954
  }
956
955
 
957
956
  /**
@@ -976,7 +975,7 @@ class MatcherInput {
976
975
  * @returns {Utf8MatcherInput}
977
976
  */
978
977
  static utf8(input) {
979
- if (Array.isArray(input)) {
978
+ if (Utils.isByteArray(input)) {
980
979
  return new Utf8MatcherInput(input);
981
980
  }
982
981
  return new Utf8MatcherInput(Utils.stringToUtf8ByteArray(input));
@@ -1108,10 +1107,10 @@ class MachineUTF8Input extends MachineInputBase {
1108
1107
  if (start < this.start) {
1109
1108
  start = this.start;
1110
1109
  }
1111
- r1 = this.step(start) >> 3;
1110
+ r1 = this.step(start - this.start) >> 3;
1112
1111
  }
1113
1112
  }
1114
- const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
1113
+ const r2 = pos < this.end ? this.step(pos - this.start) >> 3 : -1;
1115
1114
  return Utils.emptyOpContext(r1, r2);
1116
1115
  }
1117
1116
 
@@ -1193,14 +1192,17 @@ class MachineUTF16Input extends MachineInputBase {
1193
1192
  index(re2, pos) {
1194
1193
  pos += this.start;
1195
1194
  const i = this.charSequence.indexOf(re2.prefix, pos);
1196
- return i < 0 ? i : i - pos;
1195
+ if (i < 0 || i > this.end - re2.prefix.length) {
1196
+ return -1;
1197
+ }
1198
+ return i - pos;
1197
1199
  }
1198
1200
 
1199
1201
  // Returns a bitmask of EMPTY_* flags.
1200
1202
  context(pos) {
1201
1203
  pos += this.start;
1202
- const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
1203
- const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
1204
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1205
+ const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1204
1206
  return Utils.emptyOpContext(r1, r2);
1205
1207
  }
1206
1208
  prefixLength(re2) {
@@ -1388,7 +1390,7 @@ class Matcher {
1388
1390
  this.numberOfInstructions = re2.numberOfInstructions();
1389
1391
  if (input instanceof MatcherInputBase) {
1390
1392
  this.resetMatcherInput(input);
1391
- } else if (Array.isArray(input)) {
1393
+ } else if (Utils.isByteArray(input)) {
1392
1394
  this.resetMatcherInput(MatcherInput.utf8(input));
1393
1395
  } else {
1394
1396
  this.resetMatcherInput(MatcherInput.utf16(input));
@@ -1550,10 +1552,7 @@ class Matcher {
1550
1552
  if (group === 0 || this.hasGroups) {
1551
1553
  return;
1552
1554
  }
1553
- let end = this.groups[1] + 1;
1554
- if (end > this.matcherInputLength) {
1555
- end = this.matcherInputLength;
1556
- }
1555
+ const end = this.matcherInputLength;
1557
1556
  const res = this.patternInput.re2().matchMachineInput(this.matcherInput, this.groups[0], end, this.anchorFlag, 1 + this.patternGroupCount);
1558
1557
  const ok = res[0];
1559
1558
  if (!ok) {
@@ -1749,7 +1748,10 @@ class Matcher {
1749
1748
  throw new RE2JSGroupException("named capture group is missing trailing '}'");
1750
1749
  }
1751
1750
  const groupName = replacement.substring(i + 1, j);
1752
- res += this.group(groupName);
1751
+ const groupVal = this.group(groupName);
1752
+ if (groupVal !== null) {
1753
+ res += groupVal;
1754
+ }
1753
1755
  last = j + 1;
1754
1756
  i = j;
1755
1757
  continue;
@@ -1795,6 +1797,22 @@ class Matcher {
1795
1797
  i++;
1796
1798
  last = i + 1;
1797
1799
  continue;
1800
+ } else if (Codepoint.CODES.get('`') === c) {
1801
+ if (last < i) {
1802
+ res += replacement.substring(last, i);
1803
+ }
1804
+ res += this.substring(0, this.start(0));
1805
+ i++;
1806
+ last = i + 1;
1807
+ continue;
1808
+ } else if (Codepoint.CODES.get("'") === c) {
1809
+ if (last < i) {
1810
+ res += replacement.substring(last, i);
1811
+ }
1812
+ res += this.substring(this.end(0), this.matcherInputLength);
1813
+ i++;
1814
+ last = i + 1;
1815
+ continue;
1798
1816
  } else if (Codepoint.CODES.get('1') <= c && c <= Codepoint.CODES.get('9')) {
1799
1817
  let n = c - Codepoint.CODES.get('0');
1800
1818
  if (last < i) {
@@ -1837,7 +1855,10 @@ class Matcher {
1837
1855
  }
1838
1856
  const groupName = replacement.substring(i + 1, j);
1839
1857
  if (Object.prototype.hasOwnProperty.call(this.namedGroups, groupName)) {
1840
- res += this.group(groupName);
1858
+ const groupVal = this.group(groupName);
1859
+ if (groupVal !== null) {
1860
+ res += groupVal;
1861
+ }
1841
1862
  } else {
1842
1863
  res += `$<${groupName}>`;
1843
1864
  }
@@ -4339,13 +4360,6 @@ class Prog {
4339
4360
  // start every program with a fail instruction, so we'll never want to point
4340
4361
  // at its output link.
4341
4362
 
4342
- next(l) {
4343
- const i = this.inst[l >> 1];
4344
- if ((l & 1) === 0) {
4345
- return i.out;
4346
- }
4347
- return i.arg;
4348
- }
4349
4363
  patch(l, val) {
4350
4364
  let head = l.head;
4351
4365
  while (head !== 0) {
@@ -5675,6 +5689,7 @@ class Parser {
5675
5689
  case Codepoint.CODES.get('6'):
5676
5690
  case Codepoint.CODES.get('7'):
5677
5691
  {
5692
+ // Single non-zero digit is a backreference; not supported
5678
5693
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
5679
5694
  break;
5680
5695
  }
@@ -5682,6 +5697,7 @@ class Parser {
5682
5697
  // eslint-disable-next-line no-fallthrough
5683
5698
  case Codepoint.CODES.get('0'):
5684
5699
  {
5700
+ // Consume up to three octal digits; already have one.
5685
5701
  let r = c - Codepoint.CODES.get('0');
5686
5702
  for (let i = 1; i < 3; i++) {
5687
5703
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
@@ -7462,7 +7478,11 @@ class RE2 {
7462
7478
  */
7463
7479
  matchWithGroup(input, start, end, anchor, ngroup) {
7464
7480
  if (!(input instanceof MatcherInputBase)) {
7465
- input = MatcherInput.utf16(input);
7481
+ if (Utils.isByteArray(input)) {
7482
+ input = MatcherInput.utf8(input);
7483
+ } else {
7484
+ input = MatcherInput.utf16(input);
7485
+ }
7466
7486
  }
7467
7487
  return this.matchMachineInput(input, start, end, anchor, ngroup);
7468
7488
  }
@@ -7968,7 +7988,7 @@ class RE2Set {
7968
7988
  }
7969
7989
  match(input) {
7970
7990
  if (!this.prog) this.compile();
7971
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
7991
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
7972
7992
  let internalAnchor = RE2Flags.UNANCHORED;
7973
7993
  if (this.anchor === RE2Set.ANCHOR_START) {
7974
7994
  internalAnchor = RE2Flags.ANCHOR_START;
@@ -8025,45 +8045,50 @@ class TranslateRegExpString {
8025
8045
  {
8026
8046
  if (i + 2 < size) {
8027
8047
  let nextCh = data[i + 2];
8028
- if (TranslateRegExpString.isUpperCaseAlpha(nextCh)) {
8048
+ let code = nextCh.charCodeAt(0);
8049
+ if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
8050
+ let val = code % 32;
8029
8051
  result += '\\x';
8030
- result += (nextCh.charCodeAt(0) - 64 >> 4).toString(16).toUpperCase();
8031
- result += (nextCh.charCodeAt(0) - 64 & 15).toString(16).toUpperCase();
8052
+ result += (val >> 4).toString(16).toUpperCase();
8053
+ result += (val & 15).toString(16).toUpperCase();
8032
8054
  i += 3;
8033
8055
  changed = true;
8034
8056
  continue;
8035
8057
  }
8036
8058
  }
8037
- result += '\\c';
8059
+ result += 'c';
8038
8060
  i += 2;
8061
+ changed = true;
8039
8062
  continue;
8040
8063
  }
8041
8064
  case 'u':
8042
8065
  {
8043
8066
  if (i + 2 < size) {
8044
8067
  let nextCh = data[i + 2];
8045
- if (TranslateRegExpString.isHexadecimal(nextCh)) {
8046
- result += '\\x{' + nextCh;
8047
- i += 3;
8048
- for (let j = 0; j < 3 && i < size; ++i, ++j) {
8049
- nextCh = data[i];
8050
- if (!TranslateRegExpString.isHexadecimal(nextCh)) {
8051
- break;
8052
- }
8053
- result += nextCh;
8054
- }
8055
- result += '}';
8056
- changed = true;
8057
- continue;
8058
- } else if (nextCh === '{') {
8068
+ if (nextCh === '{') {
8059
8069
  result += '\\x';
8060
8070
  i += 2;
8061
8071
  changed = true;
8062
8072
  continue;
8073
+ } else if (i + 5 < size) {
8074
+ let isHex4 = true;
8075
+ for (let j = 0; j < 4; j++) {
8076
+ if (!TranslateRegExpString.isHexadecimal(data[i + 2 + j])) {
8077
+ isHex4 = false;
8078
+ break;
8079
+ }
8080
+ }
8081
+ if (isHex4) {
8082
+ result += '\\x{' + data.substring(i + 2, i + 6) + '}';
8083
+ i += 6;
8084
+ changed = true;
8085
+ continue;
8086
+ }
8063
8087
  }
8064
8088
  }
8065
- result += '\\u';
8089
+ result += 'u';
8066
8090
  i += 2;
8091
+ changed = true;
8067
8092
  continue;
8068
8093
  }
8069
8094
  default:
@@ -8298,7 +8323,7 @@ class RE2JS {
8298
8323
  * @returns {Matcher}
8299
8324
  */
8300
8325
  matcher(input) {
8301
- if (Array.isArray(input)) {
8326
+ if (Utils.isByteArray(input)) {
8302
8327
  input = MatcherInput.utf8(input);
8303
8328
  }
8304
8329
  return new Matcher(this, input);
@@ -8314,7 +8339,7 @@ class RE2JS {
8314
8339
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8315
8340
  */
8316
8341
  test(input) {
8317
- if (Array.isArray(input)) {
8342
+ if (Utils.isByteArray(input)) {
8318
8343
  // Reuse the existing UTF-8 fast-path method
8319
8344
  return this.re2Input.matchUTF8(input);
8320
8345
  }
@@ -8333,7 +8358,7 @@ class RE2JS {
8333
8358
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8334
8359
  */
8335
8360
  testExact(input) {
8336
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8361
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8337
8362
  return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
8338
8363
  }
8339
8364