re2js 2.2.1 → 2.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2023 Alexey Vasiliev
3
+ Copyright (c) 2023 Oleksii Vasyliev
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
package/README.md CHANGED
@@ -473,6 +473,8 @@ Parameters:
473
473
  - `$1, $2, ...` refer to the corresponding capture groups in the pattern
474
474
  - `$$` inserts a literal `$`
475
475
  - `$<name>` can be used to reference named capture groups
476
+ - `` $` `` inserts the portion of the string that precedes the matched substring
477
+ - `$'` inserts the portion of the string that follows the matched substring
476
478
  - on invalid group - ignore it
477
479
  - `javaMode (Boolean)`: If set to `true`, the replacement follows Java's rules for replacement. Defaults to `false`. If `javaMode = true`, changed rules for capture groups and special characters:
478
480
  - `$0` refers to the entire matched substring
@@ -2,8 +2,8 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.2.1
6
- * @author Alexey Vasiliev
5
+ * @version v2.2.3
6
+ * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
9
9
  * @license MIT
@@ -96,7 +96,7 @@ for (let i = 0; i < ASCII_SIZE; i++) {
96
96
  }
97
97
  class Codepoint {
98
98
  // codePointAt(0)
99
- static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
99
+ static CODES = new Map([['\x07', 7], ['\b', 8], ['\t', 9], ['\n', 10], ['\v', 11], ['\f', 12], ['\r', 13], [' ', 32], ['"', 34], ['$', 36], ['&', 38], ["'", 39], ['(', 40], [')', 41], ['*', 42], ['+', 43], ['-', 45], ['.', 46], ['0', 48], ['1', 49], ['2', 50], ['3', 51], ['4', 52], ['5', 53], ['6', 54], ['7', 55], ['8', 56], ['9', 57], [':', 58], ['<', 60], ['>', 62], ['?', 63], ['A', 65], ['B', 66], ['C', 67], ['F', 70], ['P', 80], ['Q', 81], ['U', 85], ['Z', 90], ['[', 91], ['\\', 92], [']', 93], ['^', 94], ['_', 95], ['`', 96], ['a', 97], ['b', 98], ['f', 102], ['i', 105], ['m', 109], ['n', 110], ['r', 114], ['s', 115], ['t', 116], ['v', 118], ['x', 120], ['z', 122], ['{', 123], ['|', 124], ['}', 125]]);
100
100
 
101
101
  // convert unicode codepoint to upper case codepoint
102
102
  // return same codepoint, if cannot do it (or codepoint not have upper variation)
@@ -150,10 +150,6 @@ class UnicodeRangeTable {
150
150
  getStride(index) {
151
151
  return this.isStride1 ? 1 : this.data[index * this.SIZE + 2];
152
152
  }
153
- get(index) {
154
- const i = index * this.SIZE;
155
- return [this.data[i], this.data[i + 1], this.getStride(index)];
156
- }
157
153
  get length() {
158
154
  return this.data.length / this.SIZE;
159
155
  }
@@ -650,6 +646,9 @@ class Utils {
650
646
  static emptyInts() {
651
647
  return [];
652
648
  }
649
+ static isByteArray(input) {
650
+ return Array.isArray(input) || input instanceof Uint8Array;
651
+ }
653
652
 
654
653
  // Returns true iff |c| is an ASCII letter or decimal digit.
655
654
  static isalnum(c) {
@@ -951,7 +950,7 @@ class Utf16MatcherInput extends MatcherInputBase {
951
950
  * @returns {number[]}
952
951
  */
953
952
  asBytes() {
954
- return this.charSequence.toString().split('').map(s => s.codePointAt(0));
953
+ return Utils.stringToUtf8ByteArray(this.charSequence.toString());
955
954
  }
956
955
 
957
956
  /**
@@ -976,7 +975,7 @@ class MatcherInput {
976
975
  * @returns {Utf8MatcherInput}
977
976
  */
978
977
  static utf8(input) {
979
- if (Array.isArray(input)) {
978
+ if (Utils.isByteArray(input)) {
980
979
  return new Utf8MatcherInput(input);
981
980
  }
982
981
  return new Utf8MatcherInput(Utils.stringToUtf8ByteArray(input));
@@ -1108,10 +1107,10 @@ class MachineUTF8Input extends MachineInputBase {
1108
1107
  if (start < this.start) {
1109
1108
  start = this.start;
1110
1109
  }
1111
- r1 = this.step(start) >> 3;
1110
+ r1 = this.step(start - this.start) >> 3;
1112
1111
  }
1113
1112
  }
1114
- const r2 = pos < this.end ? this.step(pos) >> 3 : -1;
1113
+ const r2 = pos < this.end ? this.step(pos - this.start) >> 3 : -1;
1115
1114
  return Utils.emptyOpContext(r1, r2);
1116
1115
  }
1117
1116
 
@@ -1193,14 +1192,17 @@ class MachineUTF16Input extends MachineInputBase {
1193
1192
  index(re2, pos) {
1194
1193
  pos += this.start;
1195
1194
  const i = this.charSequence.indexOf(re2.prefix, pos);
1196
- return i < 0 ? i : i - pos;
1195
+ if (i < 0 || i > this.end - re2.prefix.length) {
1196
+ return -1;
1197
+ }
1198
+ return i - pos;
1197
1199
  }
1198
1200
 
1199
1201
  // Returns a bitmask of EMPTY_* flags.
1200
1202
  context(pos) {
1201
1203
  pos += this.start;
1202
- const r1 = pos > 0 && pos <= this.charSequence.length ? this.charSequence.codePointAt(pos - 1) : -1;
1203
- const r2 = pos < this.charSequence.length ? this.charSequence.codePointAt(pos) : -1;
1204
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1205
+ const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1204
1206
  return Utils.emptyOpContext(r1, r2);
1205
1207
  }
1206
1208
  prefixLength(re2) {
@@ -1388,7 +1390,7 @@ class Matcher {
1388
1390
  this.numberOfInstructions = re2.numberOfInstructions();
1389
1391
  if (input instanceof MatcherInputBase) {
1390
1392
  this.resetMatcherInput(input);
1391
- } else if (Array.isArray(input)) {
1393
+ } else if (Utils.isByteArray(input)) {
1392
1394
  this.resetMatcherInput(MatcherInput.utf8(input));
1393
1395
  } else {
1394
1396
  this.resetMatcherInput(MatcherInput.utf16(input));
@@ -1550,10 +1552,7 @@ class Matcher {
1550
1552
  if (group === 0 || this.hasGroups) {
1551
1553
  return;
1552
1554
  }
1553
- let end = this.groups[1] + 1;
1554
- if (end > this.matcherInputLength) {
1555
- end = this.matcherInputLength;
1556
- }
1555
+ const end = this.matcherInputLength;
1557
1556
  const res = this.patternInput.re2().matchMachineInput(this.matcherInput, this.groups[0], end, this.anchorFlag, 1 + this.patternGroupCount);
1558
1557
  const ok = res[0];
1559
1558
  if (!ok) {
@@ -1749,7 +1748,10 @@ class Matcher {
1749
1748
  throw new RE2JSGroupException("named capture group is missing trailing '}'");
1750
1749
  }
1751
1750
  const groupName = replacement.substring(i + 1, j);
1752
- res += this.group(groupName);
1751
+ const groupVal = this.group(groupName);
1752
+ if (groupVal !== null) {
1753
+ res += groupVal;
1754
+ }
1753
1755
  last = j + 1;
1754
1756
  i = j;
1755
1757
  continue;
@@ -1795,6 +1797,22 @@ class Matcher {
1795
1797
  i++;
1796
1798
  last = i + 1;
1797
1799
  continue;
1800
+ } else if (Codepoint.CODES.get('`') === c) {
1801
+ if (last < i) {
1802
+ res += replacement.substring(last, i);
1803
+ }
1804
+ res += this.substring(0, this.start(0));
1805
+ i++;
1806
+ last = i + 1;
1807
+ continue;
1808
+ } else if (Codepoint.CODES.get("'") === c) {
1809
+ if (last < i) {
1810
+ res += replacement.substring(last, i);
1811
+ }
1812
+ res += this.substring(this.end(0), this.matcherInputLength);
1813
+ i++;
1814
+ last = i + 1;
1815
+ continue;
1798
1816
  } else if (Codepoint.CODES.get('1') <= c && c <= Codepoint.CODES.get('9')) {
1799
1817
  let n = c - Codepoint.CODES.get('0');
1800
1818
  if (last < i) {
@@ -1837,7 +1855,10 @@ class Matcher {
1837
1855
  }
1838
1856
  const groupName = replacement.substring(i + 1, j);
1839
1857
  if (Object.prototype.hasOwnProperty.call(this.namedGroups, groupName)) {
1840
- res += this.group(groupName);
1858
+ const groupVal = this.group(groupName);
1859
+ if (groupVal !== null) {
1860
+ res += groupVal;
1861
+ }
1841
1862
  } else {
1842
1863
  res += `$<${groupName}>`;
1843
1864
  }
@@ -3343,7 +3364,9 @@ const makeOnePass = p => {
3343
3364
  }
3344
3365
  runes.sort((a, b) => a - b);
3345
3366
  } else {
3346
- runes.push(...inst.runes);
3367
+ for (let j = 0; j < inst.runes.length; j++) {
3368
+ runes.push(inst.runes[j]);
3369
+ }
3347
3370
  }
3348
3371
  onePassRunes[pc] = runes;
3349
3372
  inst.next = new Uint32Array(Math.floor(runes.length / 2) + 1).fill(inst.out);
@@ -4170,7 +4193,9 @@ class PrefilterTree {
4170
4193
  return new Prefilter(Prefilter.Type.NONE);
4171
4194
  }
4172
4195
  if (s.type === Prefilter.Type.OR) {
4173
- newSubs.push(...s.subs);
4196
+ for (let j = 0; j < s.subs.length; j++) {
4197
+ newSubs.push(s.subs[j]);
4198
+ }
4174
4199
  } else {
4175
4200
  newSubs.push(s);
4176
4201
  }
@@ -4335,13 +4360,6 @@ class Prog {
4335
4360
  // start every program with a fail instruction, so we'll never want to point
4336
4361
  // at its output link.
4337
4362
 
4338
- next(l) {
4339
- const i = this.inst[l >> 1];
4340
- if ((l & 1) === 0) {
4341
- return i.out;
4342
- }
4343
- return i.arg;
4344
- }
4345
4363
  patch(l, val) {
4346
4364
  let head = l.head;
4347
4365
  while (head !== 0) {
@@ -5671,6 +5689,7 @@ class Parser {
5671
5689
  case Codepoint.CODES.get('6'):
5672
5690
  case Codepoint.CODES.get('7'):
5673
5691
  {
5692
+ // Single non-zero digit is a backreference; not supported
5674
5693
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
5675
5694
  break;
5676
5695
  }
@@ -5678,6 +5697,7 @@ class Parser {
5678
5697
  // eslint-disable-next-line no-fallthrough
5679
5698
  case Codepoint.CODES.get('0'):
5680
5699
  {
5700
+ // Consume up to three octal digits; already have one.
5681
5701
  let r = c - Codepoint.CODES.get('0');
5682
5702
  for (let i = 1; i < 3; i++) {
5683
5703
  if (!t.more() || t.peek() < Codepoint.CODES.get('0') || t.peek() > Codepoint.CODES.get('7')) {
@@ -7458,7 +7478,11 @@ class RE2 {
7458
7478
  */
7459
7479
  matchWithGroup(input, start, end, anchor, ngroup) {
7460
7480
  if (!(input instanceof MatcherInputBase)) {
7461
- input = MatcherInput.utf16(input);
7481
+ if (Utils.isByteArray(input)) {
7482
+ input = MatcherInput.utf8(input);
7483
+ } else {
7484
+ input = MatcherInput.utf16(input);
7485
+ }
7462
7486
  }
7463
7487
  return this.matchMachineInput(input, start, end, anchor, ngroup);
7464
7488
  }
@@ -7964,7 +7988,7 @@ class RE2Set {
7964
7988
  }
7965
7989
  match(input) {
7966
7990
  if (!this.prog) this.compile();
7967
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
7991
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
7968
7992
  let internalAnchor = RE2Flags.UNANCHORED;
7969
7993
  if (this.anchor === RE2Set.ANCHOR_START) {
7970
7994
  internalAnchor = RE2Flags.ANCHOR_START;
@@ -8021,45 +8045,50 @@ class TranslateRegExpString {
8021
8045
  {
8022
8046
  if (i + 2 < size) {
8023
8047
  let nextCh = data[i + 2];
8024
- if (TranslateRegExpString.isUpperCaseAlpha(nextCh)) {
8048
+ let code = nextCh.charCodeAt(0);
8049
+ if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
8050
+ let val = code % 32;
8025
8051
  result += '\\x';
8026
- result += (nextCh.charCodeAt(0) - 64 >> 4).toString(16).toUpperCase();
8027
- result += (nextCh.charCodeAt(0) - 64 & 15).toString(16).toUpperCase();
8052
+ result += (val >> 4).toString(16).toUpperCase();
8053
+ result += (val & 15).toString(16).toUpperCase();
8028
8054
  i += 3;
8029
8055
  changed = true;
8030
8056
  continue;
8031
8057
  }
8032
8058
  }
8033
- result += '\\c';
8059
+ result += 'c';
8034
8060
  i += 2;
8061
+ changed = true;
8035
8062
  continue;
8036
8063
  }
8037
8064
  case 'u':
8038
8065
  {
8039
8066
  if (i + 2 < size) {
8040
8067
  let nextCh = data[i + 2];
8041
- if (TranslateRegExpString.isHexadecimal(nextCh)) {
8042
- result += '\\x{' + nextCh;
8043
- i += 3;
8044
- for (let j = 0; j < 3 && i < size; ++i, ++j) {
8045
- nextCh = data[i];
8046
- if (!TranslateRegExpString.isHexadecimal(nextCh)) {
8047
- break;
8048
- }
8049
- result += nextCh;
8050
- }
8051
- result += '}';
8052
- changed = true;
8053
- continue;
8054
- } else if (nextCh === '{') {
8068
+ if (nextCh === '{') {
8055
8069
  result += '\\x';
8056
8070
  i += 2;
8057
8071
  changed = true;
8058
8072
  continue;
8073
+ } else if (i + 5 < size) {
8074
+ let isHex4 = true;
8075
+ for (let j = 0; j < 4; j++) {
8076
+ if (!TranslateRegExpString.isHexadecimal(data[i + 2 + j])) {
8077
+ isHex4 = false;
8078
+ break;
8079
+ }
8080
+ }
8081
+ if (isHex4) {
8082
+ result += '\\x{' + data.substring(i + 2, i + 6) + '}';
8083
+ i += 6;
8084
+ changed = true;
8085
+ continue;
8086
+ }
8059
8087
  }
8060
8088
  }
8061
- result += '\\u';
8089
+ result += 'u';
8062
8090
  i += 2;
8091
+ changed = true;
8063
8092
  continue;
8064
8093
  }
8065
8094
  default:
@@ -8294,7 +8323,7 @@ class RE2JS {
8294
8323
  * @returns {Matcher}
8295
8324
  */
8296
8325
  matcher(input) {
8297
- if (Array.isArray(input)) {
8326
+ if (Utils.isByteArray(input)) {
8298
8327
  input = MatcherInput.utf8(input);
8299
8328
  }
8300
8329
  return new Matcher(this, input);
@@ -8310,7 +8339,7 @@ class RE2JS {
8310
8339
  * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise.
8311
8340
  */
8312
8341
  test(input) {
8313
- if (Array.isArray(input)) {
8342
+ if (Utils.isByteArray(input)) {
8314
8343
  // Reuse the existing UTF-8 fast-path method
8315
8344
  return this.re2Input.matchUTF8(input);
8316
8345
  }
@@ -8329,7 +8358,7 @@ class RE2JS {
8329
8358
  * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise.
8330
8359
  */
8331
8360
  testExact(input) {
8332
- const machineInput = Array.isArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8361
+ const machineInput = Utils.isByteArray(input) ? MachineInput.fromUTF8(input) : MachineInput.fromUTF16(input);
8333
8362
  return this.re2Input.executeEngine(machineInput, 0, RE2Flags.ANCHOR_BOTH, 0) !== null;
8334
8363
  }
8335
8364