re2js 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  * re2js
3
3
  * RE2JS is the JavaScript port of RE2, a regular expression engine that provides linear time matching
4
4
  *
5
- * @version v2.3.0
5
+ * @version v2.3.2
6
6
  * @author Oleksii Vasyliev
7
7
  * @homepage https://github.com/le0pard/re2js#readme
8
8
  * @repository github:le0pard/re2js
@@ -633,6 +633,31 @@
633
633
  }
634
634
  }
635
635
 
636
+ /**
637
+ * Size of the precomputed single-byte lookup table.
638
+ * Covers standard ASCII and Latin-1 characters for fast-path execution.
639
+ */
640
+ const FAST_PATH_TABLE_SIZE = 256;
641
+ /**
642
+ * Precomputed lookup table for Word Boundary (\b, \B) assertions.
643
+ * By precomputing the boolean results for standard ASCII word ranges
644
+ * ('a'-'z', 'A'-'Z', '0'-'9', '_'), we completely eliminate 4 logical
645
+ * branches from the NFA's hot execution loop. This prevents costly
646
+ * CPU branch mispredictions when scanning large strings.
647
+ */
648
+ const WORD_RUNE_TABLE = new Uint8Array(FAST_PATH_TABLE_SIZE);
649
+ for (let i = 0; i < FAST_PATH_TABLE_SIZE; i++) {
650
+ WORD_RUNE_TABLE[i] = 97 <= i && i <= 122 ||
651
+ // 'a' - 'z'
652
+ 65 <= i && i <= 90 ||
653
+ // 'A' - 'Z'
654
+ 48 <= i && i <= 57 ||
655
+ // '0' - '9'
656
+ i === 95 // '_'
657
+ ? 1 : 0;
658
+ }
659
+ let cachedNativeEncoder = null;
660
+ let cachedNativeDecoder = null;
636
661
  /**
637
662
  * Various constants and helper utilities.
638
663
  */
@@ -731,12 +756,21 @@
731
756
  return out;
732
757
  }
733
758
 
734
- // Returns the array of runes in the specified Java UTF-16 string.
759
+ // Returns the array of runes in the specified JS UTF-16 string.
735
760
  static stringToRunes(str) {
736
- return Array.from(String(str)).map(s => s.codePointAt(0));
761
+ const string = String(str);
762
+ const runes = [];
763
+ let i = 0;
764
+ while (i < string.length) {
765
+ const cp = string.codePointAt(i);
766
+ runes.push(cp);
767
+ // Code points above 0xFFFF (emoji, etc.) occupy two UTF-16 code units (a surrogate pair)
768
+ i += cp > Unicode.MAX_BMP ? 2 : 1;
769
+ }
770
+ return runes;
737
771
  }
738
772
 
739
- // Returns the Java UTF-16 string containing the single rune |r|.
773
+ // Returns the JS UTF-16 string containing the single rune |r|.
740
774
  static runeToString(r) {
741
775
  return String.fromCodePoint(r);
742
776
  }
@@ -745,7 +779,7 @@
745
779
  // during the evaluation of the \b and \B zero-width assertions.
746
780
  // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
747
781
  static isWordRune(r) {
748
- return Codepoint.CODES.get('a') <= r && r <= Codepoint.CODES.get('z') || Codepoint.CODES.get('A') <= r && r <= Codepoint.CODES.get('Z') || Codepoint.CODES.get('0') <= r && r <= Codepoint.CODES.get('9') || r === Codepoint.CODES.get('_');
782
+ return r < FAST_PATH_TABLE_SIZE ? WORD_RUNE_TABLE[r] === 1 : false;
749
783
  }
750
784
 
751
785
  // emptyOpContext returns the zero-width assertions satisfied at the position
@@ -758,21 +792,24 @@
758
792
  static emptyOpContext(r1, r2) {
759
793
  let op = 0;
760
794
  if (r1 < 0) {
761
- op |= this.EMPTY_BEGIN_TEXT | this.EMPTY_BEGIN_LINE;
795
+ op |= Utils.EMPTY_BEGIN_TEXT | Utils.EMPTY_BEGIN_LINE;
762
796
  }
763
- if (r1 === Codepoint.CODES.get('\n')) {
764
- op |= this.EMPTY_BEGIN_LINE;
797
+ // Hardcode 10 for '\n'
798
+ if (r1 === 10) {
799
+ op |= Utils.EMPTY_BEGIN_LINE;
765
800
  }
766
801
  if (r2 < 0) {
767
- op |= this.EMPTY_END_TEXT | this.EMPTY_END_LINE;
802
+ op |= Utils.EMPTY_END_TEXT | Utils.EMPTY_END_LINE;
768
803
  }
769
- if (r2 === Codepoint.CODES.get('\n')) {
770
- op |= this.EMPTY_END_LINE;
804
+
805
+ // Hardcode 10 for '\n'
806
+ if (r2 === 10) {
807
+ op |= Utils.EMPTY_END_LINE;
771
808
  }
772
- if (this.isWordRune(r1) !== this.isWordRune(r2)) {
773
- op |= this.EMPTY_WORD_BOUNDARY;
809
+ if (Utils.isWordRune(r1) !== Utils.isWordRune(r2)) {
810
+ op |= Utils.EMPTY_WORD_BOUNDARY;
774
811
  } else {
775
- op |= this.EMPTY_NO_WORD_BOUNDARY;
812
+ op |= Utils.EMPTY_NO_WORD_BOUNDARY;
776
813
  }
777
814
  return op;
778
815
  }
@@ -796,9 +833,23 @@
796
833
  static charCount(codePoint) {
797
834
  return codePoint > Unicode.MAX_BMP ? 2 : 1;
798
835
  }
836
+
837
+ /**
838
+ * High-speed conversion from TypedArrays to standard JS Arrays.
839
+ * Bypasses the expensive Symbol.iterator overhead of Array.from()
840
+ */
841
+ static toArray(typedArray) {
842
+ const len = typedArray.length;
843
+ const res = new Array(len);
844
+ for (let i = 0; i < len; i++) {
845
+ res[i] = typedArray[i];
846
+ }
847
+ return res;
848
+ }
799
849
  static stringToUtf8ByteArray(str) {
800
850
  if (globalThis.TextEncoder) {
801
- return Array.from(new TextEncoder().encode(str));
851
+ if (!cachedNativeEncoder) cachedNativeEncoder = new TextEncoder();
852
+ return Utils.toArray(cachedNativeEncoder.encode(str));
802
853
  } else {
803
854
  // fallback, if no TextEncoder
804
855
  let out = [],
@@ -828,7 +879,9 @@
828
879
  }
829
880
  static utf8ByteArrayToString(bytes) {
830
881
  if (globalThis.TextDecoder) {
831
- return new TextDecoder('utf-8').decode(new Uint8Array(bytes));
882
+ if (!cachedNativeDecoder) cachedNativeDecoder = new TextDecoder('utf-8');
883
+ const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
884
+ return cachedNativeDecoder.decode(view);
832
885
  } else {
833
886
  // fallback, if no TextDecoder
834
887
  let out = [],
@@ -1125,15 +1178,34 @@
1125
1178
  if (targetLength === 0) {
1126
1179
  return fromIndex <= this.end ? fromIndex : -1;
1127
1180
  }
1181
+ const firstByte = target[0];
1128
1182
  let limit = this.end - targetLength;
1129
- for (let i = fromIndex; i <= limit; i++) {
1130
- for (let j = 0; j < targetLength; j++) {
1183
+ // Feature detection: Native TypedArray indexOf (ES2015)
1184
+ const hasNativeIndexOf = typeof source.indexOf === 'function';
1185
+ let i = fromIndex;
1186
+ while (i <= limit) {
1187
+ // Fast-forward to the first matching byte using the engine's built-in indexOf if available
1188
+ if (hasNativeIndexOf) {
1189
+ i = source.indexOf(firstByte, i);
1190
+ if (i === -1 || i > limit) return -1;
1191
+ } else {
1192
+ // Fallback: Manual loop
1193
+ while (i <= limit && source[i] !== firstByte) i++;
1194
+ if (i > limit) return -1;
1195
+ }
1196
+
1197
+ // First byte matches, verify the rest of the target sequence
1198
+ let match = true;
1199
+ for (let j = 1; j < targetLength; j++) {
1131
1200
  if (source[i + j] !== target[j]) {
1201
+ match = false;
1132
1202
  break;
1133
- } else if (j === targetLength - 1) {
1134
- return i;
1135
1203
  }
1136
1204
  }
1205
+ if (match) {
1206
+ return i;
1207
+ }
1208
+ i++;
1137
1209
  }
1138
1210
  return -1;
1139
1211
  }
@@ -1205,8 +1277,10 @@
1205
1277
  // Returns a bitmask of EMPTY_* flags.
1206
1278
  context(pos) {
1207
1279
  pos += this.start;
1208
- const r1 = pos > this.start && pos <= this.end ? this.charSequence.codePointAt(pos - 1) : -1;
1209
- const r2 = pos < this.end ? this.charSequence.codePointAt(pos) : -1;
1280
+
1281
+ // Only ASCII word runes and newlines affect the context flags, so a single UTF-16 code unit is enough (a surrogate half is never ASCII)
1282
+ const r1 = pos > this.start && pos <= this.end ? this.charSequence.charCodeAt(pos - 1) : -1;
1283
+ const r2 = pos < this.end ? this.charSequence.charCodeAt(pos) : -1;
1210
1284
  return Utils.emptyOpContext(r1, r2);
1211
1285
  }
1212
1286
  prefixLength(re2) {
@@ -1336,10 +1410,6 @@
1336
1410
  * @author rsc@google.com (Russ Cox)
1337
1411
  */
1338
1412
 
1339
- /**
1340
- * @typedef {import('./index').RE2JS} RE2JS_Pattern
1341
- */
1342
-
1343
1413
  class Matcher {
1344
1414
  /**
1345
1415
  * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
@@ -1377,8 +1447,8 @@
1377
1447
  }
1378
1448
  /**
1379
1449
  *
1380
- * @param {RE2JS_Pattern} pattern
1381
- * @param {Uint8Array|number[]|string} input
1450
+ * @param {RE2JS} pattern
1451
+ * @param {string|number[]|Uint8Array} input
1382
1452
  */
1383
1453
  constructor(pattern, input) {
1384
1454
  if (pattern === null) {
@@ -1386,7 +1456,7 @@
1386
1456
  }
1387
1457
  /**
1388
1458
  * The pattern being matched.
1389
- * @type {RE2JS_Pattern}
1459
+ * @type {RE2JS}
1390
1460
  */
1391
1461
  this.patternInput = pattern;
1392
1462
  const re2 = this.patternInput.re2();
@@ -1411,7 +1481,7 @@
1411
1481
 
1412
1482
  /**
1413
1483
  * Returns the {@code RE2JS} associated with this {@code Matcher}.
1414
- * @returns {RE2JS_Pattern}
1484
+ * @returns {RE2JS}
1415
1485
  */
1416
1486
  pattern() {
1417
1487
  return this.patternInput;
@@ -1441,7 +1511,7 @@
1441
1511
 
1442
1512
  /**
1443
1513
  * Resets the {@code Matcher} and changes the input.
1444
- * @param {import('./MatcherInput').MatcherInputBase} input
1514
+ * @param {MatcherInputBase} input
1445
1515
  * @returns {Matcher} the {@code Matcher} itself, for chained method calls
1446
1516
  */
1447
1517
  resetMatcherInput(input) {
@@ -2217,7 +2287,7 @@
2217
2287
  return Utils.emptyInts();
2218
2288
  }
2219
2289
  // Use subarray() to create a zero-allocation view before converting
2220
- return Array.from(this.matchcap.subarray(0, this.ncap));
2290
+ return Utils.toArray(this.matchcap.subarray(0, this.ncap));
2221
2291
  }
2222
2292
 
2223
2293
  // alloc() allocates a new thread with the given instruction.
@@ -3179,7 +3249,7 @@
3179
3249
  }
3180
3250
 
3181
3251
  // Must slice so we don't accidentally leak trailing arrays from previously recycled typed arrays
3182
- const result = ncap === 0 ? [] : Array.from(b.matchcap.subarray(0, ncap));
3252
+ const result = ncap === 0 ? [] : Utils.toArray(b.matchcap.subarray(0, ncap));
3183
3253
  bitStatePool.push(b);
3184
3254
  return result;
3185
3255
  }
@@ -3555,7 +3625,7 @@
3555
3625
  matchcap[0] = 0;
3556
3626
  matchcap[1] = pos;
3557
3627
  }
3558
- return ncap === 0 ? [] : Array.from(matchcap);
3628
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3559
3629
  }
3560
3630
  case Inst.RUNE:
3561
3631
  if (!inst.matchRune(rune)) return null;
@@ -3604,7 +3674,7 @@
3604
3674
  }
3605
3675
  }
3606
3676
  if (!matched) return null;
3607
- return ncap === 0 ? [] : Array.from(matchcap);
3677
+ return ncap === 0 ? [] : Utils.toArray(matchcap);
3608
3678
  }
3609
3679
  }
3610
3680
 
@@ -5358,7 +5428,7 @@
5358
5428
  return r;
5359
5429
  }
5360
5430
  lookingAt(s) {
5361
- return this.rest().startsWith(s);
5431
+ return this.str.startsWith(s, this.position);
5362
5432
  }
5363
5433
 
5364
5434
  // Returns the rest of the pattern as a Java UTF-16 string.