@bufbuild/re2 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +30 -0
  3. package/dist/cjs/CharClass.d.ts +30 -0
  4. package/dist/cjs/CharClass.js +284 -0
  5. package/dist/cjs/CharGroup.d.ts +8 -0
  6. package/dist/cjs/CharGroup.js +83 -0
  7. package/dist/cjs/Codepoint.d.ts +3 -0
  8. package/dist/cjs/Codepoint.js +62 -0
  9. package/dist/cjs/Compiler.d.ts +40 -0
  10. package/dist/cjs/Compiler.js +262 -0
  11. package/dist/cjs/DFA.d.ts +36 -0
  12. package/dist/cjs/DFA.js +350 -0
  13. package/dist/cjs/Inst.d.ts +26 -0
  14. package/dist/cjs/Inst.js +86 -0
  15. package/dist/cjs/MachineInput.d.ts +17 -0
  16. package/dist/cjs/MachineInput.js +72 -0
  17. package/dist/cjs/Parser.d.ts +111 -0
  18. package/dist/cjs/Parser.js +1538 -0
  19. package/dist/cjs/Prefilter.d.ts +19 -0
  20. package/dist/cjs/Prefilter.js +163 -0
  21. package/dist/cjs/Prog.d.ts +39 -0
  22. package/dist/cjs/Prog.js +154 -0
  23. package/dist/cjs/RE2.d.ts +27 -0
  24. package/dist/cjs/RE2.js +221 -0
  25. package/dist/cjs/RE2Flags.d.ts +16 -0
  26. package/dist/cjs/RE2Flags.js +58 -0
  27. package/dist/cjs/Regexp.d.ts +43 -0
  28. package/dist/cjs/Regexp.js +98 -0
  29. package/dist/cjs/Simplify.d.ts +3 -0
  30. package/dist/cjs/Simplify.js +230 -0
  31. package/dist/cjs/Unicode.d.ts +17 -0
  32. package/dist/cjs/Unicode.js +165 -0
  33. package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
  34. package/dist/cjs/UnicodeRangeTable.js +31 -0
  35. package/dist/cjs/UnicodeTables.d.ts +29 -0
  36. package/dist/cjs/UnicodeTables.js +571 -0
  37. package/dist/cjs/Utils.d.ts +22 -0
  38. package/dist/cjs/Utils.js +119 -0
  39. package/dist/cjs/__fixtures__/find.d.ts +9 -0
  40. package/dist/cjs/__fixtures__/find.js +115 -0
  41. package/dist/cjs/chars.d.ts +2 -0
  42. package/dist/cjs/chars.js +19 -0
  43. package/dist/cjs/exceptions.d.ts +55 -0
  44. package/dist/cjs/exceptions.js +94 -0
  45. package/dist/cjs/index.d.ts +102 -0
  46. package/dist/cjs/index.js +173 -0
  47. package/dist/cjs/package.json +1 -0
  48. package/dist/cjs/testParser.d.ts +3 -0
  49. package/dist/cjs/testParser.js +143 -0
  50. package/dist/esm/CharClass.d.ts +30 -0
  51. package/dist/esm/CharClass.js +281 -0
  52. package/dist/esm/CharGroup.d.ts +8 -0
  53. package/dist/esm/CharGroup.js +78 -0
  54. package/dist/esm/Codepoint.d.ts +3 -0
  55. package/dist/esm/Codepoint.js +59 -0
  56. package/dist/esm/Compiler.d.ts +40 -0
  57. package/dist/esm/Compiler.js +259 -0
  58. package/dist/esm/DFA.d.ts +36 -0
  59. package/dist/esm/DFA.js +347 -0
  60. package/dist/esm/Inst.d.ts +26 -0
  61. package/dist/esm/Inst.js +83 -0
  62. package/dist/esm/MachineInput.d.ts +17 -0
  63. package/dist/esm/MachineInput.js +68 -0
  64. package/dist/esm/Parser.d.ts +111 -0
  65. package/dist/esm/Parser.js +1535 -0
  66. package/dist/esm/Prefilter.d.ts +19 -0
  67. package/dist/esm/Prefilter.js +159 -0
  68. package/dist/esm/Prog.d.ts +39 -0
  69. package/dist/esm/Prog.js +150 -0
  70. package/dist/esm/RE2.d.ts +27 -0
  71. package/dist/esm/RE2.js +218 -0
  72. package/dist/esm/RE2Flags.d.ts +16 -0
  73. package/dist/esm/RE2Flags.js +41 -0
  74. package/dist/esm/Regexp.d.ts +43 -0
  75. package/dist/esm/Regexp.js +94 -0
  76. package/dist/esm/Simplify.d.ts +3 -0
  77. package/dist/esm/Simplify.js +228 -0
  78. package/dist/esm/Unicode.d.ts +17 -0
  79. package/dist/esm/Unicode.js +150 -0
  80. package/dist/esm/UnicodeRangeTable.d.ts +12 -0
  81. package/dist/esm/UnicodeRangeTable.js +28 -0
  82. package/dist/esm/UnicodeTables.d.ts +29 -0
  83. package/dist/esm/UnicodeTables.js +568 -0
  84. package/dist/esm/Utils.d.ts +22 -0
  85. package/dist/esm/Utils.js +103 -0
  86. package/dist/esm/__fixtures__/find.d.ts +9 -0
  87. package/dist/esm/__fixtures__/find.js +112 -0
  88. package/dist/esm/chars.d.ts +2 -0
  89. package/dist/esm/chars.js +14 -0
  90. package/dist/esm/exceptions.d.ts +55 -0
  91. package/dist/esm/exceptions.js +86 -0
  92. package/dist/esm/index.d.ts +102 -0
  93. package/dist/esm/index.js +163 -0
  94. package/dist/esm/testParser.d.ts +3 -0
  95. package/dist/esm/testParser.js +138 -0
  96. package/package.json +49 -0
@@ -0,0 +1,1535 @@
1
+ import { CLASS_NL, DOT_NL, FOLD_CASE, LITERAL, NON_GREEDY, ONE_LINE, UNICODE_GROUPS, WAS_DOLLAR, } from "./RE2Flags.js";
2
+ import { MAX_ASCII, MAX_BMP, MAX_FOLD, MAX_RUNE, MIN_FOLD, simpleFold, } from "./Unicode.js";
3
+ import { UnicodeTables } from "./UnicodeTables.js";
4
+ import { UnicodeRangeTable } from "./UnicodeRangeTable.js";
5
+ import { getPerlGroups, getPosixGroups } from "./CharGroup.js";
6
+ import { unhex, isalnum, charCount, stringToRunes, runeToString, } from "./Utils.js";
7
+ import { codePointAtOrThrow } from "./chars.js";
8
+ import { CharClass } from "./CharClass.js";
9
+ import { RE2JSSyntaxException } from "./exceptions.js";
10
+ import { Regexp } from "./Regexp.js";
11
/**
 * StringIterator is a stream of runes over a pattern string with an opaque,
 * rewindable cursor.
 *
 * Callers must not interpret cursor values beyond the fact that an ASCII
 * character always occupies exactly one position. In particular:
 *  - skip(n): only use it to step over ASCII characters.
 *  - skipString(s): only use it for a string already known to sit at the
 *    cursor, e.g. after a successful lookingAt(s).
 *  - pop(): the only safe way to step over a possibly non-ASCII rune.
 */
class StringIterator {
  // The text being iterated.
  str;
  // Current cursor; starts at the beginning of the text.
  position = 0;

  constructor(str) {
    this.str = str;
  }

  /** Returns the cursor position. Treat the value as opaque. */
  pos() {
    return this.position;
  }

  /** Moves the cursor back to a value previously obtained from pos(). */
  rewindTo(pos) {
    this.position = pos;
  }

  /** Reports whether any input remains after the cursor. */
  more() {
    return this.position < this.str.length;
  }

  /** Returns the rune at the cursor without consuming it. Requires more(). */
  peek() {
    return codePointAtOrThrow(this.str, this.position);
  }

  /**
   * Moves the cursor forward by |n| positions. Intended only for ASCII
   * metacharacters, for which every cursor unit has width one.
   */
  skip(n) {
    this.position = this.position + n;
  }

  /** Moves the cursor forward by the number of cursor positions in |s|. */
  skipString(s) {
    this.position = this.position + s.length;
  }

  /** Returns the rune at the cursor and steps past it. Requires more(). */
  pop() {
    const rune = codePointAtOrThrow(this.str, this.position);
    this.position += charCount(rune);
    return rune;
  }

  /** Reports whether the remaining input begins with |s|. */
  lookingAt(s) {
    return this.str.startsWith(s, this.position);
  }

  /** Returns everything from the cursor to the end of the pattern. */
  rest() {
    return this.str.substring(this.position);
  }

  /**
   * Returns the text between |beforePos| and the cursor.
   * |beforePos| must be a value previously returned by pos().
   */
  from(beforePos) {
    return this.str.substring(beforePos, this.position);
  }

  /** The string form of the iterator is its unread remainder. */
  toString() {
    return this.rest();
  }
}
82
+ /**
83
+ * A parser of regular expression patterns.
84
+ *
85
+ * The only public entry point is {@link Parser.parse}.
86
+ */
87
+ class Parser {
88
+ // Parse errors
89
+ static ERR_INVALID_CHAR_RANGE = "invalid character class range";
90
+ static ERR_INVALID_ESCAPE = "invalid escape sequence";
91
+ static ERR_INVALID_NAMED_CAPTURE = "invalid named capture";
92
+ static ERR_INVALID_PERL_OP = "invalid or unsupported Perl syntax";
93
+ static ERR_INVALID_REPEAT_OP = "invalid nested repetition operator";
94
+ static ERR_INVALID_REPEAT_SIZE = "invalid repeat count";
95
+ static ERR_MISSING_BRACKET = "missing closing ]";
96
+ static ERR_MISSING_PAREN = "missing closing )";
97
+ static ERR_MISSING_REPEAT_ARGUMENT = "missing argument to repetition operator";
98
+ static ERR_TRAILING_BACKSLASH = "trailing backslash at end of expression";
99
+ static ERR_DUPLICATE_NAMED_CAPTURE = "duplicate capture group name";
100
+ static ERR_UNEXPECTED_PAREN = "unexpected )";
101
+ static ERR_NESTING_DEPTH = "expression nests too deeply";
102
+ static ERR_LARGE = "expression too large";
103
+ static ERR_BAD_EXPRESSION = "expression not valid";
104
+ // maxHeight is the maximum height of a regexp parse tree.
105
+ // It is somewhat arbitrarily chosen, but the idea is to be large enough
106
+ // that no one will actually hit in real use but at the same time small enough
107
+ // that recursion on the Regexp tree will not hit the 1GB Go stack limit.
108
+ // The maximum amount of stack for a single recursive frame is probably
109
+ // closer to 1kB, so this could potentially be raised, but it seems unlikely
110
+ // that people have regexps nested even this deeply.
111
+ // We ran a test on Google's C++ code base and turned up only
112
+ // a single use case with depth > 100; it had depth 128.
113
+ // Using depth 1000 should be plenty of margin.
114
+ // As an optimization, we don't even bother calculating heights
115
+ // until we've allocated at least maxHeight Regexp structures.
116
+ static MAX_HEIGHT = 1000;
117
+ // maxSize is the maximum size of a compiled regexp in Insts.
118
+ // It too is somewhat arbitrarily chosen, but the idea is to be large enough
119
+ // to allow significant regexps while at the same time small enough that
120
+ // the compiled form will not take up too much memory.
121
+ // 128 MB is enough for 3.3 million Inst structures, which roughly
122
+ // corresponds to a 3.3 MB regexp.
123
+ static MAX_SIZE = 3355443; // 128 << 20 / (5 * 8) (instSize = byte, 2 uint32, slice is 5 64-bit words)
124
+ // maxRunes is the maximum number of runes allowed in a regexp tree
125
+ // counting the runes in all the nodes.
126
+ // Ignoring character classes p.numRunes is always less than the length of the regexp.
127
+ // Character classes can make it much larger: each \pL adds 1292 runes.
128
+ // 128 MB is enough for 32M runes, which is over 26k \pL instances.
129
+ // Note that repetitions do not make copies of the rune slices,
130
+ // so \pL{1000} is only one rune slice, not 1000.
131
+ // We could keep a cache of character classes we've seen,
132
+ // so that all the \pL we see use the same rune list,
133
+ // but that doesn't remove the problem entirely:
134
+ // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
135
+ // And because the Rune slice is exposed directly in the Regexp,
136
+ // there is not an opportunity to change the representation to allow
137
+ // partial sharing between different character classes.
138
+ // So the limit is the best we can do.
139
+ static MAX_RUNES = 33554432; // 128 << 20 / 4 (runeSize, int32 is 4 bytes)
140
+ // RangeTables wrap a flat Uint32Array holding (start, end, stride)
141
+ // triples.
142
+ static ANY_TABLE = new UnicodeRangeTable(new Uint32Array([0, MAX_RUNE, 1]));
143
+ // Ascii tables
144
+ static ASCII_TABLE = new UnicodeRangeTable(new Uint32Array([0, 0x7f, 1]));
145
+ static ASCII_FOLD_TABLE = new UnicodeRangeTable(new Uint32Array([
146
+ 0,
147
+ 0x7f,
148
+ 1,
149
+ 0x017f,
150
+ 0x017f,
151
+ 1, // Old English long s (ſ), folds to S/s.
152
+ 0x212a,
153
+ 0x212a,
154
+ 1, // Kelvin K, folds to K/k.
155
+ ]));
156
+ // unicodeTable() returns the Unicode RangeTable identified by name
157
+ // and the table of additional fold-equivalent code points.
158
+ // Returns null if |name| does not identify a Unicode character range.
159
+ static unicodeTable(name) {
160
+ if (name === "Any") {
161
+ return { tab: Parser.ANY_TABLE, fold: Parser.ANY_TABLE, sign: +1 };
162
+ }
163
+ if (name === "Ascii") {
164
+ return {
165
+ tab: Parser.ASCII_TABLE,
166
+ fold: Parser.ASCII_FOLD_TABLE,
167
+ sign: +1,
168
+ };
169
+ }
170
+ if (name === "Assigned") {
171
+ // Assigned is the mathematical inversion of Cn (Unassigned)
172
+ return {
173
+ tab: UnicodeTables.CATEGORIES.get("Cn"),
174
+ fold: UnicodeTables.CATEGORIES.get("Cn"),
175
+ sign: -1,
176
+ };
177
+ }
178
+ if (name === "Lc") {
179
+ return {
180
+ tab: UnicodeTables.CATEGORIES.get("LC"),
181
+ fold: UnicodeTables.FOLD_CATEGORIES.get("LC"),
182
+ sign: +1,
183
+ };
184
+ }
185
+ if (UnicodeTables.CATEGORIES.has(name)) {
186
+ return {
187
+ tab: UnicodeTables.CATEGORIES.get(name),
188
+ fold: UnicodeTables.FOLD_CATEGORIES.get(name),
189
+ sign: +1,
190
+ };
191
+ }
192
+ if (UnicodeTables.SCRIPTS.has(name)) {
193
+ return {
194
+ tab: UnicodeTables.SCRIPTS.get(name),
195
+ fold: UnicodeTables.FOLD_SCRIPT.get(name),
196
+ sign: +1,
197
+ };
198
+ }
199
+ return null;
200
+ }
201
+ // minFoldRune returns the minimum rune fold-equivalent to r.
202
+ static minFoldRune(r) {
203
+ if (r < MIN_FOLD || r > MAX_FOLD) {
204
+ return r;
205
+ }
206
+ let min = r;
207
+ const r0 = r;
208
+ for (r = simpleFold(r); r !== r0; r = simpleFold(r)) {
209
+ if (min > r) {
210
+ min = r;
211
+ }
212
+ }
213
+ return min;
214
+ }
215
+ static literalRegexp(s, flags) {
216
+ const re = new Regexp(Regexp.Op.LITERAL);
217
+ re.flags = flags;
218
+ re.runes = stringToRunes(s);
219
+ return re;
220
+ }
221
+ /**
222
+ * Parse regular expression pattern {@code pattern} with mode flags {@code flags}.
223
+ * @param {string} pattern
224
+ * @param {number} flags
225
+ */
226
+ static parse(pattern, flags) {
227
+ return new Parser(pattern, flags).parseInternal();
228
+ }
229
+ // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
230
+ // If |t| is not of that form, it returns -1.
231
+ // If |t| has the right form but the values are negative or too big,
232
+ // it returns -2.
233
+ // On success, returns a nonnegative number encoding min/max in the
234
+ // high/low signed halfwords of the result. (Note: min >= 0; max may
235
+ // be -1.)
236
+ //
237
+ // On success, advances |t| beyond the repeat; otherwise |t.pos()| is
238
+ // undefined.
239
+ static parseRepeat(t) {
240
+ const start = t.pos();
241
+ if (!t.more() || !t.lookingAt("{")) {
242
+ return -1;
243
+ }
244
+ t.skip(1);
245
+ const min = Parser.parseInt(t);
246
+ if (min === -1) {
247
+ return -1;
248
+ }
249
+ if (!t.more()) {
250
+ return -1;
251
+ }
252
+ let max;
253
+ if (!t.lookingAt(",")) {
254
+ max = min;
255
+ }
256
+ else {
257
+ t.skip(1);
258
+ if (!t.more()) {
259
+ return -1;
260
+ }
261
+ if (t.lookingAt("}")) {
262
+ max = -1;
263
+ }
264
+ else {
265
+ max = Parser.parseInt(t);
266
+ if (max === -1) {
267
+ return -1;
268
+ }
269
+ }
270
+ }
271
+ if (!t.more() || !t.lookingAt("}")) {
272
+ return -1;
273
+ }
274
+ t.skip(1);
275
+ if (min < 0 ||
276
+ min > 1000 ||
277
+ max === -2 ||
278
+ max > 1000 ||
279
+ (max >= 0 && min > max)) {
280
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(start));
281
+ }
282
+ return (min << 16) | (max & MAX_BMP);
283
+ }
284
+ // isValidCaptureName reports whether name
285
+ // is a valid capture name: [A-Za-z0-9_]+.
286
+ // PCRE limits names to 32 bytes.
287
+ // Python rejects names starting with digits.
288
+ // We don't enforce either of those.
289
+ static isValidCaptureName(name) {
290
+ if (name.length === 0) {
291
+ return false;
292
+ }
293
+ for (let i = 0; i < name.length; i++) {
294
+ const c = codePointAtOrThrow(name, i);
295
+ if (c !== 0x5f && !isalnum(c)) {
296
+ return false;
297
+ }
298
+ }
299
+ return true;
300
+ }
301
+ // parseInt parses a nonnegative decimal integer.
302
+ // -1 => bad format. -2 => format ok, but integer overflow.
303
+ static parseInt(t) {
304
+ const start = t.pos();
305
+ while (t.more() && t.peek() >= 0x30 && t.peek() <= 0x39) {
306
+ t.skip(1);
307
+ }
308
+ const n = t.from(start);
309
+ if (n.length === 0 || (n.length > 1 && n.codePointAt(0) === 0x30)) {
310
+ return -1;
311
+ }
312
+ if (n.length > 8) {
313
+ return -2;
314
+ }
315
+ return parseInt(n, 10);
316
+ }
317
+ // can this be represented as a character class?
318
+ // single-rune literal string, char class, ., and .|\n.
319
+ static isCharClass(re) {
320
+ return ((re.op === Regexp.Op.LITERAL && re.runes.length === 1) ||
321
+ re.op === Regexp.Op.CHAR_CLASS ||
322
+ re.op === Regexp.Op.ANY_CHAR_NOT_NL ||
323
+ re.op === Regexp.Op.ANY_CHAR);
324
+ }
325
+ // does re match r?
326
+ static matchRune(re, r) {
327
+ switch (re.op) {
328
+ case Regexp.Op.LITERAL:
329
+ return re.runes.length === 1 && re.runes[0] === r;
330
+ case Regexp.Op.CHAR_CLASS:
331
+ for (let i = 0; i < re.runes.length; i += 2) {
332
+ if (re.runes[i] <= r && r <= re.runes[i + 1]) {
333
+ return true;
334
+ }
335
+ }
336
+ return false;
337
+ case Regexp.Op.ANY_CHAR_NOT_NL:
338
+ return r !== 0x0a;
339
+ case Regexp.Op.ANY_CHAR:
340
+ return true;
341
+ }
342
+ return false;
343
+ }
344
+ // mergeCharClass makes dst = dst|src.
345
+ // The caller must ensure that dst.Op >= src.Op,
346
+ // to reduce the amount of copying.
347
+ static mergeCharClass(dst, src) {
348
+ switch (dst.op) {
349
+ case Regexp.Op.ANY_CHAR:
350
+ break;
351
+ case Regexp.Op.ANY_CHAR_NOT_NL:
352
+ if (Parser.matchRune(src, 0x0a)) {
353
+ dst.op = Regexp.Op.ANY_CHAR;
354
+ }
355
+ break;
356
+ case Regexp.Op.CHAR_CLASS:
357
+ if (src.op === Regexp.Op.LITERAL) {
358
+ dst.runes = new CharClass(dst.runes)
359
+ .appendLiteral(src.runes[0], src.flags)
360
+ .toArray();
361
+ }
362
+ else {
363
+ dst.runes = new CharClass(dst.runes).appendClass(src.runes).toArray();
364
+ }
365
+ break;
366
+ case Regexp.Op.LITERAL:
367
+ if (src.runes[0] === dst.runes[0] && src.flags === dst.flags) {
368
+ break;
369
+ }
370
+ dst.op = Regexp.Op.CHAR_CLASS;
371
+ dst.runes = new CharClass()
372
+ .appendLiteral(dst.runes[0], dst.flags)
373
+ .appendLiteral(src.runes[0], src.flags)
374
+ .toArray();
375
+ break;
376
+ }
377
+ }
378
+ // parseEscape parses an escape sequence at the beginning of s
379
+ // and returns the rune.
380
+ // Pre: t at '\\'. Post: after escape.
381
+ static parseEscape(t) {
382
+ const startPos = t.pos();
383
+ t.skip(1); // '\\'
384
+ if (!t.more()) {
385
+ throw new RE2JSSyntaxException(Parser.ERR_TRAILING_BACKSLASH);
386
+ }
387
+ let c = t.pop();
388
+ switch (c) {
389
+ case 0x31:
390
+ case 0x32:
391
+ case 0x33:
392
+ case 0x34:
393
+ case 0x35:
394
+ case 0x36:
395
+ case 0x30:
396
+ case 0x37: {
397
+ if (c !== 0x30 && (!t.more() || t.peek() < 0x30 || t.peek() > 0x37)) {
398
+ break;
399
+ }
400
+ let r = c - 0x30;
401
+ for (let i = 1; i < 3; i++) {
402
+ if (!t.more() || t.peek() < 0x30 || t.peek() > 0x37) {
403
+ break;
404
+ }
405
+ r = r * 8 + t.peek() - 0x30;
406
+ t.skip(1);
407
+ }
408
+ return r;
409
+ }
410
+ case 0x78: {
411
+ if (!t.more()) {
412
+ break;
413
+ }
414
+ c = t.pop();
415
+ if (c === 0x7b) {
416
+ let nhex = 0;
417
+ let r = 0;
418
+ while (true) {
419
+ if (!t.more()) {
420
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
421
+ }
422
+ c = t.pop();
423
+ if (c === 0x7d) {
424
+ break;
425
+ }
426
+ const v = unhex(c);
427
+ if (v < 0) {
428
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
429
+ }
430
+ r = r * 16 + v;
431
+ if (r > MAX_RUNE) {
432
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
433
+ }
434
+ nhex++;
435
+ }
436
+ if (nhex === 0) {
437
+ break;
438
+ }
439
+ return r;
440
+ }
441
+ const x = unhex(c);
442
+ if (!t.more()) {
443
+ break;
444
+ }
445
+ c = t.pop();
446
+ const y = unhex(c);
447
+ if (x < 0 || y < 0) {
448
+ break;
449
+ }
450
+ return x * 16 + y;
451
+ }
452
+ case 0x61:
453
+ return 0x07;
454
+ case 0x66:
455
+ return 0x0c;
456
+ case 0x6e:
457
+ return 0x0a;
458
+ case 0x72:
459
+ return 0x0d;
460
+ case 0x74:
461
+ return 0x09;
462
+ case 0x76:
463
+ return 0x0b;
464
+ default:
465
+ if (c <= MAX_ASCII && !isalnum(c)) {
466
+ return c;
467
+ }
468
+ break;
469
+ }
470
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
471
+ }
472
+ // parseClassChar parses a character class character and returns it.
473
+ // wholeClassPos is the position of the start of the entire class "[...".
474
+ // Pre: t at class char; Post: t after it.
475
+ static parseClassChar(t, wholeClassPos) {
476
+ if (!t.more()) {
477
+ throw new RE2JSSyntaxException(Parser.ERR_MISSING_BRACKET, t.from(wholeClassPos));
478
+ }
479
+ if (t.lookingAt("\\")) {
480
+ return Parser.parseEscape(t);
481
+ }
482
+ return t.pop();
483
+ }
484
+ static concatRunes(x, y) {
485
+ const r = new Array(x.length + y.length);
486
+ for (let i = 0; i < x.length; i++)
487
+ r[i] = x[i];
488
+ for (let i = 0; i < y.length; i++)
489
+ r[x.length + i] = y[i];
490
+ return r;
491
+ }
492
+ wholeRegexp;
493
+ flags;
494
+ numCap;
495
+ namedGroups;
496
+ stack;
497
+ free;
498
+ numRegexp;
499
+ numRunes;
500
+ repeats;
501
+ height;
502
+ size;
503
+ constructor(wholeRegexp, flags = 0) {
504
+ this.wholeRegexp = wholeRegexp;
505
+ // Flags control the behavior of the parser and record information about
506
+ // regexp context.
507
+ this.flags = flags;
508
+ // number of capturing groups seen
509
+ this.numCap = 0;
510
+ this.namedGroups = new Map();
511
+ // Stack of parsed expressions.
512
+ this.stack = [];
513
+ this.free = null;
514
+ // checks
515
+ this.numRegexp = 0; // number of regexps allocated
516
+ this.numRunes = 0; // number of runes in char classes
517
+ this.repeats = 0; // product of all repetitions seen
518
+ this.height = null; // regexp height, for height limit check
519
+ this.size = null; // regexp compiled size, for size limit check
520
+ }
521
+ // Allocate a Regexp, from the free list if possible.
522
+ newRegexp(op) {
523
+ let re = this.free;
524
+ if (re !== null && re.subs !== null && re.subs.length > 0) {
525
+ this.free = re.subs[0];
526
+ re.reinit();
527
+ re.op = op;
528
+ }
529
+ else {
530
+ re = new Regexp(op);
531
+ this.numRegexp += 1;
532
+ }
533
+ return re;
534
+ }
535
+ reuse(re) {
536
+ if (this.height !== null) {
537
+ this.height.delete(re);
538
+ }
539
+ if (re.subs !== null && re.subs.length > 0) {
540
+ // subs[0] doubles as the free-list next pointer while re is on the list.
541
+ re.subs[0] = this.free;
542
+ }
543
+ this.free = re;
544
+ }
545
+ checkLimits(re) {
546
+ if (this.numRunes > Parser.MAX_RUNES) {
547
+ throw new RE2JSSyntaxException(Parser.ERR_LARGE);
548
+ }
549
+ this.checkSize(re);
550
+ this.checkHeight(re);
551
+ }
552
+ checkSize(re) {
553
+ if (this.size === null) {
554
+ // We haven't started tracking size yet.
555
+ // Do a relatively cheap check to see if we need to start.
556
+ // Maintain the product of all the repeats we've seen
557
+ // and don't track if the total number of regexp nodes
558
+ // we've seen times the repeat product is in budget.
559
+ if (this.repeats === 0) {
560
+ this.repeats = 1;
561
+ }
562
+ if (re.op === Regexp.Op.REPEAT) {
563
+ let n = re.max;
564
+ if (n === -1) {
565
+ n = re.min;
566
+ }
567
+ if (n <= 0) {
568
+ n = 1;
569
+ }
570
+ if (n > Parser.MAX_SIZE / this.repeats) {
571
+ this.repeats = Parser.MAX_SIZE;
572
+ }
573
+ else {
574
+ this.repeats *= n;
575
+ }
576
+ }
577
+ if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
578
+ return;
579
+ }
580
+ // We need to start tracking size.
581
+ // Make the map and belatedly populate it
582
+ // with info about everything we've constructed so far.
583
+ this.size = new Map();
584
+ for (let reEx of this.stack) {
585
+ this.checkSize(reEx);
586
+ }
587
+ }
588
+ if (this.calcSize(re, true) > Parser.MAX_SIZE) {
589
+ throw new RE2JSSyntaxException(Parser.ERR_LARGE);
590
+ }
591
+ }
592
+ calcSize(re, force = false) {
593
+ if (!force && this.size !== null) {
594
+ const cached = this.size.get(re);
595
+ if (cached !== undefined) {
596
+ return cached;
597
+ }
598
+ }
599
+ let size = 0;
600
+ switch (re.op) {
601
+ case Regexp.Op.LITERAL: {
602
+ size = re.runes.length;
603
+ break;
604
+ }
605
+ case Regexp.Op.CAPTURE:
606
+ case Regexp.Op.STAR: {
607
+ // star can be 1+ or 2+; assume 2 pessimistically
608
+ size = 2 + this.calcSize(re.subs[0]);
609
+ break;
610
+ }
611
+ case Regexp.Op.PLUS:
612
+ case Regexp.Op.QUEST: {
613
+ size = 1 + this.calcSize(re.subs[0]);
614
+ break;
615
+ }
616
+ case Regexp.Op.CONCAT: {
617
+ for (let sub of re.subs) {
618
+ size = size + this.calcSize(sub);
619
+ }
620
+ break;
621
+ }
622
+ case Regexp.Op.ALTERNATE: {
623
+ for (let sub of re.subs) {
624
+ size = size + this.calcSize(sub);
625
+ }
626
+ if (re.subs.length > 1) {
627
+ size = size + re.subs.length - 1;
628
+ }
629
+ break;
630
+ }
631
+ case Regexp.Op.REPEAT: {
632
+ let sub = this.calcSize(re.subs[0]);
633
+ if (re.max === -1) {
634
+ if (re.min === 0) {
635
+ size = 2 + sub; // x*
636
+ }
637
+ else {
638
+ size = 1 + re.min * sub; // xxx+
639
+ }
640
+ break;
641
+ }
642
+ // x{2,5} = xx(x(x(x)?)?)?
643
+ size = re.max * sub + (re.max - re.min);
644
+ break;
645
+ }
646
+ }
647
+ size = Math.max(1, size);
648
+ this.size?.set(re, size);
649
+ return size;
650
+ }
651
+ checkHeight(re) {
652
+ if (this.numRegexp < Parser.MAX_HEIGHT) {
653
+ return;
654
+ }
655
+ if (this.height === null) {
656
+ this.height = new Map();
657
+ for (let reEx of this.stack) {
658
+ this.checkHeight(reEx);
659
+ }
660
+ }
661
+ if (this.calcHeight(re, true) > Parser.MAX_HEIGHT) {
662
+ throw new RE2JSSyntaxException(Parser.ERR_NESTING_DEPTH);
663
+ }
664
+ }
665
+ calcHeight(re, force = false) {
666
+ if (!force && this.height !== null) {
667
+ const cached = this.height.get(re);
668
+ if (cached !== undefined) {
669
+ return cached;
670
+ }
671
+ }
672
+ let h = 1;
673
+ for (let sub of re.subs) {
674
+ const hsub = this.calcHeight(sub);
675
+ if (h < 1 + hsub) {
676
+ h = 1 + hsub;
677
+ }
678
+ }
679
+ this.height?.set(re, h);
680
+ return h;
681
+ }
682
+ // Parse stack manipulation.
683
+ pop() {
684
+ return this.stack.pop();
685
+ }
686
+ popToPseudo() {
687
+ const n = this.stack.length;
688
+ let i = n;
689
+ while (i > 0 && !Regexp.isPseudoOp(this.stack[i - 1].op)) {
690
+ i--;
691
+ }
692
+ const r = this.stack.slice(i, n);
693
+ this.stack = this.stack.slice(0, i);
694
+ return r;
695
+ }
696
+ // push pushes the regexp re onto the parse stack and returns the regexp.
697
+ // Returns null for a CHAR_CLASS that can be merged with the top-of-stack.
698
+ push(re) {
699
+ this.numRunes += re.runes.length;
700
+ if (re.op === Regexp.Op.CHAR_CLASS &&
701
+ re.runes.length === 2 &&
702
+ re.runes[0] === re.runes[1]) {
703
+ if (this.maybeConcat(re.runes[0], this.flags & ~FOLD_CASE)) {
704
+ return null;
705
+ }
706
+ re.op = Regexp.Op.LITERAL;
707
+ re.runes = [re.runes[0]];
708
+ re.flags = this.flags & ~FOLD_CASE;
709
+ }
710
+ else if ((re.op === Regexp.Op.CHAR_CLASS &&
711
+ re.runes.length === 4 &&
712
+ re.runes[0] === re.runes[1] &&
713
+ re.runes[2] === re.runes[3] &&
714
+ simpleFold(re.runes[0]) === re.runes[2] &&
715
+ simpleFold(re.runes[2]) === re.runes[0]) ||
716
+ (re.op === Regexp.Op.CHAR_CLASS &&
717
+ re.runes.length === 2 &&
718
+ re.runes[0] + 1 === re.runes[1] &&
719
+ simpleFold(re.runes[0]) === re.runes[1] &&
720
+ simpleFold(re.runes[1]) === re.runes[0])) {
721
+ // Case-insensitive rune like [Aa] or [Δδ].
722
+ if (this.maybeConcat(re.runes[0], this.flags | FOLD_CASE)) {
723
+ return null;
724
+ }
725
+ // Rewrite as (case-insensitive) literal.
726
+ re.op = Regexp.Op.LITERAL;
727
+ re.runes = [re.runes[0]];
728
+ re.flags = this.flags | FOLD_CASE;
729
+ }
730
+ else {
731
+ // Incremental concatenation.
732
+ this.maybeConcat(-1, 0);
733
+ }
734
+ this.stack.push(re);
735
+ this.checkLimits(re);
736
+ return re;
737
+ }
738
+ // maybeConcat implements incremental concatenation
739
+ // of literal runes into string nodes. The parser calls this
740
+ // before each push, so only the top fragment of the stack
741
+ // might need processing. Since this is called before a push,
742
+ // the topmost literal is no longer subject to operators like *
743
+ // (Otherwise ab* would turn into (ab)*.)
744
+ // If (r >= 0 and there's a node left over, maybeConcat uses it
745
+ // to push r with the given flags.
746
+ // maybeConcat reports whether r was pushed.
747
+ maybeConcat(r, flags) {
748
+ const n = this.stack.length;
749
+ if (n < 2) {
750
+ return false;
751
+ }
752
+ const re1 = this.stack[n - 1];
753
+ const re2 = this.stack[n - 2];
754
+ if (re1.op !== Regexp.Op.LITERAL ||
755
+ re2.op !== Regexp.Op.LITERAL ||
756
+ (re1.flags & FOLD_CASE) !== (re2.flags & FOLD_CASE)) {
757
+ return false;
758
+ }
759
+ // Push re1 into re2.
760
+ re2.runes = Parser.concatRunes(re2.runes, re1.runes);
761
+ // Reuse re1 if possible.
762
+ if (r >= 0) {
763
+ re1.runes = [r];
764
+ re1.flags = flags;
765
+ return true;
766
+ }
767
+ this.pop();
768
+ this.reuse(re1);
769
+ return false; // did not push r
770
+ }
771
+ // newLiteral returns a new LITERAL Regexp with the given flags
772
+ newLiteral(r, flags) {
773
+ const re = this.newRegexp(Regexp.Op.LITERAL);
774
+ re.flags = flags;
775
+ if ((flags & FOLD_CASE) !== 0) {
776
+ r = Parser.minFoldRune(r);
777
+ }
778
+ re.runes = [r];
779
+ return re;
780
+ }
781
+ // literal pushes a literal regexp for the rune r on the stack
782
+ // and returns that regexp.
783
+ literal(r) {
784
+ this.push(this.newLiteral(r, this.flags));
785
+ }
786
+ // op pushes a regexp with the given op onto the stack
787
+ // and returns that regexp.
788
+ op(op) {
789
+ const re = this.newRegexp(op);
790
+ re.flags = this.flags;
791
+ return this.push(re);
792
+ }
793
+ // repeat replaces the top stack element with itself repeated according to
794
+ // op, min, max. beforePos is the start position of the repetition operator.
795
+ // Pre: t is positioned after the initial repetition operator.
796
+ // Post: t advances past an optional perl-mode '?', or stays put.
797
+ // Or, it fails with RE2JSSyntaxException.
798
+ repeat(op, min, max, beforePos, t, lastRepeatPos) {
799
+ let flags = this.flags;
800
+ if (t.more() && t.lookingAt("?")) {
801
+ t.skip(1);
802
+ flags ^= NON_GREEDY;
803
+ }
804
+ if (lastRepeatPos !== -1) {
805
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_OP, t.from(lastRepeatPos));
806
+ }
807
+ const n = this.stack.length;
808
+ if (n === 0) {
809
+ throw new RE2JSSyntaxException(Parser.ERR_MISSING_REPEAT_ARGUMENT, t.from(beforePos));
810
+ }
811
+ const sub = this.stack[n - 1];
812
+ if (Regexp.isPseudoOp(sub.op)) {
813
+ throw new RE2JSSyntaxException(Parser.ERR_MISSING_REPEAT_ARGUMENT, t.from(beforePos));
814
+ }
815
+ const re = this.newRegexp(op);
816
+ re.min = min;
817
+ re.max = max;
818
+ re.flags = flags;
819
+ re.subs = [sub];
820
+ this.stack[n - 1] = re;
821
+ this.checkLimits(re);
822
+ if (op === Regexp.Op.REPEAT &&
823
+ (min >= 2 || max >= 2) &&
824
+ !this.repeatIsValid(re, 1000)) {
825
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(beforePos));
826
+ }
827
+ }
828
+ // repeatIsValid reports whether the repetition re is valid.
829
+ // Valid means that the combination of the top-level repetition
830
+ // and any inner repetitions does not exceed n copies of the
831
+ // innermost thing.
832
+ // This function rewalks the regexp tree and is called for every repetition,
833
+ // so we have to worry about inducing quadratic behavior in the parser.
834
+ // We avoid this by only calling repeatIsValid when min or max >= 2.
835
+ // In that case the depth of any >= 2 nesting can only get to 9 without
836
+ // triggering a parse error, so each subtree can only be rewalked 9 times.
837
+ repeatIsValid(re, n) {
838
+ if (re.op === Regexp.Op.REPEAT) {
839
+ let m = re.max;
840
+ if (m === 0) {
841
+ return true;
842
+ }
843
+ if (m < 0) {
844
+ m = re.min;
845
+ }
846
+ if (m > n) {
847
+ return false;
848
+ }
849
+ if (m > 0) {
850
+ n = Math.trunc(n / m);
851
+ }
852
+ }
853
+ for (let sub of re.subs) {
854
+ if (!this.repeatIsValid(sub, n)) {
855
+ return false;
856
+ }
857
+ }
858
+ return true;
859
+ }
860
+ // concat replaces the top of the stack (above the topmost '|' or '(') with
861
+ // its concatenation.
862
+ concat() {
863
+ this.maybeConcat(-1, 0);
864
+ const subs = this.popToPseudo();
865
+ if (subs.length === 0) {
866
+ return this.push(this.newRegexp(Regexp.Op.EMPTY_MATCH));
867
+ }
868
+ return this.push(this.collapse(subs, Regexp.Op.CONCAT));
869
+ }
870
+ // alternate replaces the top of the stack (above the topmost '(') with its
871
+ // alternation.
872
+ alternate() {
873
+ // Scan down to find pseudo-operator (.
874
+ // There are no | above (.
875
+ const subs = this.popToPseudo();
876
+ // Make sure top class is clean.
877
+ // All the others already are (see swapVerticalBar).
878
+ if (subs.length > 0) {
879
+ this.cleanAlt(subs[subs.length - 1]);
880
+ }
881
+ // Empty alternate is special case
882
+ // (shouldn't happen but easy to handle).
883
+ if (subs.length === 0) {
884
+ return this.push(this.newRegexp(Regexp.Op.NO_MATCH));
885
+ }
886
+ return this.push(this.collapse(subs, Regexp.Op.ALTERNATE));
887
+ }
888
+ // cleanAlt cleans re for eventual inclusion in an alternation.
889
+ cleanAlt(re) {
890
+ if (re.op === Regexp.Op.CHAR_CLASS) {
891
+ re.runes = new CharClass(re.runes).cleanClass().toArray();
892
+ if (re.runes.length === 2 &&
893
+ re.runes[0] === 0 &&
894
+ re.runes[1] === MAX_RUNE) {
895
+ re.runes = [];
896
+ re.op = Regexp.Op.ANY_CHAR;
897
+ }
898
+ else if (re.runes.length === 4 &&
899
+ re.runes[0] === 0 &&
900
+ re.runes[1] === 0x0a - 1 &&
901
+ re.runes[2] === 0x0a + 1 &&
902
+ re.runes[3] === MAX_RUNE) {
903
+ re.runes = [];
904
+ re.op = Regexp.Op.ANY_CHAR_NOT_NL;
905
+ }
906
+ }
907
+ }
908
+ // collapse returns the result of applying op to subs[start:end].
909
+ // If subs contains op nodes, they all get hoisted up
910
+ // so that there is never a concat of a concat or an
911
+ // alternate of an alternate.
912
+ collapse(subs, op) {
913
+ if (subs.length === 1) {
914
+ return subs[0];
915
+ }
916
+ // Concatenate subs iff op is same.
917
+ // Compute length in first pass.
918
+ let len = 0;
919
+ for (let sub of subs) {
920
+ len += sub.op === op ? sub.subs.length : 1;
921
+ }
922
+ let newsubs = new Array(len).fill(null);
923
+ let i = 0;
924
+ for (let sub of subs) {
925
+ if (sub.op === op) {
926
+ for (let j = 0; j < sub.subs.length; j++) {
927
+ newsubs[i++] = sub.subs[j];
928
+ }
929
+ this.reuse(sub);
930
+ }
931
+ else {
932
+ newsubs[i++] = sub;
933
+ }
934
+ }
935
+ let re = this.newRegexp(op);
936
+ re.subs = newsubs;
937
+ if (op === Regexp.Op.ALTERNATE) {
938
+ if (re.subs.length === 1) {
939
+ const old = re;
940
+ re = re.subs[0];
941
+ this.reuse(old);
942
+ }
943
+ }
944
+ return re;
945
+ }
946
+ parseInternal() {
947
+ if ((this.flags & LITERAL) !== 0) {
948
+ // Trivial parser for literal string.
949
+ return Parser.literalRegexp(this.wholeRegexp, this.flags);
950
+ }
951
+ // Otherwise, must do real work.
952
+ let lastRepeatPos = -1;
953
+ let min = -1;
954
+ let max = -1;
955
+ const t = new StringIterator(this.wholeRegexp);
956
+ while (t.more()) {
957
+ {
958
+ let repeatPos = -1;
959
+ switch (t.peek()) {
960
+ case 0x28:
961
+ if (t.lookingAt("(?")) {
962
+ // Flag changes and non-capturing groups.
963
+ this.parsePerlFlags(t);
964
+ break;
965
+ }
966
+ const lparen = this.op(Regexp.Op.LEFT_PAREN);
967
+ if (lparen === null) {
968
+ throw new Error("op(LEFT_PAREN) unexpectedly returned null");
969
+ }
970
+ lparen.cap = ++this.numCap;
971
+ t.skip(1); // '('
972
+ break;
973
+ case 0x7c:
974
+ this.parseVerticalBar(); // '|'
975
+ t.skip(1); // '|'
976
+ break;
977
+ case 0x29:
978
+ this.parseRightParen();
979
+ t.skip(1); // ')'
980
+ break;
981
+ case 0x5e:
982
+ if ((this.flags & ONE_LINE) !== 0) {
983
+ this.op(Regexp.Op.BEGIN_TEXT);
984
+ }
985
+ else {
986
+ this.op(Regexp.Op.BEGIN_LINE);
987
+ }
988
+ t.skip(1); // '^'
989
+ break;
990
+ case 0x24:
991
+ if ((this.flags & ONE_LINE) !== 0) {
992
+ const endText = this.op(Regexp.Op.END_TEXT);
993
+ if (endText === null) {
994
+ throw new Error("op(END_TEXT) unexpectedly returned null");
995
+ }
996
+ endText.flags |= WAS_DOLLAR;
997
+ }
998
+ else {
999
+ this.op(Regexp.Op.END_LINE);
1000
+ }
1001
+ t.skip(1); // '$'
1002
+ break;
1003
+ case 0x2e:
1004
+ if ((this.flags & DOT_NL) !== 0) {
1005
+ this.op(Regexp.Op.ANY_CHAR);
1006
+ }
1007
+ else {
1008
+ this.op(Regexp.Op.ANY_CHAR_NOT_NL);
1009
+ }
1010
+ t.skip(1); // '.'
1011
+ break;
1012
+ case 0x5b:
1013
+ this.parseClass(t);
1014
+ break;
1015
+ case 0x2a:
1016
+ case 0x2b:
1017
+ case 0x3f: {
1018
+ repeatPos = t.pos();
1019
+ let op = null;
1020
+ switch (t.pop()) {
1021
+ case 0x2a:
1022
+ op = Regexp.Op.STAR;
1023
+ break;
1024
+ case 0x2b:
1025
+ op = Regexp.Op.PLUS;
1026
+ break;
1027
+ case 0x3f:
1028
+ op = Regexp.Op.QUEST;
1029
+ break;
1030
+ }
1031
+ if (op === null) {
1032
+ throw new Error("repeat op unexpectedly null");
1033
+ }
1034
+ this.repeat(op, min, max, repeatPos, t, lastRepeatPos);
1035
+ // (min and max are now dead.)
1036
+ break;
1037
+ }
1038
+ case 0x7b: {
1039
+ repeatPos = t.pos();
1040
+ const minMax = Parser.parseRepeat(t);
1041
+ if (minMax < 0) {
1042
+ // If the repeat cannot be parsed, { is a literal.
1043
+ t.rewindTo(repeatPos);
1044
+ this.literal(t.pop()); // '{'
1045
+ break;
1046
+ }
1047
+ min = minMax >> 16;
1048
+ max = ((minMax & MAX_BMP) << 16) >> 16;
1049
+ this.repeat(Regexp.Op.REPEAT, min, max, repeatPos, t, lastRepeatPos);
1050
+ break;
1051
+ }
1052
+ case 0x5c: {
1053
+ const savedPos = t.pos();
1054
+ t.skip(1); // '\\'
1055
+ let handled = false;
1056
+ if (t.more()) {
1057
+ const c = t.pop();
1058
+ switch (c) {
1059
+ case 0x41:
1060
+ this.op(Regexp.Op.BEGIN_TEXT);
1061
+ handled = true;
1062
+ break;
1063
+ case 0x62:
1064
+ this.op(Regexp.Op.WORD_BOUNDARY);
1065
+ handled = true;
1066
+ break;
1067
+ case 0x42:
1068
+ this.op(Regexp.Op.NO_WORD_BOUNDARY);
1069
+ handled = true;
1070
+ break;
1071
+ case 0x43:
1072
+ // any byte; not supported
1073
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, "\\C");
1074
+ case 0x51: {
1075
+ // \Q ... \E: the ... is always literals
1076
+ let lit = t.rest();
1077
+ const i = lit.indexOf("\\E");
1078
+ if (i >= 0) {
1079
+ lit = lit.substring(0, i);
1080
+ t.skipString(lit);
1081
+ t.skipString("\\E");
1082
+ }
1083
+ else {
1084
+ t.skipString(lit);
1085
+ }
1086
+ let j = 0;
1087
+ while (j < lit.length) {
1088
+ const codepoint = codePointAtOrThrow(lit, j);
1089
+ this.literal(codepoint);
1090
+ j += charCount(codepoint);
1091
+ }
1092
+ handled = true;
1093
+ break;
1094
+ }
1095
+ case 0x7a:
1096
+ this.op(Regexp.Op.END_TEXT);
1097
+ handled = true;
1098
+ break;
1099
+ default:
1100
+ t.rewindTo(savedPos);
1101
+ break;
1102
+ }
1103
+ }
1104
+ else {
1105
+ t.rewindTo(savedPos);
1106
+ }
1107
+ if (handled)
1108
+ break;
1109
+ const re = this.newRegexp(Regexp.Op.CHAR_CLASS);
1110
+ re.flags = this.flags;
1111
+ // Look for Unicode character group like \p{Han}
1112
+ if (t.lookingAt("\\p") || t.lookingAt("\\P")) {
1113
+ const cc = new CharClass();
1114
+ if (this.parseUnicodeClass(t, cc)) {
1115
+ re.runes = cc.toArray();
1116
+ this.push(re);
1117
+ break;
1118
+ }
1119
+ }
1120
+ // Perl character class escape.
1121
+ const cc = new CharClass();
1122
+ if (this.parsePerlClassEscape(t, cc)) {
1123
+ re.runes = cc.toArray();
1124
+ this.push(re);
1125
+ break;
1126
+ }
1127
+ t.rewindTo(savedPos);
1128
+ this.reuse(re);
1129
+ // Ordinary single-character escape.
1130
+ this.literal(Parser.parseEscape(t));
1131
+ break;
1132
+ }
1133
+ default:
1134
+ this.literal(t.pop());
1135
+ break;
1136
+ }
1137
+ lastRepeatPos = repeatPos;
1138
+ }
1139
+ }
1140
+ this.concat();
1141
+ if (this.swapVerticalBar()) {
1142
+ this.pop(); // pop vertical bar
1143
+ }
1144
+ this.alternate();
1145
+ const n = this.stack.length;
1146
+ if (n !== 1) {
1147
+ throw new RE2JSSyntaxException(Parser.ERR_MISSING_PAREN, this.wholeRegexp);
1148
+ }
1149
+ this.stack[0].namedGroups = this.namedGroups;
1150
+ return this.stack[0];
1151
+ }
1152
+ // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
1153
+ // like (?i) or (?: or (?i:.
1154
+ // Pre: t at "(?". Post: t after ")".
1155
+ // Sets numCap.
1156
+ parsePerlFlags(t) {
1157
+ const startPos = t.pos();
1158
+ // Check for named captures, first introduced in Python's regexp library.
1159
+ // As usual, there are three slightly different syntaxes:
1160
+ //
1161
+ // (?P<name>expr) the original, introduced by Python
1162
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
1163
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
1164
+ //
1165
+ // Perl 5.10 gave in and implemented the Python version too,
1166
+ // but they claim that the last two are the preferred forms.
1167
+ // PCRE and languages based on it (specifically, PHP and Ruby)
1168
+ // support all three as well. EcmaScript 4 uses only the Python form.
1169
+ //
1170
+ // In both the open source world (via Code Search) and the
1171
+ // Google source tree, (?P<name>expr) and (?<name>expr) are the
1172
+ // dominant forms of named captures and both are supported.
1173
+ if (t.lookingAt("(?P<") || t.lookingAt("(?<")) {
1174
+ // Pull out name.
1175
+ const s = t.rest();
1176
+ const begin = s.charAt(2) === "P" ? 4 : 3;
1177
+ const end = s.indexOf(">");
1178
+ if (end < 0) {
1179
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_NAMED_CAPTURE, s);
1180
+ }
1181
+ const name = s.substring(begin, end); // "name"
1182
+ t.skipString(name);
1183
+ t.skip(begin + 1); // "(?P<>" or "(?<>"
1184
+ if (!Parser.isValidCaptureName(name)) {
1185
+ // "(?P<name>"
1186
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_NAMED_CAPTURE, s.substring(0, end + 1)); // "(?P<name>" or "(?<name>"
1187
+ }
1188
+ // Like ordinary capture, but named.
1189
+ const re = this.op(Regexp.Op.LEFT_PAREN);
1190
+ if (re === null) {
1191
+ throw new Error("op(LEFT_PAREN) unexpectedly returned null");
1192
+ }
1193
+ re.cap = ++this.numCap;
1194
+ if (this.namedGroups.get(name)) {
1195
+ throw new RE2JSSyntaxException(Parser.ERR_DUPLICATE_NAMED_CAPTURE, name);
1196
+ }
1197
+ this.namedGroups.set(name, this.numCap);
1198
+ re.name = name;
1199
+ return;
1200
+ }
1201
+ // Non-capturing group. Might also twiddle Perl flags.
1202
+ t.skip(2); // "(?"
1203
+ let flags = this.flags;
1204
+ let sign = +1;
1205
+ let sawFlag = false;
1206
+ loop: while (t.more()) {
1207
+ {
1208
+ const c = t.pop();
1209
+ switch (c) {
1210
+ case 0x69:
1211
+ flags |= FOLD_CASE;
1212
+ sawFlag = true;
1213
+ break;
1214
+ case 0x6d:
1215
+ flags &= ~ONE_LINE;
1216
+ sawFlag = true;
1217
+ break;
1218
+ case 0x73:
1219
+ flags |= DOT_NL;
1220
+ sawFlag = true;
1221
+ break;
1222
+ case 0x55:
1223
+ flags |= NON_GREEDY;
1224
+ sawFlag = true;
1225
+ break;
1226
+ // Switch to negation.
1227
+ case 0x2d:
1228
+ if (sign < 0) {
1229
+ break loop;
1230
+ }
1231
+ sign = -1;
1232
+ // Invert flags so that | above turn into &~ and vice versa.
1233
+ // We'll invert flags again before using it below.
1234
+ flags = ~flags;
1235
+ sawFlag = false;
1236
+ break;
1237
+ // End of flags, starting group or not.
1238
+ case 0x3a:
1239
+ case 0x29:
1240
+ if (sign < 0) {
1241
+ if (!sawFlag) {
1242
+ break loop;
1243
+ }
1244
+ flags = ~flags;
1245
+ }
1246
+ if (c === 0x3a) {
1247
+ // Open new group
1248
+ this.op(Regexp.Op.LEFT_PAREN);
1249
+ }
1250
+ this.flags = flags;
1251
+ return;
1252
+ default:
1253
+ // Flags.
1254
+ break loop;
1255
+ }
1256
+ }
1257
+ }
1258
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_PERL_OP, t.from(startPos));
1259
+ }
1260
+ // parseVerticalBar handles a | in the input.
1261
+ parseVerticalBar() {
1262
+ this.concat();
1263
+ // The concatenation we just parsed is on top of the stack.
1264
+ // If it sits above an opVerticalBar, swap it below
1265
+ // (things below an opVerticalBar become an alternation).
1266
+ // Otherwise, push a new vertical bar.
1267
+ if (!this.swapVerticalBar()) {
1268
+ this.op(Regexp.Op.VERTICAL_BAR);
1269
+ }
1270
+ }
1271
+ // If the top of the stack is an element followed by an opVerticalBar
1272
+ // swapVerticalBar swaps the two and returns true.
1273
+ // Otherwise it returns false.
1274
+ swapVerticalBar() {
1275
+ const n = this.stack.length;
1276
+ // If above and below vertical bar are literal or char class,
1277
+ // can merge into a single char class.
1278
+ if (n >= 3 &&
1279
+ this.stack[n - 2].op === Regexp.Op.VERTICAL_BAR &&
1280
+ Parser.isCharClass(this.stack[n - 1]) &&
1281
+ Parser.isCharClass(this.stack[n - 3])) {
1282
+ let re1 = this.stack[n - 1];
1283
+ let re3 = this.stack[n - 3];
1284
+ // Make re3 the more complex of the two.
1285
+ if (re1.op > re3.op) {
1286
+ const tmp = re3;
1287
+ re3 = re1;
1288
+ re1 = tmp;
1289
+ this.stack[n - 3] = re3;
1290
+ }
1291
+ Parser.mergeCharClass(re3, re1);
1292
+ this.reuse(re1);
1293
+ this.pop();
1294
+ return true;
1295
+ }
1296
+ if (n >= 2) {
1297
+ const re1 = this.stack[n - 1];
1298
+ const re2 = this.stack[n - 2];
1299
+ if (re2.op === Regexp.Op.VERTICAL_BAR) {
1300
+ if (n >= 3) {
1301
+ // Now out of reach.
1302
+ // Clean opportunistically.
1303
+ this.cleanAlt(this.stack[n - 3]);
1304
+ }
1305
+ this.stack[n - 2] = re1;
1306
+ this.stack[n - 1] = re2;
1307
+ return true;
1308
+ }
1309
+ }
1310
+ return false;
1311
+ }
1312
+ // parseRightParen handles a ')' in the input.
1313
+ parseRightParen() {
1314
+ this.concat();
1315
+ if (this.swapVerticalBar()) {
1316
+ this.pop(); // pop vertical bar
1317
+ }
1318
+ this.alternate();
1319
+ const n = this.stack.length;
1320
+ if (n < 2) {
1321
+ throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
1322
+ }
1323
+ const re1 = this.pop();
1324
+ if (re1 === undefined) {
1325
+ throw new RE2JSSyntaxException(Parser.ERR_BAD_EXPRESSION, this.wholeRegexp);
1326
+ }
1327
+ const re2 = this.pop();
1328
+ if (re2 === undefined || re2.op !== Regexp.Op.LEFT_PAREN) {
1329
+ throw new RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
1330
+ }
1331
+ // Restore flags at time of paren.
1332
+ this.flags = re2.flags;
1333
+ if (re2.cap === 0) {
1334
+ // Just for grouping.
1335
+ this.push(re1);
1336
+ }
1337
+ else {
1338
+ re2.op = Regexp.Op.CAPTURE;
1339
+ re2.subs = [re1];
1340
+ this.push(re2);
1341
+ }
1342
+ }
1343
+ // parsePerlClassEscape parses a leading Perl character class escape like \d
1344
+ // from the beginning of |t|. If one is present, it appends the characters
1345
+ // to cc and returns true. The iterator is advanced past the escape
1346
+ // on success, undefined on failure, in which case false is returned.
1347
+ parsePerlClassEscape(t, cc) {
1348
+ const beforePos = t.pos();
1349
+ if (!t.more() || t.pop() !== 0x5c || !t.more()) {
1350
+ return false;
1351
+ }
1352
+ t.pop(); // e.g. advance past 'd' in "\\d"
1353
+ const p = t.from(beforePos);
1354
+ const g = getPerlGroups().get(p);
1355
+ if (g === undefined) {
1356
+ return false;
1357
+ }
1358
+ cc.appendGroup(g, (this.flags & FOLD_CASE) !== 0);
1359
+ return true;
1360
+ }
1361
+ // parseNamedClass parses a leading POSIX named character class like
1362
+ // [:alnum:] from the beginning of t. If one is present, it appends the
1363
+ // characters to cc, advances the iterator, and returns true.
1364
+ // Pre: t at "[:". Post: t after ":]".
1365
+ // On failure (no class of that name), throws RE2JSSyntaxException.
1366
+ // On misparse, returns false; t.pos() is undefined.
1367
+ parseNamedClass(t, cc) {
1368
+ // (Go precondition check deleted.)
1369
+ const cls = t.rest();
1370
+ const i = cls.indexOf(":]");
1371
+ if (i < 0) {
1372
+ return false;
1373
+ }
1374
+ const name = cls.substring(0, i + 2); // "[:alnum:]"
1375
+ t.skipString(name);
1376
+ const g = getPosixGroups().get(name);
1377
+ if (g === undefined) {
1378
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, name);
1379
+ }
1380
+ cc.appendGroup(g, (this.flags & FOLD_CASE) !== 0);
1381
+ return true;
1382
+ }
1383
+ // parseUnicodeClass() parses a leading Unicode character class like \p{Han}
1384
+ // from the beginning of t. If one is present, it appends the characters to
1385
+ // |cc|, advances |t| and returns true.
1386
+ //
1387
+ // Returns false if such a pattern is not present or UNICODE_GROUPS
1388
+ // flag is not enabled; |t.pos()| is not advanced in this case.
1389
+ // Indicates error by throwing RE2JSSyntaxException.
1390
+ parseUnicodeClass(t, cc) {
1391
+ const startPos = t.pos();
1392
+ if ((this.flags & UNICODE_GROUPS) === 0 ||
1393
+ (!t.lookingAt("\\p") && !t.lookingAt("\\P"))) {
1394
+ return false;
1395
+ }
1396
+ t.skip(1); // '\\'
1397
+ // Committed to parse or throw exception.
1398
+ let sign = +1;
1399
+ let c = t.pop(); // 'p' or 'P'
1400
+ if (c === 0x50) {
1401
+ sign = -1;
1402
+ }
1403
+ if (!t.more()) {
1404
+ t.rewindTo(startPos);
1405
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest());
1406
+ }
1407
+ c = t.pop();
1408
+ let name;
1409
+ if (c !== 0x7b) {
1410
+ // Single-letter name.
1411
+ name = runeToString(c);
1412
+ }
1413
+ else {
1414
+ // Name is in braces.
1415
+ const rest = t.rest();
1416
+ const end = rest.indexOf("}");
1417
+ if (end < 0) {
1418
+ t.rewindTo(startPos);
1419
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest());
1420
+ }
1421
+ name = rest.substring(0, end); // e.g. "Han"
1422
+ t.skipString(name);
1423
+ t.skip(1);
1424
+ // Don't use skip(end) because it assumes UTF-16 coding, and
1425
+ // StringIterator doesn't guarantee that.
1426
+ }
1427
+ // Group can have leading negation too.
1428
+ // \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1429
+ if (!(name.length === 0) && name.codePointAt(0) === 0x5e) {
1430
+ sign = 0 - sign; // -sign
1431
+ name = name.substring(1);
1432
+ }
1433
+ const pair = Parser.unicodeTable(name);
1434
+ if (pair === null) {
1435
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(startPos));
1436
+ }
1437
+ if (pair.sign < 0) {
1438
+ sign = 0 - sign;
1439
+ }
1440
+ const tab = pair.tab;
1441
+ const fold = pair.fold; // fold-equivalent table
1442
+ if (tab === null) {
1443
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(startPos));
1444
+ }
1445
+ // Variation of CharClass.appendGroup() for tables.
1446
+ if ((this.flags & FOLD_CASE) === 0 || fold === null) {
1447
+ cc.appendTableWithSign(tab, sign);
1448
+ }
1449
+ else {
1450
+ // Merge and clean tab and fold in a temporary buffer.
1451
+ // This is necessary for the negative case and just tidy
1452
+ // for the positive case.
1453
+ const tmp = new CharClass()
1454
+ .appendTable(tab)
1455
+ .appendTable(fold)
1456
+ .cleanClass()
1457
+ .toArray();
1458
+ cc.appendClassWithSign(tmp, sign);
1459
+ }
1460
+ return true;
1461
+ }
1462
+ // parseClass parses a character class and pushes it onto the parse stack.
1463
+ //
1464
+ // NOTES:
1465
+ // Pre: at '['; Post: after ']'.
1466
+ // Mutates stack. Advances iterator. May throw.
1467
+ parseClass(t) {
1468
+ const startPos = t.pos();
1469
+ t.skip(1); // '['
1470
+ const re = this.newRegexp(Regexp.Op.CHAR_CLASS);
1471
+ re.flags = this.flags;
1472
+ const cc = new CharClass();
1473
+ let sign = +1;
1474
+ if (t.more() && t.lookingAt("^")) {
1475
+ sign = -1;
1476
+ t.skip(1); // '^'
1477
+ // If character class does not match \n, add it here,
1478
+ // so that negation later will do the right thing.
1479
+ if ((this.flags & CLASS_NL) === 0) {
1480
+ cc.appendRange(0x0a, 0x0a);
1481
+ }
1482
+ }
1483
+ let first = true; // ']' and '-' are okay as first char in class
1484
+ while (!t.more() || t.peek() !== 0x5d || first) {
1485
+ first = false;
1486
+ const beforePos = t.pos();
1487
+ // Look for POSIX [:alnum:] etc.
1488
+ if (t.lookingAt("[:")) {
1489
+ if (this.parseNamedClass(t, cc)) {
1490
+ continue;
1491
+ }
1492
+ t.rewindTo(beforePos);
1493
+ }
1494
+ // Look for Unicode character group like \p{Han}.
1495
+ if (this.parseUnicodeClass(t, cc)) {
1496
+ continue;
1497
+ }
1498
+ // Look for Perl character class symbols (extension).
1499
+ if (this.parsePerlClassEscape(t, cc)) {
1500
+ continue;
1501
+ }
1502
+ t.rewindTo(beforePos);
1503
+ // Single character or simple range.
1504
+ const lo = Parser.parseClassChar(t, startPos);
1505
+ let hi = lo;
1506
+ if (t.more() && t.lookingAt("-")) {
1507
+ t.skip(1);
1508
+ if (t.more() && t.lookingAt("]")) {
1509
+ // [a-] means (a|-) so check for final ].
1510
+ t.skip(-1);
1511
+ }
1512
+ else {
1513
+ hi = Parser.parseClassChar(t, startPos);
1514
+ if (hi < lo) {
1515
+ throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(beforePos));
1516
+ }
1517
+ }
1518
+ }
1519
+ if ((this.flags & FOLD_CASE) === 0) {
1520
+ cc.appendRange(lo, hi);
1521
+ }
1522
+ else {
1523
+ cc.appendFoldedRange(lo, hi);
1524
+ }
1525
+ }
1526
+ t.skip(1); // ']'
1527
+ cc.cleanClass();
1528
+ if (sign < 0) {
1529
+ cc.negateClass();
1530
+ }
1531
+ re.runes = cc.toArray();
1532
+ this.push(re);
1533
+ }
1534
+ }
1535
+ export { Parser };