@bufbuild/re2 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +30 -0
  3. package/dist/cjs/CharClass.d.ts +30 -0
  4. package/dist/cjs/CharClass.js +284 -0
  5. package/dist/cjs/CharGroup.d.ts +8 -0
  6. package/dist/cjs/CharGroup.js +83 -0
  7. package/dist/cjs/Codepoint.d.ts +3 -0
  8. package/dist/cjs/Codepoint.js +62 -0
  9. package/dist/cjs/Compiler.d.ts +40 -0
  10. package/dist/cjs/Compiler.js +262 -0
  11. package/dist/cjs/DFA.d.ts +36 -0
  12. package/dist/cjs/DFA.js +350 -0
  13. package/dist/cjs/Inst.d.ts +26 -0
  14. package/dist/cjs/Inst.js +86 -0
  15. package/dist/cjs/MachineInput.d.ts +17 -0
  16. package/dist/cjs/MachineInput.js +72 -0
  17. package/dist/cjs/Parser.d.ts +111 -0
  18. package/dist/cjs/Parser.js +1538 -0
  19. package/dist/cjs/Prefilter.d.ts +19 -0
  20. package/dist/cjs/Prefilter.js +163 -0
  21. package/dist/cjs/Prog.d.ts +39 -0
  22. package/dist/cjs/Prog.js +154 -0
  23. package/dist/cjs/RE2.d.ts +27 -0
  24. package/dist/cjs/RE2.js +221 -0
  25. package/dist/cjs/RE2Flags.d.ts +16 -0
  26. package/dist/cjs/RE2Flags.js +58 -0
  27. package/dist/cjs/Regexp.d.ts +43 -0
  28. package/dist/cjs/Regexp.js +98 -0
  29. package/dist/cjs/Simplify.d.ts +3 -0
  30. package/dist/cjs/Simplify.js +230 -0
  31. package/dist/cjs/Unicode.d.ts +17 -0
  32. package/dist/cjs/Unicode.js +165 -0
  33. package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
  34. package/dist/cjs/UnicodeRangeTable.js +31 -0
  35. package/dist/cjs/UnicodeTables.d.ts +29 -0
  36. package/dist/cjs/UnicodeTables.js +571 -0
  37. package/dist/cjs/Utils.d.ts +22 -0
  38. package/dist/cjs/Utils.js +119 -0
  39. package/dist/cjs/__fixtures__/find.d.ts +9 -0
  40. package/dist/cjs/__fixtures__/find.js +115 -0
  41. package/dist/cjs/chars.d.ts +2 -0
  42. package/dist/cjs/chars.js +19 -0
  43. package/dist/cjs/exceptions.d.ts +55 -0
  44. package/dist/cjs/exceptions.js +94 -0
  45. package/dist/cjs/index.d.ts +102 -0
  46. package/dist/cjs/index.js +173 -0
  47. package/dist/cjs/package.json +1 -0
  48. package/dist/cjs/testParser.d.ts +3 -0
  49. package/dist/cjs/testParser.js +143 -0
  50. package/dist/esm/CharClass.d.ts +30 -0
  51. package/dist/esm/CharClass.js +281 -0
  52. package/dist/esm/CharGroup.d.ts +8 -0
  53. package/dist/esm/CharGroup.js +78 -0
  54. package/dist/esm/Codepoint.d.ts +3 -0
  55. package/dist/esm/Codepoint.js +59 -0
  56. package/dist/esm/Compiler.d.ts +40 -0
  57. package/dist/esm/Compiler.js +259 -0
  58. package/dist/esm/DFA.d.ts +36 -0
  59. package/dist/esm/DFA.js +347 -0
  60. package/dist/esm/Inst.d.ts +26 -0
  61. package/dist/esm/Inst.js +83 -0
  62. package/dist/esm/MachineInput.d.ts +17 -0
  63. package/dist/esm/MachineInput.js +68 -0
  64. package/dist/esm/Parser.d.ts +111 -0
  65. package/dist/esm/Parser.js +1535 -0
  66. package/dist/esm/Prefilter.d.ts +19 -0
  67. package/dist/esm/Prefilter.js +159 -0
  68. package/dist/esm/Prog.d.ts +39 -0
  69. package/dist/esm/Prog.js +150 -0
  70. package/dist/esm/RE2.d.ts +27 -0
  71. package/dist/esm/RE2.js +218 -0
  72. package/dist/esm/RE2Flags.d.ts +16 -0
  73. package/dist/esm/RE2Flags.js +41 -0
  74. package/dist/esm/Regexp.d.ts +43 -0
  75. package/dist/esm/Regexp.js +94 -0
  76. package/dist/esm/Simplify.d.ts +3 -0
  77. package/dist/esm/Simplify.js +228 -0
  78. package/dist/esm/Unicode.d.ts +17 -0
  79. package/dist/esm/Unicode.js +150 -0
  80. package/dist/esm/UnicodeRangeTable.d.ts +12 -0
  81. package/dist/esm/UnicodeRangeTable.js +28 -0
  82. package/dist/esm/UnicodeTables.d.ts +29 -0
  83. package/dist/esm/UnicodeTables.js +568 -0
  84. package/dist/esm/Utils.d.ts +22 -0
  85. package/dist/esm/Utils.js +103 -0
  86. package/dist/esm/__fixtures__/find.d.ts +9 -0
  87. package/dist/esm/__fixtures__/find.js +112 -0
  88. package/dist/esm/chars.d.ts +2 -0
  89. package/dist/esm/chars.js +14 -0
  90. package/dist/esm/exceptions.d.ts +55 -0
  91. package/dist/esm/exceptions.js +86 -0
  92. package/dist/esm/index.d.ts +102 -0
  93. package/dist/esm/index.js +163 -0
  94. package/dist/esm/testParser.d.ts +3 -0
  95. package/dist/esm/testParser.js +138 -0
  96. package/package.json +49 -0
@@ -0,0 +1,1538 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Parser = void 0;
4
+ const RE2Flags_js_1 = require("./RE2Flags.js");
5
+ const Unicode_js_1 = require("./Unicode.js");
6
+ const UnicodeTables_js_1 = require("./UnicodeTables.js");
7
+ const UnicodeRangeTable_js_1 = require("./UnicodeRangeTable.js");
8
+ const CharGroup_js_1 = require("./CharGroup.js");
9
+ const Utils_js_1 = require("./Utils.js");
10
+ const chars_js_1 = require("./chars.js");
11
+ const CharClass_js_1 = require("./CharClass.js");
12
+ const exceptions_js_1 = require("./exceptions.js");
13
+ const Regexp_js_1 = require("./Regexp.js");
14
+ // StringIterator: a stream of runes with an opaque cursor, permitting
15
+ // rewinding. The units of the cursor are not specified beyond the
16
+ // fact that ASCII characters are single width. (Cursor positions
17
+ // could be UTF-8 byte indices, UTF-16 code indices or rune indices.)
18
+ //
19
+ // In particular, be careful with:
20
+ // - skip: only use this to advance over ASCII characters
21
+ // since these always have a width of 1.
22
+ // - skipString: only use this to advance over strings which are
23
+ // known to be at the current position, e.g. due to prior call to
24
+ // lookingAt().
25
+ // Only use pop() to advance over possibly non-ASCII runes.
26
/**
 * Rewindable cursor over the runes of a pattern string.
 *
 * Cursor values returned by pos() are opaque: callers may only hand them
 * back to rewindTo() or from().
 */
class StringIterator {
    str;
    position = 0;

    /**
     * @param {string} str the text to iterate over
     */
    constructor(str) {
        this.str = str;
    }

    /** Current cursor position. Do not interpret the value. */
    pos() {
        return this.position;
    }

    /** Moves the cursor back to a value previously obtained from pos(). */
    rewindTo(pos) {
        this.position = pos;
    }

    /** True while at least one more rune can be read. */
    more() {
        return this.str.length > this.position;
    }

    /** Rune under the cursor, without advancing. Precondition: more(). */
    peek() {
        return (0, chars_js_1.codePointAtOrThrow)(this.str, this.position);
    }

    /**
     * Advances the cursor over |n| ASCII runes.
     *
     * In practice this is only used to step over ASCII regexp
     * metacharacters, so the cursor unit (UTF-8 byte, UTF-16 code unit or
     * rune) makes no numeric difference.
     */
    skip(n) {
        this.position += n;
    }

    /** Advances the cursor by the number of cursor positions in |s|. */
    skipString(s) {
        this.position += s.length;
    }

    /** Reads the rune under the cursor and steps past it. Precondition: more(). */
    pop() {
        const rune = (0, chars_js_1.codePointAtOrThrow)(this.str, this.position);
        const width = (0, Utils_js_1.charCount)(rune);
        this.position += width;
        return rune;
    }

    /** True when the text at the cursor begins with |s|. */
    lookingAt(s) {
        return this.str.startsWith(s, this.position);
    }

    /** Everything from the cursor to the end of the pattern. */
    rest() {
        return this.str.substring(this.position);
    }

    /**
     * Text between |beforePos| (a value previously returned by pos()) and
     * the cursor.
     */
    from(beforePos) {
        return this.str.substring(beforePos, this.position);
    }

    /** Same as rest(). */
    toString() {
        return this.rest();
    }
}
85
+ /**
86
+ * A parser of regular expression patterns.
87
+ *
88
+ * The only public entry point is {@link Parser.parse}.
89
+ */
90
+ class Parser {
91
+ // Parse errors
92
+ static ERR_INVALID_CHAR_RANGE = "invalid character class range";
93
+ static ERR_INVALID_ESCAPE = "invalid escape sequence";
94
+ static ERR_INVALID_NAMED_CAPTURE = "invalid named capture";
95
+ static ERR_INVALID_PERL_OP = "invalid or unsupported Perl syntax";
96
+ static ERR_INVALID_REPEAT_OP = "invalid nested repetition operator";
97
+ static ERR_INVALID_REPEAT_SIZE = "invalid repeat count";
98
+ static ERR_MISSING_BRACKET = "missing closing ]";
99
+ static ERR_MISSING_PAREN = "missing closing )";
100
+ static ERR_MISSING_REPEAT_ARGUMENT = "missing argument to repetition operator";
101
+ static ERR_TRAILING_BACKSLASH = "trailing backslash at end of expression";
102
+ static ERR_DUPLICATE_NAMED_CAPTURE = "duplicate capture group name";
103
+ static ERR_UNEXPECTED_PAREN = "unexpected )";
104
+ static ERR_NESTING_DEPTH = "expression nests too deeply";
105
+ static ERR_LARGE = "expression too large";
106
+ static ERR_BAD_EXPRESSION = "expression not valid";
107
+ // maxHeight is the maximum height of a regexp parse tree.
108
+ // It is somewhat arbitrarily chosen, but the idea is to be large enough
109
+ // that no one will actually hit in real use but at the same time small enough
110
+ // that recursion on the Regexp tree will not hit the 1GB Go stack limit.
111
+ // The maximum amount of stack for a single recursive frame is probably
112
+ // closer to 1kB, so this could potentially be raised, but it seems unlikely
113
+ // that people have regexps nested even this deeply.
114
+ // We ran a test on Google's C++ code base and turned up only
115
+ // a single use case with depth > 100; it had depth 128.
116
+ // Using depth 1000 should be plenty of margin.
117
+ // As an optimization, we don't even bother calculating heights
118
+ // until we've allocated at least maxHeight Regexp structures.
119
+ static MAX_HEIGHT = 1000;
120
+ // maxSize is the maximum size of a compiled regexp in Insts.
121
+ // It too is somewhat arbitrarily chosen, but the idea is to be large enough
122
+ // to allow significant regexps while at the same time small enough that
123
+ // the compiled form will not take up too much memory.
124
+ // 128 MB is enough for a 3.3 million Inst structures, which roughly
125
+ // corresponds to a 3.3 MB regexp.
126
+ static MAX_SIZE = 3355443; // 128 << 20 / (5 * 8) (instSize = byte, 2 uint32, slice is 5 64-bit words)
127
+ // maxRunes is the maximum number of runes allowed in a regexp tree
128
+ // counting the runes in all the nodes.
129
+ // Ignoring character classes p.numRunes is always less than the length of the regexp.
130
+ // Character classes can make it much larger: each \pL adds 1292 runes.
131
+ // 128 MB is enough for 32M runes, which is over 26k \pL instances.
132
+ // Note that repetitions do not make copies of the rune slices,
133
+ // so \pL{1000} is only one rune slice, not 1000.
134
+ // We could keep a cache of character classes we've seen,
135
+ // so that all the \pL we see use the same rune list,
136
+ // but that doesn't remove the problem entirely:
137
+ // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
138
+ // And because the Rune slice is exposed directly in the Regexp,
139
+ // there is not an opportunity to change the representation to allow
140
+ // partial sharing between different character classes.
141
+ // So the limit is the best we can do.
142
+ static MAX_RUNES = 33554432; // 128 << 20 / 4 (runeSize, int32 is 4 bytes)
143
+ // RangeTables are represented as int[][], a list of triples (start, end,
144
+ // stride).
145
+ static ANY_TABLE = new UnicodeRangeTable_js_1.UnicodeRangeTable(new Uint32Array([0, Unicode_js_1.MAX_RUNE, 1]));
146
+ // Ascii tables
147
+ static ASCII_TABLE = new UnicodeRangeTable_js_1.UnicodeRangeTable(new Uint32Array([0, 0x7f, 1]));
148
+ static ASCII_FOLD_TABLE = new UnicodeRangeTable_js_1.UnicodeRangeTable(new Uint32Array([
149
+ 0,
150
+ 0x7f,
151
+ 1,
152
+ 0x017f,
153
+ 0x017f,
154
+ 1, // Old English long s (ſ), folds to S/s.
155
+ 0x212a,
156
+ 0x212a,
157
+ 1, // Kelvin K, folds to K/k.
158
+ ]));
159
+ // unicodeTable() returns the Unicode RangeTable identified by name
160
+ // and the table of additional fold-equivalent code points.
161
+ // Returns null if |name| does not identify a Unicode character range.
162
+ static unicodeTable(name) {
163
+ if (name === "Any") {
164
+ return { tab: Parser.ANY_TABLE, fold: Parser.ANY_TABLE, sign: +1 };
165
+ }
166
+ if (name === "Ascii") {
167
+ return {
168
+ tab: Parser.ASCII_TABLE,
169
+ fold: Parser.ASCII_FOLD_TABLE,
170
+ sign: +1,
171
+ };
172
+ }
173
+ if (name === "Assigned") {
174
+ // Assigned is the mathematical inversion of Cn (Unassigned)
175
+ return {
176
+ tab: UnicodeTables_js_1.UnicodeTables.CATEGORIES.get("Cn"),
177
+ fold: UnicodeTables_js_1.UnicodeTables.CATEGORIES.get("Cn"),
178
+ sign: -1,
179
+ };
180
+ }
181
+ if (name === "Lc") {
182
+ return {
183
+ tab: UnicodeTables_js_1.UnicodeTables.CATEGORIES.get("LC"),
184
+ fold: UnicodeTables_js_1.UnicodeTables.FOLD_CATEGORIES.get("LC"),
185
+ sign: +1,
186
+ };
187
+ }
188
+ if (UnicodeTables_js_1.UnicodeTables.CATEGORIES.has(name)) {
189
+ return {
190
+ tab: UnicodeTables_js_1.UnicodeTables.CATEGORIES.get(name),
191
+ fold: UnicodeTables_js_1.UnicodeTables.FOLD_CATEGORIES.get(name),
192
+ sign: +1,
193
+ };
194
+ }
195
+ if (UnicodeTables_js_1.UnicodeTables.SCRIPTS.has(name)) {
196
+ return {
197
+ tab: UnicodeTables_js_1.UnicodeTables.SCRIPTS.get(name),
198
+ fold: UnicodeTables_js_1.UnicodeTables.FOLD_SCRIPT.get(name),
199
+ sign: +1,
200
+ };
201
+ }
202
+ return null;
203
+ }
204
+ // minFoldRune returns the minimum rune fold-equivalent to r.
205
+ static minFoldRune(r) {
206
+ if (r < Unicode_js_1.MIN_FOLD || r > Unicode_js_1.MAX_FOLD) {
207
+ return r;
208
+ }
209
+ let min = r;
210
+ const r0 = r;
211
+ for (r = (0, Unicode_js_1.simpleFold)(r); r !== r0; r = (0, Unicode_js_1.simpleFold)(r)) {
212
+ if (min > r) {
213
+ min = r;
214
+ }
215
+ }
216
+ return min;
217
+ }
218
+ static literalRegexp(s, flags) {
219
+ const re = new Regexp_js_1.Regexp(Regexp_js_1.Regexp.Op.LITERAL);
220
+ re.flags = flags;
221
+ re.runes = (0, Utils_js_1.stringToRunes)(s);
222
+ return re;
223
+ }
224
+ /**
225
+ * Parse regular expression pattern {@code pattern} with mode flags {@code flags}.
226
+ * @param {string} pattern
227
+ * @param {number} flags
228
+ */
229
+ static parse(pattern, flags) {
230
+ return new Parser(pattern, flags).parseInternal();
231
+ }
232
+ // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
233
+ // If |t| is not of that form, it returns -1.
234
+ // If |t| has the right form but the values are negative or too big,
235
+ // it throws RE2JSSyntaxException (ERR_INVALID_REPEAT_SIZE).
236
+ // On success, returns a nonnegative number encoding min/max in the
237
+ // high/low signed halfwords of the result. (Note: min >= 0; max may
238
+ // be -1.)
239
+ //
240
+ // On success, advances |t| beyond the repeat; otherwise |t.pos()| is
241
+ // undefined.
242
+ static parseRepeat(t) {
243
+ const start = t.pos();
244
+ if (!t.more() || !t.lookingAt("{")) {
245
+ return -1;
246
+ }
247
+ t.skip(1);
248
+ const min = Parser.parseInt(t);
249
+ if (min === -1) {
250
+ return -1;
251
+ }
252
+ if (!t.more()) {
253
+ return -1;
254
+ }
255
+ let max;
256
+ if (!t.lookingAt(",")) {
257
+ max = min;
258
+ }
259
+ else {
260
+ t.skip(1);
261
+ if (!t.more()) {
262
+ return -1;
263
+ }
264
+ if (t.lookingAt("}")) {
265
+ max = -1;
266
+ }
267
+ else {
268
+ max = Parser.parseInt(t);
269
+ if (max === -1) {
270
+ return -1;
271
+ }
272
+ }
273
+ }
274
+ if (!t.more() || !t.lookingAt("}")) {
275
+ return -1;
276
+ }
277
+ t.skip(1);
278
+ if (min < 0 ||
279
+ min > 1000 ||
280
+ max === -2 ||
281
+ max > 1000 ||
282
+ (max >= 0 && min > max)) {
283
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(start));
284
+ }
285
+ return (min << 16) | (max & Unicode_js_1.MAX_BMP);
286
+ }
287
+ // isValidCaptureName reports whether name
288
+ // is a valid capture name: [A-Za-z0-9_]+.
289
+ // PCRE limits names to 32 bytes.
290
+ // Python rejects names starting with digits.
291
+ // We don't enforce either of those.
292
+ static isValidCaptureName(name) {
293
+ if (name.length === 0) {
294
+ return false;
295
+ }
296
+ for (let i = 0; i < name.length; i++) {
297
+ const c = (0, chars_js_1.codePointAtOrThrow)(name, i);
298
+ if (c !== 0x5f && !(0, Utils_js_1.isalnum)(c)) {
299
+ return false;
300
+ }
301
+ }
302
+ return true;
303
+ }
304
+ // parseInt parses a nonnegative decimal integer.
305
+ // -1 => bad format. -2 => format ok, but integer overflow.
306
+ static parseInt(t) {
307
+ const start = t.pos();
308
+ while (t.more() && t.peek() >= 0x30 && t.peek() <= 0x39) {
309
+ t.skip(1);
310
+ }
311
+ const n = t.from(start);
312
+ if (n.length === 0 || (n.length > 1 && n.codePointAt(0) === 0x30)) {
313
+ return -1;
314
+ }
315
+ if (n.length > 8) {
316
+ return -2;
317
+ }
318
+ return parseInt(n, 10);
319
+ }
320
+ // can this be represented as a character class?
321
+ // single-rune literal string, char class, ., and .|\n.
322
+ static isCharClass(re) {
323
+ return ((re.op === Regexp_js_1.Regexp.Op.LITERAL && re.runes.length === 1) ||
324
+ re.op === Regexp_js_1.Regexp.Op.CHAR_CLASS ||
325
+ re.op === Regexp_js_1.Regexp.Op.ANY_CHAR_NOT_NL ||
326
+ re.op === Regexp_js_1.Regexp.Op.ANY_CHAR);
327
+ }
328
+ // does re match r?
329
+ static matchRune(re, r) {
330
+ switch (re.op) {
331
+ case Regexp_js_1.Regexp.Op.LITERAL:
332
+ return re.runes.length === 1 && re.runes[0] === r;
333
+ case Regexp_js_1.Regexp.Op.CHAR_CLASS:
334
+ for (let i = 0; i < re.runes.length; i += 2) {
335
+ if (re.runes[i] <= r && r <= re.runes[i + 1]) {
336
+ return true;
337
+ }
338
+ }
339
+ return false;
340
+ case Regexp_js_1.Regexp.Op.ANY_CHAR_NOT_NL:
341
+ return r !== 0x0a;
342
+ case Regexp_js_1.Regexp.Op.ANY_CHAR:
343
+ return true;
344
+ }
345
+ return false;
346
+ }
347
+ // mergeCharClass makes dst = dst|src.
348
+ // The caller must ensure that dst.Op >= src.Op,
349
+ // to reduce the amount of copying.
350
+ static mergeCharClass(dst, src) {
351
+ switch (dst.op) {
352
+ case Regexp_js_1.Regexp.Op.ANY_CHAR:
353
+ break;
354
+ case Regexp_js_1.Regexp.Op.ANY_CHAR_NOT_NL:
355
+ if (Parser.matchRune(src, 0x0a)) {
356
+ dst.op = Regexp_js_1.Regexp.Op.ANY_CHAR;
357
+ }
358
+ break;
359
+ case Regexp_js_1.Regexp.Op.CHAR_CLASS:
360
+ if (src.op === Regexp_js_1.Regexp.Op.LITERAL) {
361
+ dst.runes = new CharClass_js_1.CharClass(dst.runes)
362
+ .appendLiteral(src.runes[0], src.flags)
363
+ .toArray();
364
+ }
365
+ else {
366
+ dst.runes = new CharClass_js_1.CharClass(dst.runes).appendClass(src.runes).toArray();
367
+ }
368
+ break;
369
+ case Regexp_js_1.Regexp.Op.LITERAL:
370
+ if (src.runes[0] === dst.runes[0] && src.flags === dst.flags) {
371
+ break;
372
+ }
373
+ dst.op = Regexp_js_1.Regexp.Op.CHAR_CLASS;
374
+ dst.runes = new CharClass_js_1.CharClass()
375
+ .appendLiteral(dst.runes[0], dst.flags)
376
+ .appendLiteral(src.runes[0], src.flags)
377
+ .toArray();
378
+ break;
379
+ }
380
+ }
381
+ // parseEscape parses an escape sequence at the beginning of s
382
+ // and returns the rune.
383
+ // Pre: t at '\\'. Post: after escape.
384
+ static parseEscape(t) {
385
+ const startPos = t.pos();
386
+ t.skip(1); // '\\'
387
+ if (!t.more()) {
388
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_TRAILING_BACKSLASH);
389
+ }
390
+ let c = t.pop();
391
+ switch (c) {
392
+ case 0x31:
393
+ case 0x32:
394
+ case 0x33:
395
+ case 0x34:
396
+ case 0x35:
397
+ case 0x36:
398
+ case 0x30:
399
+ case 0x37: {
400
+ if (c !== 0x30 && (!t.more() || t.peek() < 0x30 || t.peek() > 0x37)) {
401
+ break;
402
+ }
403
+ let r = c - 0x30;
404
+ for (let i = 1; i < 3; i++) {
405
+ if (!t.more() || t.peek() < 0x30 || t.peek() > 0x37) {
406
+ break;
407
+ }
408
+ r = r * 8 + t.peek() - 0x30;
409
+ t.skip(1);
410
+ }
411
+ return r;
412
+ }
413
+ case 0x78: {
414
+ if (!t.more()) {
415
+ break;
416
+ }
417
+ c = t.pop();
418
+ if (c === 0x7b) {
419
+ let nhex = 0;
420
+ let r = 0;
421
+ while (true) {
422
+ if (!t.more()) {
423
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
424
+ }
425
+ c = t.pop();
426
+ if (c === 0x7d) {
427
+ break;
428
+ }
429
+ const v = (0, Utils_js_1.unhex)(c);
430
+ if (v < 0) {
431
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
432
+ }
433
+ r = r * 16 + v;
434
+ if (r > Unicode_js_1.MAX_RUNE) {
435
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
436
+ }
437
+ nhex++;
438
+ }
439
+ if (nhex === 0) {
440
+ break;
441
+ }
442
+ return r;
443
+ }
444
+ const x = (0, Utils_js_1.unhex)(c);
445
+ if (!t.more()) {
446
+ break;
447
+ }
448
+ c = t.pop();
449
+ const y = (0, Utils_js_1.unhex)(c);
450
+ if (x < 0 || y < 0) {
451
+ break;
452
+ }
453
+ return x * 16 + y;
454
+ }
455
+ case 0x61:
456
+ return 0x07;
457
+ case 0x66:
458
+ return 0x0c;
459
+ case 0x6e:
460
+ return 0x0a;
461
+ case 0x72:
462
+ return 0x0d;
463
+ case 0x74:
464
+ return 0x09;
465
+ case 0x76:
466
+ return 0x0b;
467
+ default:
468
+ if (c <= Unicode_js_1.MAX_ASCII && !(0, Utils_js_1.isalnum)(c)) {
469
+ return c;
470
+ }
471
+ break;
472
+ }
473
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
474
+ }
475
+ // parseClassChar parses a character class character and returns it.
476
+ // wholeClassPos is the position of the start of the entire class "[...".
477
+ // Pre: t at class char; Post: t after it.
478
+ static parseClassChar(t, wholeClassPos) {
479
+ if (!t.more()) {
480
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_MISSING_BRACKET, t.from(wholeClassPos));
481
+ }
482
+ if (t.lookingAt("\\")) {
483
+ return Parser.parseEscape(t);
484
+ }
485
+ return t.pop();
486
+ }
487
+ static concatRunes(x, y) {
488
+ const r = new Array(x.length + y.length);
489
+ for (let i = 0; i < x.length; i++)
490
+ r[i] = x[i];
491
+ for (let i = 0; i < y.length; i++)
492
+ r[x.length + i] = y[i];
493
+ return r;
494
+ }
495
+ wholeRegexp;
496
+ flags;
497
+ numCap;
498
+ namedGroups;
499
+ stack;
500
+ free;
501
+ numRegexp;
502
+ numRunes;
503
+ repeats;
504
+ height;
505
+ size;
506
+ constructor(wholeRegexp, flags = 0) {
507
+ this.wholeRegexp = wholeRegexp;
508
+ // Flags control the behavior of the parser and record information about
509
+ // regexp context.
510
+ this.flags = flags;
511
+ // number of capturing groups seen
512
+ this.numCap = 0;
513
+ this.namedGroups = new Map();
514
+ // Stack of parsed expressions.
515
+ this.stack = [];
516
+ this.free = null;
517
+ // checks
518
+ this.numRegexp = 0; // number of regexps allocated
519
+ this.numRunes = 0; // number of runes in char classes
520
+ this.repeats = 0; // product of all repetitions seen
521
+ this.height = null; // regexp height, for height limit check
522
+ this.size = null; // regexp compiled size, for size limit check
523
+ }
524
+ // Allocate a Regexp, from the free list if possible.
525
+ newRegexp(op) {
526
+ let re = this.free;
527
+ if (re !== null && re.subs !== null && re.subs.length > 0) {
528
+ this.free = re.subs[0];
529
+ re.reinit();
530
+ re.op = op;
531
+ }
532
+ else {
533
+ re = new Regexp_js_1.Regexp(op);
534
+ this.numRegexp += 1;
535
+ }
536
+ return re;
537
+ }
538
+ reuse(re) {
539
+ if (this.height !== null) {
540
+ this.height.delete(re);
541
+ }
542
+ if (re.subs !== null && re.subs.length > 0) {
543
+ // subs[0] doubles as the free-list next pointer while re is on the list.
544
+ re.subs[0] = this.free;
545
+ }
546
+ this.free = re;
547
+ }
548
+ checkLimits(re) {
549
+ if (this.numRunes > Parser.MAX_RUNES) {
550
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_LARGE);
551
+ }
552
+ this.checkSize(re);
553
+ this.checkHeight(re);
554
+ }
555
+ checkSize(re) {
556
+ if (this.size === null) {
557
+ // We haven't started tracking size yet.
558
+ // Do a relatively cheap check to see if we need to start.
559
+ // Maintain the product of all the repeats we've seen
560
+ // and don't track if the total number of regexp nodes
561
+ // we've seen times the repeat product is in budget.
562
+ if (this.repeats === 0) {
563
+ this.repeats = 1;
564
+ }
565
+ if (re.op === Regexp_js_1.Regexp.Op.REPEAT) {
566
+ let n = re.max;
567
+ if (n === -1) {
568
+ n = re.min;
569
+ }
570
+ if (n <= 0) {
571
+ n = 1;
572
+ }
573
+ if (n > Parser.MAX_SIZE / this.repeats) {
574
+ this.repeats = Parser.MAX_SIZE;
575
+ }
576
+ else {
577
+ this.repeats *= n;
578
+ }
579
+ }
580
+ if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
581
+ return;
582
+ }
583
+ // We need to start tracking size.
584
+ // Make the map and belatedly populate it
585
+ // with info about everything we've constructed so far.
586
+ this.size = new Map();
587
+ for (let reEx of this.stack) {
588
+ this.checkSize(reEx);
589
+ }
590
+ }
591
+ if (this.calcSize(re, true) > Parser.MAX_SIZE) {
592
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_LARGE);
593
+ }
594
+ }
595
+ calcSize(re, force = false) {
596
+ if (!force && this.size !== null) {
597
+ const cached = this.size.get(re);
598
+ if (cached !== undefined) {
599
+ return cached;
600
+ }
601
+ }
602
+ let size = 0;
603
+ switch (re.op) {
604
+ case Regexp_js_1.Regexp.Op.LITERAL: {
605
+ size = re.runes.length;
606
+ break;
607
+ }
608
+ case Regexp_js_1.Regexp.Op.CAPTURE:
609
+ case Regexp_js_1.Regexp.Op.STAR: {
610
+ // star can be 1+ or 2+; assume 2 pessimistically
611
+ size = 2 + this.calcSize(re.subs[0]);
612
+ break;
613
+ }
614
+ case Regexp_js_1.Regexp.Op.PLUS:
615
+ case Regexp_js_1.Regexp.Op.QUEST: {
616
+ size = 1 + this.calcSize(re.subs[0]);
617
+ break;
618
+ }
619
+ case Regexp_js_1.Regexp.Op.CONCAT: {
620
+ for (let sub of re.subs) {
621
+ size = size + this.calcSize(sub);
622
+ }
623
+ break;
624
+ }
625
+ case Regexp_js_1.Regexp.Op.ALTERNATE: {
626
+ for (let sub of re.subs) {
627
+ size = size + this.calcSize(sub);
628
+ }
629
+ if (re.subs.length > 1) {
630
+ size = size + re.subs.length - 1;
631
+ }
632
+ break;
633
+ }
634
+ case Regexp_js_1.Regexp.Op.REPEAT: {
635
+ let sub = this.calcSize(re.subs[0]);
636
+ if (re.max === -1) {
637
+ if (re.min === 0) {
638
+ size = 2 + sub; // x*
639
+ }
640
+ else {
641
+ size = 1 + re.min * sub; // xxx+
642
+ }
643
+ break;
644
+ }
645
+ // x{2,5} = xx(x(x(x)?)?)?
646
+ size = re.max * sub + (re.max - re.min);
647
+ break;
648
+ }
649
+ }
650
+ size = Math.max(1, size);
651
+ this.size?.set(re, size);
652
+ return size;
653
+ }
654
+ checkHeight(re) {
655
+ if (this.numRegexp < Parser.MAX_HEIGHT) {
656
+ return;
657
+ }
658
+ if (this.height === null) {
659
+ this.height = new Map();
660
+ for (let reEx of this.stack) {
661
+ this.checkHeight(reEx);
662
+ }
663
+ }
664
+ if (this.calcHeight(re, true) > Parser.MAX_HEIGHT) {
665
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_NESTING_DEPTH);
666
+ }
667
+ }
668
+ calcHeight(re, force = false) {
669
+ if (!force && this.height !== null) {
670
+ const cached = this.height.get(re);
671
+ if (cached !== undefined) {
672
+ return cached;
673
+ }
674
+ }
675
+ let h = 1;
676
+ for (let sub of re.subs) {
677
+ const hsub = this.calcHeight(sub);
678
+ if (h < 1 + hsub) {
679
+ h = 1 + hsub;
680
+ }
681
+ }
682
+ this.height?.set(re, h);
683
+ return h;
684
+ }
685
+ // Parse stack manipulation.
686
+ pop() {
687
+ return this.stack.pop();
688
+ }
689
+ popToPseudo() {
690
+ const n = this.stack.length;
691
+ let i = n;
692
+ while (i > 0 && !Regexp_js_1.Regexp.isPseudoOp(this.stack[i - 1].op)) {
693
+ i--;
694
+ }
695
+ const r = this.stack.slice(i, n);
696
+ this.stack = this.stack.slice(0, i);
697
+ return r;
698
+ }
699
+ // push pushes the regexp re onto the parse stack and returns the regexp.
700
+ // Returns null for a CHAR_CLASS that can be merged with the top-of-stack.
701
+ push(re) {
702
+ this.numRunes += re.runes.length;
703
+ if (re.op === Regexp_js_1.Regexp.Op.CHAR_CLASS &&
704
+ re.runes.length === 2 &&
705
+ re.runes[0] === re.runes[1]) {
706
+ if (this.maybeConcat(re.runes[0], this.flags & ~RE2Flags_js_1.FOLD_CASE)) {
707
+ return null;
708
+ }
709
+ re.op = Regexp_js_1.Regexp.Op.LITERAL;
710
+ re.runes = [re.runes[0]];
711
+ re.flags = this.flags & ~RE2Flags_js_1.FOLD_CASE;
712
+ }
713
+ else if ((re.op === Regexp_js_1.Regexp.Op.CHAR_CLASS &&
714
+ re.runes.length === 4 &&
715
+ re.runes[0] === re.runes[1] &&
716
+ re.runes[2] === re.runes[3] &&
717
+ (0, Unicode_js_1.simpleFold)(re.runes[0]) === re.runes[2] &&
718
+ (0, Unicode_js_1.simpleFold)(re.runes[2]) === re.runes[0]) ||
719
+ (re.op === Regexp_js_1.Regexp.Op.CHAR_CLASS &&
720
+ re.runes.length === 2 &&
721
+ re.runes[0] + 1 === re.runes[1] &&
722
+ (0, Unicode_js_1.simpleFold)(re.runes[0]) === re.runes[1] &&
723
+ (0, Unicode_js_1.simpleFold)(re.runes[1]) === re.runes[0])) {
724
+ // Case-insensitive rune like [Aa] or [Δδ].
725
+ if (this.maybeConcat(re.runes[0], this.flags | RE2Flags_js_1.FOLD_CASE)) {
726
+ return null;
727
+ }
728
+ // Rewrite as (case-insensitive) literal.
729
+ re.op = Regexp_js_1.Regexp.Op.LITERAL;
730
+ re.runes = [re.runes[0]];
731
+ re.flags = this.flags | RE2Flags_js_1.FOLD_CASE;
732
+ }
733
+ else {
734
+ // Incremental concatenation.
735
+ this.maybeConcat(-1, 0);
736
+ }
737
+ this.stack.push(re);
738
+ this.checkLimits(re);
739
+ return re;
740
+ }
741
+ // maybeConcat implements incremental concatenation
742
+ // of literal runes into string nodes. The parser calls this
743
+ // before each push, so only the top fragment of the stack
744
+ // might need processing. Since this is called before a push,
745
+ // the topmost literal is no longer subject to operators like *
746
+ // (Otherwise ab* would turn into (ab)*.)
747
+ // If r >= 0 and there's a node left over, maybeConcat uses it
748
+ // to push r with the given flags.
749
+ // maybeConcat reports whether r was pushed.
750
+ maybeConcat(r, flags) {
751
+ const n = this.stack.length;
752
+ if (n < 2) {
753
+ return false;
754
+ }
755
+ const re1 = this.stack[n - 1];
756
+ const re2 = this.stack[n - 2];
757
+ if (re1.op !== Regexp_js_1.Regexp.Op.LITERAL ||
758
+ re2.op !== Regexp_js_1.Regexp.Op.LITERAL ||
759
+ (re1.flags & RE2Flags_js_1.FOLD_CASE) !== (re2.flags & RE2Flags_js_1.FOLD_CASE)) {
760
+ return false;
761
+ }
762
+ // Push re1 into re2.
763
+ re2.runes = Parser.concatRunes(re2.runes, re1.runes);
764
+ // Reuse re1 if possible.
765
+ if (r >= 0) {
766
+ re1.runes = [r];
767
+ re1.flags = flags;
768
+ return true;
769
+ }
770
+ this.pop();
771
+ this.reuse(re1);
772
+ return false; // did not push r
773
+ }
774
+ // newLiteral returns a new LITERAL Regexp with the given flags
775
+ newLiteral(r, flags) {
776
+ const re = this.newRegexp(Regexp_js_1.Regexp.Op.LITERAL);
777
+ re.flags = flags;
778
+ if ((flags & RE2Flags_js_1.FOLD_CASE) !== 0) {
779
+ r = Parser.minFoldRune(r);
780
+ }
781
+ re.runes = [r];
782
+ return re;
783
+ }
784
+ // literal pushes a literal regexp for the rune r onto the stack.
785
+ // Unlike op(), it does not return the pushed regexp.
786
+ literal(r) {
787
+ this.push(this.newLiteral(r, this.flags));
788
+ }
789
+ // op pushes a regexp with the given op onto the stack
790
+ // and returns that regexp.
791
+ op(op) {
792
+ const re = this.newRegexp(op);
793
+ re.flags = this.flags;
794
+ return this.push(re);
795
+ }
796
+ // repeat replaces the top stack element with itself repeated according to
797
+ // op, min, max. beforePos is the start position of the repetition operator.
798
+ // Pre: t is positioned after the initial repetition operator.
799
+ // Post: t advances past an optional perl-mode '?', or stays put.
800
+ // Or, it fails with RE2JSSyntaxException.
801
+ repeat(op, min, max, beforePos, t, lastRepeatPos) {
802
+ let flags = this.flags;
803
+ if (t.more() && t.lookingAt("?")) {
804
+ t.skip(1);
805
+ flags ^= RE2Flags_js_1.NON_GREEDY;
806
+ }
807
+ if (lastRepeatPos !== -1) {
808
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_OP, t.from(lastRepeatPos));
809
+ }
810
+ const n = this.stack.length;
811
+ if (n === 0) {
812
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_MISSING_REPEAT_ARGUMENT, t.from(beforePos));
813
+ }
814
+ const sub = this.stack[n - 1];
815
+ if (Regexp_js_1.Regexp.isPseudoOp(sub.op)) {
816
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_MISSING_REPEAT_ARGUMENT, t.from(beforePos));
817
+ }
818
+ const re = this.newRegexp(op);
819
+ re.min = min;
820
+ re.max = max;
821
+ re.flags = flags;
822
+ re.subs = [sub];
823
+ this.stack[n - 1] = re;
824
+ this.checkLimits(re);
825
+ if (op === Regexp_js_1.Regexp.Op.REPEAT &&
826
+ (min >= 2 || max >= 2) &&
827
+ !this.repeatIsValid(re, 1000)) {
828
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_REPEAT_SIZE, t.from(beforePos));
829
+ }
830
+ }
831
+ // repeatIsValid reports whether the repetition re is valid.
832
+ // Valid means that the combination of the top-level repetition
833
+ // and any inner repetitions does not exceed n copies of the
834
+ // innermost thing.
835
+ // This function rewalks the regexp tree and is called for every repetition,
836
+ // so we have to worry about inducing quadratic behavior in the parser.
837
+ // We avoid this by only calling repeatIsValid when min or max >= 2.
838
+ // In that case the depth of any >= 2 nesting can only get to 9 without
839
+ // triggering a parse error, so each subtree can only be rewalked 9 times.
840
+ repeatIsValid(re, n) {
841
+ if (re.op === Regexp_js_1.Regexp.Op.REPEAT) {
842
+ let m = re.max;
843
+ if (m === 0) {
844
+ return true;
845
+ }
846
+ if (m < 0) {
847
+ m = re.min;
848
+ }
849
+ if (m > n) {
850
+ return false;
851
+ }
852
+ if (m > 0) {
853
+ n = Math.trunc(n / m);
854
+ }
855
+ }
856
+ for (let sub of re.subs) {
857
+ if (!this.repeatIsValid(sub, n)) {
858
+ return false;
859
+ }
860
+ }
861
+ return true;
862
+ }
863
+ // concat replaces the top of the stack (above the topmost '|' or '(') with
864
+ // its concatenation.
865
+ concat() {
866
+ this.maybeConcat(-1, 0);
867
+ const subs = this.popToPseudo();
868
+ if (subs.length === 0) {
869
+ return this.push(this.newRegexp(Regexp_js_1.Regexp.Op.EMPTY_MATCH));
870
+ }
871
+ return this.push(this.collapse(subs, Regexp_js_1.Regexp.Op.CONCAT));
872
+ }
873
+ // alternate replaces the top of the stack (above the topmost '(') with its
874
+ // alternation.
875
+ alternate() {
876
+ // Scan down to find pseudo-operator (.
877
+ // There are no | above (.
878
+ const subs = this.popToPseudo();
879
+ // Make sure top class is clean.
880
+ // All the others already are (see swapVerticalBar).
881
+ if (subs.length > 0) {
882
+ this.cleanAlt(subs[subs.length - 1]);
883
+ }
884
+ // Empty alternate is special case
885
+ // (shouldn't happen but easy to handle).
886
+ if (subs.length === 0) {
887
+ return this.push(this.newRegexp(Regexp_js_1.Regexp.Op.NO_MATCH));
888
+ }
889
+ return this.push(this.collapse(subs, Regexp_js_1.Regexp.Op.ALTERNATE));
890
+ }
891
+ // cleanAlt cleans re for eventual inclusion in an alternation.
892
+ cleanAlt(re) {
893
+ if (re.op === Regexp_js_1.Regexp.Op.CHAR_CLASS) {
894
+ re.runes = new CharClass_js_1.CharClass(re.runes).cleanClass().toArray();
895
+ if (re.runes.length === 2 &&
896
+ re.runes[0] === 0 &&
897
+ re.runes[1] === Unicode_js_1.MAX_RUNE) {
898
+ re.runes = [];
899
+ re.op = Regexp_js_1.Regexp.Op.ANY_CHAR;
900
+ }
901
+ else if (re.runes.length === 4 &&
902
+ re.runes[0] === 0 &&
903
+ re.runes[1] === 0x0a - 1 &&
904
+ re.runes[2] === 0x0a + 1 &&
905
+ re.runes[3] === Unicode_js_1.MAX_RUNE) {
906
+ re.runes = [];
907
+ re.op = Regexp_js_1.Regexp.Op.ANY_CHAR_NOT_NL;
908
+ }
909
+ }
910
+ }
911
+ // collapse returns the result of applying op to subs[start:end].
912
+ // If (sub contains op nodes, they all get hoisted up
913
+ // so that there is never a concat of a concat or an
914
+ // alternate of an alternate.
915
+ collapse(subs, op) {
916
+ if (subs.length === 1) {
917
+ return subs[0];
918
+ }
919
+ // Concatenate subs iff op is same.
920
+ // Compute length in first pass.
921
+ let len = 0;
922
+ for (let sub of subs) {
923
+ len += sub.op === op ? sub.subs.length : 1;
924
+ }
925
+ let newsubs = new Array(len).fill(null);
926
+ let i = 0;
927
+ for (let sub of subs) {
928
+ if (sub.op === op) {
929
+ for (let j = 0; j < sub.subs.length; j++) {
930
+ newsubs[i++] = sub.subs[j];
931
+ }
932
+ this.reuse(sub);
933
+ }
934
+ else {
935
+ newsubs[i++] = sub;
936
+ }
937
+ }
938
+ let re = this.newRegexp(op);
939
+ re.subs = newsubs;
940
+ if (op === Regexp_js_1.Regexp.Op.ALTERNATE) {
941
+ if (re.subs.length === 1) {
942
+ const old = re;
943
+ re = re.subs[0];
944
+ this.reuse(old);
945
+ }
946
+ }
947
+ return re;
948
+ }
949
+ parseInternal() {
950
+ if ((this.flags & RE2Flags_js_1.LITERAL) !== 0) {
951
+ // Trivial parser for literal string.
952
+ return Parser.literalRegexp(this.wholeRegexp, this.flags);
953
+ }
954
+ // Otherwise, must do real work.
955
+ let lastRepeatPos = -1;
956
+ let min = -1;
957
+ let max = -1;
958
+ const t = new StringIterator(this.wholeRegexp);
959
+ while (t.more()) {
960
+ {
961
+ let repeatPos = -1;
962
+ switch (t.peek()) {
963
+ case 0x28:
964
+ if (t.lookingAt("(?")) {
965
+ // Flag changes and non-capturing groups.
966
+ this.parsePerlFlags(t);
967
+ break;
968
+ }
969
+ const lparen = this.op(Regexp_js_1.Regexp.Op.LEFT_PAREN);
970
+ if (lparen === null) {
971
+ throw new Error("op(LEFT_PAREN) unexpectedly returned null");
972
+ }
973
+ lparen.cap = ++this.numCap;
974
+ t.skip(1); // '('
975
+ break;
976
+ case 0x7c:
977
+ this.parseVerticalBar(); // '|'
978
+ t.skip(1); // '|'
979
+ break;
980
+ case 0x29:
981
+ this.parseRightParen();
982
+ t.skip(1); // ')'
983
+ break;
984
+ case 0x5e:
985
+ if ((this.flags & RE2Flags_js_1.ONE_LINE) !== 0) {
986
+ this.op(Regexp_js_1.Regexp.Op.BEGIN_TEXT);
987
+ }
988
+ else {
989
+ this.op(Regexp_js_1.Regexp.Op.BEGIN_LINE);
990
+ }
991
+ t.skip(1); // '^'
992
+ break;
993
+ case 0x24:
994
+ if ((this.flags & RE2Flags_js_1.ONE_LINE) !== 0) {
995
+ const endText = this.op(Regexp_js_1.Regexp.Op.END_TEXT);
996
+ if (endText === null) {
997
+ throw new Error("op(END_TEXT) unexpectedly returned null");
998
+ }
999
+ endText.flags |= RE2Flags_js_1.WAS_DOLLAR;
1000
+ }
1001
+ else {
1002
+ this.op(Regexp_js_1.Regexp.Op.END_LINE);
1003
+ }
1004
+ t.skip(1); // '$'
1005
+ break;
1006
+ case 0x2e:
1007
+ if ((this.flags & RE2Flags_js_1.DOT_NL) !== 0) {
1008
+ this.op(Regexp_js_1.Regexp.Op.ANY_CHAR);
1009
+ }
1010
+ else {
1011
+ this.op(Regexp_js_1.Regexp.Op.ANY_CHAR_NOT_NL);
1012
+ }
1013
+ t.skip(1); // '.'
1014
+ break;
1015
+ case 0x5b:
1016
+ this.parseClass(t);
1017
+ break;
1018
+ case 0x2a:
1019
+ case 0x2b:
1020
+ case 0x3f: {
1021
+ repeatPos = t.pos();
1022
+ let op = null;
1023
+ switch (t.pop()) {
1024
+ case 0x2a:
1025
+ op = Regexp_js_1.Regexp.Op.STAR;
1026
+ break;
1027
+ case 0x2b:
1028
+ op = Regexp_js_1.Regexp.Op.PLUS;
1029
+ break;
1030
+ case 0x3f:
1031
+ op = Regexp_js_1.Regexp.Op.QUEST;
1032
+ break;
1033
+ }
1034
+ if (op === null) {
1035
+ throw new Error("repeat op unexpectedly null");
1036
+ }
1037
+ this.repeat(op, min, max, repeatPos, t, lastRepeatPos);
1038
+ // (min and max are now dead.)
1039
+ break;
1040
+ }
1041
+ case 0x7b: {
1042
+ repeatPos = t.pos();
1043
+ const minMax = Parser.parseRepeat(t);
1044
+ if (minMax < 0) {
1045
+ // If the repeat cannot be parsed, { is a literal.
1046
+ t.rewindTo(repeatPos);
1047
+ this.literal(t.pop()); // '{'
1048
+ break;
1049
+ }
1050
+ min = minMax >> 16;
1051
+ max = ((minMax & Unicode_js_1.MAX_BMP) << 16) >> 16;
1052
+ this.repeat(Regexp_js_1.Regexp.Op.REPEAT, min, max, repeatPos, t, lastRepeatPos);
1053
+ break;
1054
+ }
1055
+ case 0x5c: {
1056
+ const savedPos = t.pos();
1057
+ t.skip(1); // '\\'
1058
+ let handled = false;
1059
+ if (t.more()) {
1060
+ const c = t.pop();
1061
+ switch (c) {
1062
+ case 0x41:
1063
+ this.op(Regexp_js_1.Regexp.Op.BEGIN_TEXT);
1064
+ handled = true;
1065
+ break;
1066
+ case 0x62:
1067
+ this.op(Regexp_js_1.Regexp.Op.WORD_BOUNDARY);
1068
+ handled = true;
1069
+ break;
1070
+ case 0x42:
1071
+ this.op(Regexp_js_1.Regexp.Op.NO_WORD_BOUNDARY);
1072
+ handled = true;
1073
+ break;
1074
+ case 0x43:
1075
+ // any byte; not supported
1076
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, "\\C");
1077
+ case 0x51: {
1078
+ // \Q ... \E: the ... is always literals
1079
+ let lit = t.rest();
1080
+ const i = lit.indexOf("\\E");
1081
+ if (i >= 0) {
1082
+ lit = lit.substring(0, i);
1083
+ t.skipString(lit);
1084
+ t.skipString("\\E");
1085
+ }
1086
+ else {
1087
+ t.skipString(lit);
1088
+ }
1089
+ let j = 0;
1090
+ while (j < lit.length) {
1091
+ const codepoint = (0, chars_js_1.codePointAtOrThrow)(lit, j);
1092
+ this.literal(codepoint);
1093
+ j += (0, Utils_js_1.charCount)(codepoint);
1094
+ }
1095
+ handled = true;
1096
+ break;
1097
+ }
1098
+ case 0x7a:
1099
+ this.op(Regexp_js_1.Regexp.Op.END_TEXT);
1100
+ handled = true;
1101
+ break;
1102
+ default:
1103
+ t.rewindTo(savedPos);
1104
+ break;
1105
+ }
1106
+ }
1107
+ else {
1108
+ t.rewindTo(savedPos);
1109
+ }
1110
+ if (handled)
1111
+ break;
1112
+ const re = this.newRegexp(Regexp_js_1.Regexp.Op.CHAR_CLASS);
1113
+ re.flags = this.flags;
1114
+ // Look for Unicode character group like \p{Han}
1115
+ if (t.lookingAt("\\p") || t.lookingAt("\\P")) {
1116
+ const cc = new CharClass_js_1.CharClass();
1117
+ if (this.parseUnicodeClass(t, cc)) {
1118
+ re.runes = cc.toArray();
1119
+ this.push(re);
1120
+ break;
1121
+ }
1122
+ }
1123
+ // Perl character class escape.
1124
+ const cc = new CharClass_js_1.CharClass();
1125
+ if (this.parsePerlClassEscape(t, cc)) {
1126
+ re.runes = cc.toArray();
1127
+ this.push(re);
1128
+ break;
1129
+ }
1130
+ t.rewindTo(savedPos);
1131
+ this.reuse(re);
1132
+ // Ordinary single-character escape.
1133
+ this.literal(Parser.parseEscape(t));
1134
+ break;
1135
+ }
1136
+ default:
1137
+ this.literal(t.pop());
1138
+ break;
1139
+ }
1140
+ lastRepeatPos = repeatPos;
1141
+ }
1142
+ }
1143
+ this.concat();
1144
+ if (this.swapVerticalBar()) {
1145
+ this.pop(); // pop vertical bar
1146
+ }
1147
+ this.alternate();
1148
+ const n = this.stack.length;
1149
+ if (n !== 1) {
1150
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_MISSING_PAREN, this.wholeRegexp);
1151
+ }
1152
+ this.stack[0].namedGroups = this.namedGroups;
1153
+ return this.stack[0];
1154
+ }
1155
+ // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
1156
+ // like (?i) or (?: or (?i:.
1157
+ // Pre: t at "(?". Post: t after ")".
1158
+ // Sets numCap.
1159
+ parsePerlFlags(t) {
1160
+ const startPos = t.pos();
1161
+ // Check for named captures, first introduced in Python's regexp library.
1162
+ // As usual, there are three slightly different syntaxes:
1163
+ //
1164
+ // (?P<name>expr) the original, introduced by Python
1165
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
1166
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
1167
+ //
1168
+ // Perl 5.10 gave in and implemented the Python version too,
1169
+ // but they claim that the last two are the preferred forms.
1170
+ // PCRE and languages based on it (specifically, PHP and Ruby)
1171
+ // support all three as well. EcmaScript 4 uses only the Python form.
1172
+ //
1173
+ // In both the open source world (via Code Search) and the
1174
+ // Google source tree, (?P<name>expr) and (?<name>expr) are the
1175
+ // dominant forms of named captures and both are supported.
1176
+ if (t.lookingAt("(?P<") || t.lookingAt("(?<")) {
1177
+ // Pull out name.
1178
+ const s = t.rest();
1179
+ const begin = s.charAt(2) === "P" ? 4 : 3;
1180
+ const end = s.indexOf(">");
1181
+ if (end < 0) {
1182
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_NAMED_CAPTURE, s);
1183
+ }
1184
+ const name = s.substring(begin, end); // "name"
1185
+ t.skipString(name);
1186
+ t.skip(begin + 1); // "(?P<>" or "(?<>"
1187
+ if (!Parser.isValidCaptureName(name)) {
1188
+ // "(?P<name>"
1189
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_NAMED_CAPTURE, s.substring(0, end + 1)); // "(?P<name>" or "(?<name>"
1190
+ }
1191
+ // Like ordinary capture, but named.
1192
+ const re = this.op(Regexp_js_1.Regexp.Op.LEFT_PAREN);
1193
+ if (re === null) {
1194
+ throw new Error("op(LEFT_PAREN) unexpectedly returned null");
1195
+ }
1196
+ re.cap = ++this.numCap;
1197
+ if (this.namedGroups.get(name)) {
1198
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_DUPLICATE_NAMED_CAPTURE, name);
1199
+ }
1200
+ this.namedGroups.set(name, this.numCap);
1201
+ re.name = name;
1202
+ return;
1203
+ }
1204
+ // Non-capturing group. Might also twiddle Perl flags.
1205
+ t.skip(2); // "(?"
1206
+ let flags = this.flags;
1207
+ let sign = +1;
1208
+ let sawFlag = false;
1209
+ loop: while (t.more()) {
1210
+ {
1211
+ const c = t.pop();
1212
+ switch (c) {
1213
+ case 0x69:
1214
+ flags |= RE2Flags_js_1.FOLD_CASE;
1215
+ sawFlag = true;
1216
+ break;
1217
+ case 0x6d:
1218
+ flags &= ~RE2Flags_js_1.ONE_LINE;
1219
+ sawFlag = true;
1220
+ break;
1221
+ case 0x73:
1222
+ flags |= RE2Flags_js_1.DOT_NL;
1223
+ sawFlag = true;
1224
+ break;
1225
+ case 0x55:
1226
+ flags |= RE2Flags_js_1.NON_GREEDY;
1227
+ sawFlag = true;
1228
+ break;
1229
+ // Switch to negation.
1230
+ case 0x2d:
1231
+ if (sign < 0) {
1232
+ break loop;
1233
+ }
1234
+ sign = -1;
1235
+ // Invert flags so that | above turn into &~ and vice versa.
1236
+ // We'll invert flags again before using it below.
1237
+ flags = ~flags;
1238
+ sawFlag = false;
1239
+ break;
1240
+ // End of flags, starting group or not.
1241
+ case 0x3a:
1242
+ case 0x29:
1243
+ if (sign < 0) {
1244
+ if (!sawFlag) {
1245
+ break loop;
1246
+ }
1247
+ flags = ~flags;
1248
+ }
1249
+ if (c === 0x3a) {
1250
+ // Open new group
1251
+ this.op(Regexp_js_1.Regexp.Op.LEFT_PAREN);
1252
+ }
1253
+ this.flags = flags;
1254
+ return;
1255
+ default:
1256
+ // Flags.
1257
+ break loop;
1258
+ }
1259
+ }
1260
+ }
1261
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_PERL_OP, t.from(startPos));
1262
+ }
1263
+ // parseVerticalBar handles a | in the input.
1264
+ parseVerticalBar() {
1265
+ this.concat();
1266
+ // The concatenation we just parsed is on top of the stack.
1267
+ // If it sits above an opVerticalBar, swap it below
1268
+ // (things below an opVerticalBar become an alternation).
1269
+ // Otherwise, push a new vertical bar.
1270
+ if (!this.swapVerticalBar()) {
1271
+ this.op(Regexp_js_1.Regexp.Op.VERTICAL_BAR);
1272
+ }
1273
+ }
1274
+ // If the top of the stack is an element followed by an opVerticalBar
1275
+ // swapVerticalBar swaps the two and returns true.
1276
+ // Otherwise it returns false.
1277
+ swapVerticalBar() {
1278
+ const n = this.stack.length;
1279
+ // If above and below vertical bar are literal or char class,
1280
+ // can merge into a single char class.
1281
+ if (n >= 3 &&
1282
+ this.stack[n - 2].op === Regexp_js_1.Regexp.Op.VERTICAL_BAR &&
1283
+ Parser.isCharClass(this.stack[n - 1]) &&
1284
+ Parser.isCharClass(this.stack[n - 3])) {
1285
+ let re1 = this.stack[n - 1];
1286
+ let re3 = this.stack[n - 3];
1287
+ // Make re3 the more complex of the two.
1288
+ if (re1.op > re3.op) {
1289
+ const tmp = re3;
1290
+ re3 = re1;
1291
+ re1 = tmp;
1292
+ this.stack[n - 3] = re3;
1293
+ }
1294
+ Parser.mergeCharClass(re3, re1);
1295
+ this.reuse(re1);
1296
+ this.pop();
1297
+ return true;
1298
+ }
1299
+ if (n >= 2) {
1300
+ const re1 = this.stack[n - 1];
1301
+ const re2 = this.stack[n - 2];
1302
+ if (re2.op === Regexp_js_1.Regexp.Op.VERTICAL_BAR) {
1303
+ if (n >= 3) {
1304
+ // Now out of reach.
1305
+ // Clean opportunistically.
1306
+ this.cleanAlt(this.stack[n - 3]);
1307
+ }
1308
+ this.stack[n - 2] = re1;
1309
+ this.stack[n - 1] = re2;
1310
+ return true;
1311
+ }
1312
+ }
1313
+ return false;
1314
+ }
1315
+ // parseRightParen handles a ')' in the input.
1316
+ parseRightParen() {
1317
+ this.concat();
1318
+ if (this.swapVerticalBar()) {
1319
+ this.pop(); // pop vertical bar
1320
+ }
1321
+ this.alternate();
1322
+ const n = this.stack.length;
1323
+ if (n < 2) {
1324
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
1325
+ }
1326
+ const re1 = this.pop();
1327
+ if (re1 === undefined) {
1328
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_BAD_EXPRESSION, this.wholeRegexp);
1329
+ }
1330
+ const re2 = this.pop();
1331
+ if (re2 === undefined || re2.op !== Regexp_js_1.Regexp.Op.LEFT_PAREN) {
1332
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_UNEXPECTED_PAREN, this.wholeRegexp);
1333
+ }
1334
+ // Restore flags at time of paren.
1335
+ this.flags = re2.flags;
1336
+ if (re2.cap === 0) {
1337
+ // Just for grouping.
1338
+ this.push(re1);
1339
+ }
1340
+ else {
1341
+ re2.op = Regexp_js_1.Regexp.Op.CAPTURE;
1342
+ re2.subs = [re1];
1343
+ this.push(re2);
1344
+ }
1345
+ }
1346
+ // parsePerlClassEscape parses a leading Perl character class escape like \d
1347
+ // from the beginning of |t|. If one is present, it appends the characters
1348
+ // to cc and returns true. The iterator is advanced past the escape
1349
+ // on success, undefined on failure, in which case false is returned.
1350
+ parsePerlClassEscape(t, cc) {
1351
+ const beforePos = t.pos();
1352
+ if (!t.more() || t.pop() !== 0x5c || !t.more()) {
1353
+ return false;
1354
+ }
1355
+ t.pop(); // e.g. advance past 'd' in "\\d"
1356
+ const p = t.from(beforePos);
1357
+ const g = (0, CharGroup_js_1.getPerlGroups)().get(p);
1358
+ if (g === undefined) {
1359
+ return false;
1360
+ }
1361
+ cc.appendGroup(g, (this.flags & RE2Flags_js_1.FOLD_CASE) !== 0);
1362
+ return true;
1363
+ }
1364
+ // parseNamedClass parses a leading POSIX named character class like
1365
+ // [:alnum:] from the beginning of t. If one is present, it appends the
1366
+ // characters to cc, advances the iterator, and returns true.
1367
+ // Pre: t at "[:". Post: t after ":]".
1368
+ // On failure (no class of than name), throws RE2JSSyntaxException.
1369
+ // On misparse, returns false; t.pos() is undefined.
1370
+ parseNamedClass(t, cc) {
1371
+ // (Go precondition check deleted.)
1372
+ const cls = t.rest();
1373
+ const i = cls.indexOf(":]");
1374
+ if (i < 0) {
1375
+ return false;
1376
+ }
1377
+ const name = cls.substring(0, i + 2); // "[:alnum:]"
1378
+ t.skipString(name);
1379
+ const g = (0, CharGroup_js_1.getPosixGroups)().get(name);
1380
+ if (g === undefined) {
1381
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, name);
1382
+ }
1383
+ cc.appendGroup(g, (this.flags & RE2Flags_js_1.FOLD_CASE) !== 0);
1384
+ return true;
1385
+ }
1386
+ // parseUnicodeClass() parses a leading Unicode character class like \p{Han}
1387
+ // from the beginning of t. If one is present, it appends the characters to
1388
+ // to |cc|, advances |t| and returns true.
1389
+ //
1390
+ // Returns false if such a pattern is not present or UNICODE_GROUPS
1391
+ // flag is not enabled; |t.pos()| is not advanced in this case.
1392
+ // Indicates error by throwing RE2JSSyntaxException.
1393
+ parseUnicodeClass(t, cc) {
1394
+ const startPos = t.pos();
1395
+ if ((this.flags & RE2Flags_js_1.UNICODE_GROUPS) === 0 ||
1396
+ (!t.lookingAt("\\p") && !t.lookingAt("\\P"))) {
1397
+ return false;
1398
+ }
1399
+ t.skip(1); // '\\'
1400
+ // Committed to parse or throw exception.
1401
+ let sign = +1;
1402
+ let c = t.pop(); // 'p' or 'P'
1403
+ if (c === 0x50) {
1404
+ sign = -1;
1405
+ }
1406
+ if (!t.more()) {
1407
+ t.rewindTo(startPos);
1408
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest());
1409
+ }
1410
+ c = t.pop();
1411
+ let name;
1412
+ if (c !== 0x7b) {
1413
+ // Single-letter name.
1414
+ name = (0, Utils_js_1.runeToString)(c);
1415
+ }
1416
+ else {
1417
+ // Name is in braces.
1418
+ const rest = t.rest();
1419
+ const end = rest.indexOf("}");
1420
+ if (end < 0) {
1421
+ t.rewindTo(startPos);
1422
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest());
1423
+ }
1424
+ name = rest.substring(0, end); // e.g. "Han"
1425
+ t.skipString(name);
1426
+ t.skip(1);
1427
+ // Don't use skip(end) because it assumes UTF-16 coding, and
1428
+ // StringIterator doesn't guarantee that.
1429
+ }
1430
+ // Group can have leading negation too.
1431
+ // \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1432
+ if (!(name.length === 0) && name.codePointAt(0) === 0x5e) {
1433
+ sign = 0 - sign; // -sign
1434
+ name = name.substring(1);
1435
+ }
1436
+ const pair = Parser.unicodeTable(name);
1437
+ if (pair === null) {
1438
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(startPos));
1439
+ }
1440
+ if (pair.sign < 0) {
1441
+ sign = 0 - sign;
1442
+ }
1443
+ const tab = pair.tab;
1444
+ const fold = pair.fold; // fold-equivalent table
1445
+ if (tab === null) {
1446
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(startPos));
1447
+ }
1448
+ // Variation of CharClass.appendGroup() for tables.
1449
+ if ((this.flags & RE2Flags_js_1.FOLD_CASE) === 0 || fold === null) {
1450
+ cc.appendTableWithSign(tab, sign);
1451
+ }
1452
+ else {
1453
+ // Merge and clean tab and fold in a temporary buffer.
1454
+ // This is necessary for the negative case and just tidy
1455
+ // for the positive case.
1456
+ const tmp = new CharClass_js_1.CharClass()
1457
+ .appendTable(tab)
1458
+ .appendTable(fold)
1459
+ .cleanClass()
1460
+ .toArray();
1461
+ cc.appendClassWithSign(tmp, sign);
1462
+ }
1463
+ return true;
1464
+ }
1465
+ // parseClass parses a character class and pushes it onto the parse stack.
1466
+ //
1467
+ // NOTES:
1468
+ // Pre: at '['; Post: after ']'.
1469
+ // Mutates stack. Advances iterator. May throw.
1470
+ parseClass(t) {
1471
+ const startPos = t.pos();
1472
+ t.skip(1); // '['
1473
+ const re = this.newRegexp(Regexp_js_1.Regexp.Op.CHAR_CLASS);
1474
+ re.flags = this.flags;
1475
+ const cc = new CharClass_js_1.CharClass();
1476
+ let sign = +1;
1477
+ if (t.more() && t.lookingAt("^")) {
1478
+ sign = -1;
1479
+ t.skip(1); // '^'
1480
+ // If character class does not match \n, add it here,
1481
+ // so that negation later will do the right thing.
1482
+ if ((this.flags & RE2Flags_js_1.CLASS_NL) === 0) {
1483
+ cc.appendRange(0x0a, 0x0a);
1484
+ }
1485
+ }
1486
+ let first = true; // ']' and '-' are okay as first char in class
1487
+ while (!t.more() || t.peek() !== 0x5d || first) {
1488
+ first = false;
1489
+ const beforePos = t.pos();
1490
+ // Look for POSIX [:alnum:] etc.
1491
+ if (t.lookingAt("[:")) {
1492
+ if (this.parseNamedClass(t, cc)) {
1493
+ continue;
1494
+ }
1495
+ t.rewindTo(beforePos);
1496
+ }
1497
+ // Look for Unicode character group like \p{Han}.
1498
+ if (this.parseUnicodeClass(t, cc)) {
1499
+ continue;
1500
+ }
1501
+ // Look for Perl character class symbols (extension).
1502
+ if (this.parsePerlClassEscape(t, cc)) {
1503
+ continue;
1504
+ }
1505
+ t.rewindTo(beforePos);
1506
+ // Single character or simple range.
1507
+ const lo = Parser.parseClassChar(t, startPos);
1508
+ let hi = lo;
1509
+ if (t.more() && t.lookingAt("-")) {
1510
+ t.skip(1);
1511
+ if (t.more() && t.lookingAt("]")) {
1512
+ // [a-] means (a|-) so check for final ].
1513
+ t.skip(-1);
1514
+ }
1515
+ else {
1516
+ hi = Parser.parseClassChar(t, startPos);
1517
+ if (hi < lo) {
1518
+ throw new exceptions_js_1.RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.from(beforePos));
1519
+ }
1520
+ }
1521
+ }
1522
+ if ((this.flags & RE2Flags_js_1.FOLD_CASE) === 0) {
1523
+ cc.appendRange(lo, hi);
1524
+ }
1525
+ else {
1526
+ cc.appendFoldedRange(lo, hi);
1527
+ }
1528
+ }
1529
+ t.skip(1); // ']'
1530
+ cc.cleanClass();
1531
+ if (sign < 0) {
1532
+ cc.negateClass();
1533
+ }
1534
+ re.runes = cc.toArray();
1535
+ this.push(re);
1536
+ }
1537
+ }
1538
+ exports.Parser = Parser;