grpc 1.31.0.pre1 → 1.31.0.pre2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of grpc might be problematic. Click here for more details.

Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/Makefile +2 -2
  3. data/src/core/ext/filters/client_channel/lb_policy/weighted_target/weighted_target.cc +3 -4
  4. data/src/core/ext/filters/client_channel/lb_policy/xds/xds_routing.cc +5 -4
  5. data/src/ruby/lib/grpc/version.rb +1 -1
  6. data/third_party/re2/re2/bitmap256.h +117 -0
  7. data/third_party/re2/re2/bitstate.cc +385 -0
  8. data/third_party/re2/re2/compile.cc +1279 -0
  9. data/third_party/re2/re2/dfa.cc +2130 -0
  10. data/third_party/re2/re2/filtered_re2.cc +121 -0
  11. data/third_party/re2/re2/filtered_re2.h +109 -0
  12. data/third_party/re2/re2/mimics_pcre.cc +197 -0
  13. data/third_party/re2/re2/nfa.cc +713 -0
  14. data/third_party/re2/re2/onepass.cc +623 -0
  15. data/third_party/re2/re2/parse.cc +2464 -0
  16. data/third_party/re2/re2/perl_groups.cc +119 -0
  17. data/third_party/re2/re2/pod_array.h +55 -0
  18. data/third_party/re2/re2/prefilter.cc +710 -0
  19. data/third_party/re2/re2/prefilter.h +108 -0
  20. data/third_party/re2/re2/prefilter_tree.cc +407 -0
  21. data/third_party/re2/re2/prefilter_tree.h +139 -0
  22. data/third_party/re2/re2/prog.cc +988 -0
  23. data/third_party/re2/re2/prog.h +436 -0
  24. data/third_party/re2/re2/re2.cc +1362 -0
  25. data/third_party/re2/re2/re2.h +1002 -0
  26. data/third_party/re2/re2/regexp.cc +980 -0
  27. data/third_party/re2/re2/regexp.h +659 -0
  28. data/third_party/re2/re2/set.cc +154 -0
  29. data/third_party/re2/re2/set.h +80 -0
  30. data/third_party/re2/re2/simplify.cc +657 -0
  31. data/third_party/re2/re2/sparse_array.h +392 -0
  32. data/third_party/re2/re2/sparse_set.h +264 -0
  33. data/third_party/re2/re2/stringpiece.cc +65 -0
  34. data/third_party/re2/re2/stringpiece.h +210 -0
  35. data/third_party/re2/re2/tostring.cc +351 -0
  36. data/third_party/re2/re2/unicode_casefold.cc +582 -0
  37. data/third_party/re2/re2/unicode_casefold.h +78 -0
  38. data/third_party/re2/re2/unicode_groups.cc +6269 -0
  39. data/third_party/re2/re2/unicode_groups.h +67 -0
  40. data/third_party/re2/re2/walker-inl.h +246 -0
  41. data/third_party/re2/util/benchmark.h +156 -0
  42. data/third_party/re2/util/flags.h +26 -0
  43. data/third_party/re2/util/logging.h +109 -0
  44. data/third_party/re2/util/malloc_counter.h +19 -0
  45. data/third_party/re2/util/mix.h +41 -0
  46. data/third_party/re2/util/mutex.h +148 -0
  47. data/third_party/re2/util/pcre.cc +1025 -0
  48. data/third_party/re2/util/pcre.h +681 -0
  49. data/third_party/re2/util/rune.cc +260 -0
  50. data/third_party/re2/util/strutil.cc +149 -0
  51. data/third_party/re2/util/strutil.h +21 -0
  52. data/third_party/re2/util/test.h +50 -0
  53. data/third_party/re2/util/utf.h +44 -0
  54. data/third_party/re2/util/util.h +42 -0
  55. metadata +78 -29
@@ -0,0 +1,2464 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression parser.
6
+
7
+ // The parser is a simple precedence-based parser with a
8
+ // manual stack. The parsing work is done by the methods
9
+ // of the ParseState class. The Regexp::Parse function is
10
+ // essentially just a lexer that calls the ParseState method
11
+ // for each token.
12
+
13
+ // The parser recognizes POSIX extended regular expressions
14
+ // excluding backreferences, collating elements, and collating
15
+ // classes. It also allows the empty string as a regular expression
16
+ // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17
+ // See regexp.h for rationale.
18
+
19
+ #include <ctype.h>
20
+ #include <stddef.h>
21
+ #include <stdint.h>
22
+ #include <string.h>
23
+ #include <algorithm>
24
+ #include <map>
25
+ #include <string>
26
+ #include <vector>
27
+
28
+ #include "util/util.h"
29
+ #include "util/logging.h"
30
+ #include "util/strutil.h"
31
+ #include "util/utf.h"
32
+ #include "re2/pod_array.h"
33
+ #include "re2/regexp.h"
34
+ #include "re2/stringpiece.h"
35
+ #include "re2/unicode_casefold.h"
36
+ #include "re2/unicode_groups.h"
37
+ #include "re2/walker-inl.h"
38
+
39
+ #if defined(RE2_USE_ICU)
40
+ #include "unicode/uniset.h"
41
+ #include "unicode/unistr.h"
42
+ #include "unicode/utypes.h"
43
+ #endif
44
+
45
+ namespace re2 {
46
+
47
// Largest repeat count accepted in a counted repetition such as x{n,m}.
// Fuzzing builds use a limit ten times smaller than regular builds.
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
static const int kMaxRepeat = 1000;
#else
static const int kMaxRepeat = 100;
#endif
53
+
54
+ // Regular expression parse state.
55
+ // The list of parsed regexps so far is maintained as a vector of
56
+ // Regexp pointers called the stack. Left parenthesis and vertical
57
+ // bar markers are also placed on the stack, as Regexps with
58
+ // non-standard opcodes.
59
+ // Scanning a left parenthesis causes the parser to push a left parenthesis
60
+ // marker on the stack.
61
+ // Scanning a vertical bar causes the parser to pop the stack until it finds a
62
+ // vertical bar or left parenthesis marker (not popping the marker),
63
+ // concatenate all the popped results, and push them back on
64
+ // the stack (DoConcatenation).
65
+ // Scanning a right parenthesis causes the parser to act as though it
66
+ // has seen a vertical bar, which then leaves the top of the stack in the
67
+ // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
68
+ // The parser pops all this off the stack and creates an alternation of the
69
+ // regexps (DoAlternation).
70
+
71
+ class Regexp::ParseState {
72
+ public:
73
+ ParseState(ParseFlags flags, const StringPiece& whole_regexp,
74
+ RegexpStatus* status);
75
+ ~ParseState();
76
+
77
+ ParseFlags flags() { return flags_; }
78
+ int rune_max() { return rune_max_; }
79
+
80
+ // Parse methods. All public methods return a bool saying
81
+ // whether parsing should continue. If a method returns
82
+ // false, it has set fields in *status_, and the parser
83
+ // should return NULL.
84
+
85
+ // Pushes the given regular expression onto the stack.
86
+ // Could check for too much memory used here.
87
+ bool PushRegexp(Regexp* re);
88
+
89
+ // Pushes the literal rune r onto the stack.
90
+ bool PushLiteral(Rune r);
91
+
92
+ // Pushes a regexp with the given op (and no args) onto the stack.
93
+ bool PushSimpleOp(RegexpOp op);
94
+
95
+ // Pushes a ^ onto the stack.
96
+ bool PushCaret();
97
+
98
+ // Pushes a \b (word == true) or \B (word == false) onto the stack.
99
+ bool PushWordBoundary(bool word);
100
+
101
+ // Pushes a $ onto the stack.
102
+ bool PushDollar();
103
+
104
+ // Pushes a . onto the stack
105
+ bool PushDot();
106
+
107
+ // Pushes a repeat operator regexp onto the stack.
108
+ // A valid argument for the operator must already be on the stack.
109
+ // s is the name of the operator, for use in error messages.
110
+ bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
111
+
112
+ // Pushes a repetition regexp onto the stack.
113
+ // A valid argument for the operator must already be on the stack.
114
+ bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
115
+
116
+ // Checks whether a particular regexp op is a marker.
117
+ bool IsMarker(RegexpOp op);
118
+
119
+ // Processes a left parenthesis in the input.
120
+ // Pushes a marker onto the stack.
121
+ bool DoLeftParen(const StringPiece& name);
122
+ bool DoLeftParenNoCapture();
123
+
124
+ // Processes a vertical bar in the input.
125
+ bool DoVerticalBar();
126
+
127
+ // Processes a right parenthesis in the input.
128
+ bool DoRightParen();
129
+
130
+ // Processes the end of input, returning the final regexp.
131
+ Regexp* DoFinish();
132
+
133
+ // Finishes the regexp if necessary, preparing it for use
134
+ // in a more complicated expression.
135
+ // If it is a CharClassBuilder, converts into a CharClass.
136
+ Regexp* FinishRegexp(Regexp*);
137
+
138
+ // These routines don't manipulate the parse stack
139
+ // directly, but they do need to look at flags_.
140
+ // ParseCharClass also manipulates the internals of Regexp
141
+ // while creating *out_re.
142
+
143
+ // Parse a character class into *out_re.
144
+ // Removes parsed text from s.
145
+ bool ParseCharClass(StringPiece* s, Regexp** out_re,
146
+ RegexpStatus* status);
147
+
148
+ // Parse a character class character into *rp.
149
+ // Removes parsed text from s.
150
+ bool ParseCCCharacter(StringPiece* s, Rune *rp,
151
+ const StringPiece& whole_class,
152
+ RegexpStatus* status);
153
+
154
+ // Parse a character class range into rr.
155
+ // Removes parsed text from s.
156
+ bool ParseCCRange(StringPiece* s, RuneRange* rr,
157
+ const StringPiece& whole_class,
158
+ RegexpStatus* status);
159
+
160
+ // Parse a Perl flag set or non-capturing group from s.
161
+ bool ParsePerlFlags(StringPiece* s);
162
+
163
+
164
+ // Finishes the current concatenation,
165
+ // collapsing it into a single regexp on the stack.
166
+ void DoConcatenation();
167
+
168
+ // Finishes the current alternation,
169
+ // collapsing it to a single regexp on the stack.
170
+ void DoAlternation();
171
+
172
+ // Generalized DoAlternation/DoConcatenation.
173
+ void DoCollapse(RegexpOp op);
174
+
175
+ // Maybe concatenate Literals into LiteralString.
176
+ bool MaybeConcatString(int r, ParseFlags flags);
177
+
178
+ private:
179
+ ParseFlags flags_;
180
+ StringPiece whole_regexp_;
181
+ RegexpStatus* status_;
182
+ Regexp* stacktop_;
183
+ int ncap_; // number of capturing parens seen
184
+ int rune_max_; // maximum char value for this encoding
185
+
186
+ ParseState(const ParseState&) = delete;
187
+ ParseState& operator=(const ParseState&) = delete;
188
+ };
189
+
190
+ // Pseudo-operators - only on parse stack.
191
+ const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
192
+ const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
193
+
194
+ Regexp::ParseState::ParseState(ParseFlags flags,
195
+ const StringPiece& whole_regexp,
196
+ RegexpStatus* status)
197
+ : flags_(flags), whole_regexp_(whole_regexp),
198
+ status_(status), stacktop_(NULL), ncap_(0) {
199
+ if (flags_ & Latin1)
200
+ rune_max_ = 0xFF;
201
+ else
202
+ rune_max_ = Runemax;
203
+ }
204
+
205
+ // Cleans up by freeing all the regexps on the stack.
206
+ Regexp::ParseState::~ParseState() {
207
+ Regexp* next;
208
+ for (Regexp* re = stacktop_; re != NULL; re = next) {
209
+ next = re->down_;
210
+ re->down_ = NULL;
211
+ if (re->op() == kLeftParen)
212
+ delete re->name_;
213
+ re->Decref();
214
+ }
215
+ }
216
+
217
+ // Finishes the regexp if necessary, preparing it for use in
218
+ // a more complex expression.
219
+ // If it is a CharClassBuilder, converts into a CharClass.
220
+ Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
221
+ if (re == NULL)
222
+ return NULL;
223
+ re->down_ = NULL;
224
+
225
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
226
+ CharClassBuilder* ccb = re->ccb_;
227
+ re->ccb_ = NULL;
228
+ re->cc_ = ccb->GetCharClass();
229
+ delete ccb;
230
+ }
231
+
232
+ return re;
233
+ }
234
+
235
+ // Pushes the given regular expression onto the stack.
236
+ // Could check for too much memory used here.
237
+ bool Regexp::ParseState::PushRegexp(Regexp* re) {
238
+ MaybeConcatString(-1, NoParseFlags);
239
+
240
+ // Special case: a character class of one character is just
241
+ // a literal. This is a common idiom for escaping
242
+ // single characters (e.g., [.] instead of \.), and some
243
+ // analysis does better with fewer character classes.
244
+ // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
245
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
246
+ re->ccb_->RemoveAbove(rune_max_);
247
+ if (re->ccb_->size() == 1) {
248
+ Rune r = re->ccb_->begin()->lo;
249
+ re->Decref();
250
+ re = new Regexp(kRegexpLiteral, flags_);
251
+ re->rune_ = r;
252
+ } else if (re->ccb_->size() == 2) {
253
+ Rune r = re->ccb_->begin()->lo;
254
+ if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
255
+ re->Decref();
256
+ re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
257
+ re->rune_ = r + 'a' - 'A';
258
+ }
259
+ }
260
+ }
261
+
262
+ if (!IsMarker(re->op()))
263
+ re->simple_ = re->ComputeSimple();
264
+ re->down_ = stacktop_;
265
+ stacktop_ = re;
266
+ return true;
267
+ }
268
+
269
+ // Searches the case folding tables and returns the CaseFold* that contains r.
270
+ // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
271
+ // If there isn't one, returns NULL.
272
+ const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
273
+ const CaseFold* ef = f + n;
274
+
275
+ // Binary search for entry containing r.
276
+ while (n > 0) {
277
+ int m = n/2;
278
+ if (f[m].lo <= r && r <= f[m].hi)
279
+ return &f[m];
280
+ if (r < f[m].lo) {
281
+ n = m;
282
+ } else {
283
+ f += m+1;
284
+ n -= m+1;
285
+ }
286
+ }
287
+
288
+ // There is no entry that contains r, but f points
289
+ // where it would have been. Unless f points at
290
+ // the end of the array, it points at the next entry
291
+ // after r.
292
+ if (f < ef)
293
+ return f;
294
+
295
+ // No entry contains r; no entry contains runes > r.
296
+ return NULL;
297
+ }
298
+
299
+ // Returns the result of applying the fold f to the rune r.
300
+ Rune ApplyFold(const CaseFold *f, Rune r) {
301
+ switch (f->delta) {
302
+ default:
303
+ return r + f->delta;
304
+
305
+ case EvenOddSkip: // even <-> odd but only applies to every other
306
+ if ((r - f->lo) % 2)
307
+ return r;
308
+ FALLTHROUGH_INTENDED;
309
+ case EvenOdd: // even <-> odd
310
+ if (r%2 == 0)
311
+ return r + 1;
312
+ return r - 1;
313
+
314
+ case OddEvenSkip: // odd <-> even but only applies to every other
315
+ if ((r - f->lo) % 2)
316
+ return r;
317
+ FALLTHROUGH_INTENDED;
318
+ case OddEven: // odd <-> even
319
+ if (r%2 == 1)
320
+ return r + 1;
321
+ return r - 1;
322
+ }
323
+ }
324
+
325
+ // Returns the next Rune in r's folding cycle (see unicode_casefold.h).
326
+ // Examples:
327
+ // CycleFoldRune('A') = 'a'
328
+ // CycleFoldRune('a') = 'A'
329
+ //
330
+ // CycleFoldRune('K') = 'k'
331
+ // CycleFoldRune('k') = 0x212A (Kelvin)
332
+ // CycleFoldRune(0x212A) = 'K'
333
+ //
334
+ // CycleFoldRune('?') = '?'
335
+ Rune CycleFoldRune(Rune r) {
336
+ const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
337
+ if (f == NULL || r < f->lo)
338
+ return r;
339
+ return ApplyFold(f, r);
340
+ }
341
+
342
+ // Add lo-hi to the class, along with their fold-equivalent characters.
343
+ // If lo-hi is already in the class, assume that the fold-equivalent
344
+ // chars are there too, so there's no work to do.
345
+ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
346
+ // AddFoldedRange calls itself recursively for each rune in the fold cycle.
347
+ // Most folding cycles are small: there aren't any bigger than four in the
348
+ // current Unicode tables. make_unicode_casefold.py checks that
349
+ // the cycles are not too long, and we double-check here using depth.
350
+ if (depth > 10) {
351
+ LOG(DFATAL) << "AddFoldedRange recurses too much.";
352
+ return;
353
+ }
354
+
355
+ if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
356
+ return;
357
+
358
+ while (lo <= hi) {
359
+ const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
360
+ if (f == NULL) // lo has no fold, nor does anything above lo
361
+ break;
362
+ if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
363
+ lo = f->lo;
364
+ continue;
365
+ }
366
+
367
+ // Add in the result of folding the range lo - f->hi
368
+ // and that range's fold, recursively.
369
+ Rune lo1 = lo;
370
+ Rune hi1 = std::min<Rune>(hi, f->hi);
371
+ switch (f->delta) {
372
+ default:
373
+ lo1 += f->delta;
374
+ hi1 += f->delta;
375
+ break;
376
+ case EvenOdd:
377
+ if (lo1%2 == 1)
378
+ lo1--;
379
+ if (hi1%2 == 0)
380
+ hi1++;
381
+ break;
382
+ case OddEven:
383
+ if (lo1%2 == 0)
384
+ lo1--;
385
+ if (hi1%2 == 1)
386
+ hi1++;
387
+ break;
388
+ }
389
+ AddFoldedRange(cc, lo1, hi1, depth+1);
390
+
391
+ // Pick up where this fold left off.
392
+ lo = f->hi + 1;
393
+ }
394
+ }
395
+
396
+ // Pushes the literal rune r onto the stack.
397
+ bool Regexp::ParseState::PushLiteral(Rune r) {
398
+ // Do case folding if needed.
399
+ if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
400
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
401
+ re->ccb_ = new CharClassBuilder;
402
+ Rune r1 = r;
403
+ do {
404
+ if (!(flags_ & NeverNL) || r != '\n') {
405
+ re->ccb_->AddRange(r, r);
406
+ }
407
+ r = CycleFoldRune(r);
408
+ } while (r != r1);
409
+ return PushRegexp(re);
410
+ }
411
+
412
+ // Exclude newline if applicable.
413
+ if ((flags_ & NeverNL) && r == '\n')
414
+ return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
415
+
416
+ // No fancy stuff worked. Ordinary literal.
417
+ if (MaybeConcatString(r, flags_))
418
+ return true;
419
+
420
+ Regexp* re = new Regexp(kRegexpLiteral, flags_);
421
+ re->rune_ = r;
422
+ return PushRegexp(re);
423
+ }
424
+
425
+ // Pushes a ^ onto the stack.
426
+ bool Regexp::ParseState::PushCaret() {
427
+ if (flags_ & OneLine) {
428
+ return PushSimpleOp(kRegexpBeginText);
429
+ }
430
+ return PushSimpleOp(kRegexpBeginLine);
431
+ }
432
+
433
+ // Pushes a \b or \B onto the stack.
434
+ bool Regexp::ParseState::PushWordBoundary(bool word) {
435
+ if (word)
436
+ return PushSimpleOp(kRegexpWordBoundary);
437
+ return PushSimpleOp(kRegexpNoWordBoundary);
438
+ }
439
+
440
+ // Pushes a $ onto the stack.
441
+ bool Regexp::ParseState::PushDollar() {
442
+ if (flags_ & OneLine) {
443
+ // Clumsy marker so that MimicsPCRE() can tell whether
444
+ // this kRegexpEndText was a $ and not a \z.
445
+ Regexp::ParseFlags oflags = flags_;
446
+ flags_ = flags_ | WasDollar;
447
+ bool ret = PushSimpleOp(kRegexpEndText);
448
+ flags_ = oflags;
449
+ return ret;
450
+ }
451
+ return PushSimpleOp(kRegexpEndLine);
452
+ }
453
+
454
+ // Pushes a . onto the stack.
455
+ bool Regexp::ParseState::PushDot() {
456
+ if ((flags_ & DotNL) && !(flags_ & NeverNL))
457
+ return PushSimpleOp(kRegexpAnyChar);
458
+ // Rewrite . into [^\n]
459
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
460
+ re->ccb_ = new CharClassBuilder;
461
+ re->ccb_->AddRange(0, '\n' - 1);
462
+ re->ccb_->AddRange('\n' + 1, rune_max_);
463
+ return PushRegexp(re);
464
+ }
465
+
466
+ // Pushes a regexp with the given op (and no args) onto the stack.
467
+ bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
468
+ Regexp* re = new Regexp(op, flags_);
469
+ return PushRegexp(re);
470
+ }
471
+
472
+ // Pushes a repeat operator regexp onto the stack.
473
+ // A valid argument for the operator must already be on the stack.
474
+ // The char c is the name of the operator, for use in error messages.
475
+ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
476
+ bool nongreedy) {
477
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
478
+ status_->set_code(kRegexpRepeatArgument);
479
+ status_->set_error_arg(s);
480
+ return false;
481
+ }
482
+ Regexp::ParseFlags fl = flags_;
483
+ if (nongreedy)
484
+ fl = fl ^ NonGreedy;
485
+
486
+ // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
487
+ // they're mostly for use during simplification, not during parsing.
488
+ if (op == stacktop_->op() && fl == stacktop_->parse_flags())
489
+ return true;
490
+
491
+ // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
492
+ // op is a repeat, we just have to check that stacktop_->op() is too,
493
+ // then adjust stacktop_.
494
+ if ((stacktop_->op() == kRegexpStar ||
495
+ stacktop_->op() == kRegexpPlus ||
496
+ stacktop_->op() == kRegexpQuest) &&
497
+ fl == stacktop_->parse_flags()) {
498
+ stacktop_->op_ = kRegexpStar;
499
+ return true;
500
+ }
501
+
502
+ Regexp* re = new Regexp(op, fl);
503
+ re->AllocSub(1);
504
+ re->down_ = stacktop_->down_;
505
+ re->sub()[0] = FinishRegexp(stacktop_);
506
+ re->simple_ = re->ComputeSimple();
507
+ stacktop_ = re;
508
+ return true;
509
+ }
510
+
511
+ // RepetitionWalker reports whether the repetition regexp is valid.
512
+ // Valid means that the combination of the top-level repetition
513
+ // and any inner repetitions does not exceed n copies of the
514
+ // innermost thing.
515
+ // This rewalks the regexp tree and is called for every repetition,
516
+ // so we have to worry about inducing quadratic behavior in the parser.
517
+ // We avoid this by only using RepetitionWalker when min or max >= 2.
518
+ // In that case the depth of any >= 2 nesting can only get to 9 without
519
+ // triggering a parse error, so each subtree can only be rewalked 9 times.
520
+ class RepetitionWalker : public Regexp::Walker<int> {
521
+ public:
522
+ RepetitionWalker() {}
523
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
524
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
525
+ int* child_args, int nchild_args);
526
+ virtual int ShortVisit(Regexp* re, int parent_arg);
527
+
528
+ private:
529
+ RepetitionWalker(const RepetitionWalker&) = delete;
530
+ RepetitionWalker& operator=(const RepetitionWalker&) = delete;
531
+ };
532
+
533
+ int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
534
+ int arg = parent_arg;
535
+ if (re->op() == kRegexpRepeat) {
536
+ int m = re->max();
537
+ if (m < 0) {
538
+ m = re->min();
539
+ }
540
+ if (m > 0) {
541
+ arg /= m;
542
+ }
543
+ }
544
+ return arg;
545
+ }
546
+
547
+ int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
548
+ int* child_args, int nchild_args) {
549
+ int arg = pre_arg;
550
+ for (int i = 0; i < nchild_args; i++) {
551
+ if (child_args[i] < arg) {
552
+ arg = child_args[i];
553
+ }
554
+ }
555
+ return arg;
556
+ }
557
+
558
+ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
559
+ // Should never be called: we use Walk(), not WalkExponential().
560
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
561
+ LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
562
+ #endif
563
+ return 0;
564
+ }
565
+
566
+ // Pushes a repetition regexp onto the stack.
567
+ // A valid argument for the operator must already be on the stack.
568
+ bool Regexp::ParseState::PushRepetition(int min, int max,
569
+ const StringPiece& s,
570
+ bool nongreedy) {
571
+ if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) {
572
+ status_->set_code(kRegexpRepeatSize);
573
+ status_->set_error_arg(s);
574
+ return false;
575
+ }
576
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
577
+ status_->set_code(kRegexpRepeatArgument);
578
+ status_->set_error_arg(s);
579
+ return false;
580
+ }
581
+ Regexp::ParseFlags fl = flags_;
582
+ if (nongreedy)
583
+ fl = fl ^ NonGreedy;
584
+ Regexp* re = new Regexp(kRegexpRepeat, fl);
585
+ re->min_ = min;
586
+ re->max_ = max;
587
+ re->AllocSub(1);
588
+ re->down_ = stacktop_->down_;
589
+ re->sub()[0] = FinishRegexp(stacktop_);
590
+ re->simple_ = re->ComputeSimple();
591
+ stacktop_ = re;
592
+ if (min >= 2 || max >= 2) {
593
+ RepetitionWalker w;
594
+ if (w.Walk(stacktop_, kMaxRepeat) == 0) {
595
+ status_->set_code(kRegexpRepeatSize);
596
+ status_->set_error_arg(s);
597
+ return false;
598
+ }
599
+ }
600
+ return true;
601
+ }
602
+
603
+ // Checks whether a particular regexp op is a marker.
604
+ bool Regexp::ParseState::IsMarker(RegexpOp op) {
605
+ return op >= kLeftParen;
606
+ }
607
+
608
+ // Processes a left parenthesis in the input.
609
+ // Pushes a marker onto the stack.
610
+ bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
611
+ Regexp* re = new Regexp(kLeftParen, flags_);
612
+ re->cap_ = ++ncap_;
613
+ if (name.data() != NULL)
614
+ re->name_ = new std::string(name);
615
+ return PushRegexp(re);
616
+ }
617
+
618
+ // Pushes a non-capturing marker onto the stack.
619
+ bool Regexp::ParseState::DoLeftParenNoCapture() {
620
+ Regexp* re = new Regexp(kLeftParen, flags_);
621
+ re->cap_ = -1;
622
+ return PushRegexp(re);
623
+ }
624
+
625
+ // Processes a vertical bar in the input.
626
+ bool Regexp::ParseState::DoVerticalBar() {
627
+ MaybeConcatString(-1, NoParseFlags);
628
+ DoConcatenation();
629
+
630
+ // Below the vertical bar is a list to alternate.
631
+ // Above the vertical bar is a list to concatenate.
632
+ // We just did the concatenation, so either swap
633
+ // the result below the vertical bar or push a new
634
+ // vertical bar on the stack.
635
+ Regexp* r1;
636
+ Regexp* r2;
637
+ if ((r1 = stacktop_) != NULL &&
638
+ (r2 = r1->down_) != NULL &&
639
+ r2->op() == kVerticalBar) {
640
+ Regexp* r3;
641
+ if ((r3 = r2->down_) != NULL &&
642
+ (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
643
+ // AnyChar is above or below the vertical bar. Let it subsume
644
+ // the other when the other is Literal, CharClass or AnyChar.
645
+ if (r3->op() == kRegexpAnyChar &&
646
+ (r1->op() == kRegexpLiteral ||
647
+ r1->op() == kRegexpCharClass ||
648
+ r1->op() == kRegexpAnyChar)) {
649
+ // Discard r1.
650
+ stacktop_ = r2;
651
+ r1->Decref();
652
+ return true;
653
+ }
654
+ if (r1->op() == kRegexpAnyChar &&
655
+ (r3->op() == kRegexpLiteral ||
656
+ r3->op() == kRegexpCharClass ||
657
+ r3->op() == kRegexpAnyChar)) {
658
+ // Rearrange the stack and discard r3.
659
+ r1->down_ = r3->down_;
660
+ r2->down_ = r1;
661
+ stacktop_ = r2;
662
+ r3->Decref();
663
+ return true;
664
+ }
665
+ }
666
+ // Swap r1 below vertical bar (r2).
667
+ r1->down_ = r2->down_;
668
+ r2->down_ = r1;
669
+ stacktop_ = r2;
670
+ return true;
671
+ }
672
+ return PushSimpleOp(kVerticalBar);
673
+ }
674
+
675
+ // Processes a right parenthesis in the input.
676
+ bool Regexp::ParseState::DoRightParen() {
677
+ // Finish the current concatenation and alternation.
678
+ DoAlternation();
679
+
680
+ // The stack should be: LeftParen regexp
681
+ // Remove the LeftParen, leaving the regexp,
682
+ // parenthesized.
683
+ Regexp* r1;
684
+ Regexp* r2;
685
+ if ((r1 = stacktop_) == NULL ||
686
+ (r2 = r1->down_) == NULL ||
687
+ r2->op() != kLeftParen) {
688
+ status_->set_code(kRegexpMissingParen);
689
+ status_->set_error_arg(whole_regexp_);
690
+ return false;
691
+ }
692
+
693
+ // Pop off r1, r2. Will Decref or reuse below.
694
+ stacktop_ = r2->down_;
695
+
696
+ // Restore flags from when paren opened.
697
+ Regexp* re = r2;
698
+ flags_ = re->parse_flags();
699
+
700
+ // Rewrite LeftParen as capture if needed.
701
+ if (re->cap_ > 0) {
702
+ re->op_ = kRegexpCapture;
703
+ // re->cap_ is already set
704
+ re->AllocSub(1);
705
+ re->sub()[0] = FinishRegexp(r1);
706
+ re->simple_ = re->ComputeSimple();
707
+ } else {
708
+ re->Decref();
709
+ re = r1;
710
+ }
711
+ return PushRegexp(re);
712
+ }
713
+
714
+ // Processes the end of input, returning the final regexp.
715
+ Regexp* Regexp::ParseState::DoFinish() {
716
+ DoAlternation();
717
+ Regexp* re = stacktop_;
718
+ if (re != NULL && re->down_ != NULL) {
719
+ status_->set_code(kRegexpMissingParen);
720
+ status_->set_error_arg(whole_regexp_);
721
+ return NULL;
722
+ }
723
+ stacktop_ = NULL;
724
+ return FinishRegexp(re);
725
+ }
726
+
727
+ // Returns the leading regexp that re starts with.
728
+ // The returned Regexp* points into a piece of re,
729
+ // so it must not be used after the caller calls re->Decref().
730
+ Regexp* Regexp::LeadingRegexp(Regexp* re) {
731
+ if (re->op() == kRegexpEmptyMatch)
732
+ return NULL;
733
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
734
+ Regexp** sub = re->sub();
735
+ if (sub[0]->op() == kRegexpEmptyMatch)
736
+ return NULL;
737
+ return sub[0];
738
+ }
739
+ return re;
740
+ }
741
+
742
+ // Removes LeadingRegexp(re) from re and returns what's left.
743
+ // Consumes the reference to re and may edit it in place.
744
+ // If caller wants to hold on to LeadingRegexp(re),
745
+ // must have already Incref'ed it.
746
+ Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
747
+ if (re->op() == kRegexpEmptyMatch)
748
+ return re;
749
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
750
+ Regexp** sub = re->sub();
751
+ if (sub[0]->op() == kRegexpEmptyMatch)
752
+ return re;
753
+ sub[0]->Decref();
754
+ sub[0] = NULL;
755
+ if (re->nsub() == 2) {
756
+ // Collapse concatenation to single regexp.
757
+ Regexp* nre = sub[1];
758
+ sub[1] = NULL;
759
+ re->Decref();
760
+ return nre;
761
+ }
762
+ // 3 or more -> 2 or more.
763
+ re->nsub_--;
764
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
765
+ return re;
766
+ }
767
+ Regexp::ParseFlags pf = re->parse_flags();
768
+ re->Decref();
769
+ return new Regexp(kRegexpEmptyMatch, pf);
770
+ }
771
+
772
+ // Returns the leading string that re starts with.
773
+ // The returned Rune* points into a piece of re,
774
+ // so it must not be used after the caller calls re->Decref().
775
+ Rune* Regexp::LeadingString(Regexp* re, int *nrune,
776
+ Regexp::ParseFlags *flags) {
777
+ while (re->op() == kRegexpConcat && re->nsub() > 0)
778
+ re = re->sub()[0];
779
+
780
+ *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
781
+
782
+ if (re->op() == kRegexpLiteral) {
783
+ *nrune = 1;
784
+ return &re->rune_;
785
+ }
786
+
787
+ if (re->op() == kRegexpLiteralString) {
788
+ *nrune = re->nrunes_;
789
+ return re->runes_;
790
+ }
791
+
792
+ *nrune = 0;
793
+ return NULL;
794
+ }
795
+
796
+ // Removes the first n leading runes from the beginning of re.
797
+ // Edits re in place.
798
+ void Regexp::RemoveLeadingString(Regexp* re, int n) {
799
+ // Chase down concats to find first string.
800
+ // For regexps generated by parser, nested concats are
801
+ // flattened except when doing so would overflow the 16-bit
802
+ // limit on the size of a concatenation, so we should never
803
+ // see more than two here.
804
+ Regexp* stk[4];
805
+ size_t d = 0;
806
+ while (re->op() == kRegexpConcat) {
807
+ if (d < arraysize(stk))
808
+ stk[d++] = re;
809
+ re = re->sub()[0];
810
+ }
811
+
812
+ // Remove leading string from re.
813
+ if (re->op() == kRegexpLiteral) {
814
+ re->rune_ = 0;
815
+ re->op_ = kRegexpEmptyMatch;
816
+ } else if (re->op() == kRegexpLiteralString) {
817
+ if (n >= re->nrunes_) {
818
+ delete[] re->runes_;
819
+ re->runes_ = NULL;
820
+ re->nrunes_ = 0;
821
+ re->op_ = kRegexpEmptyMatch;
822
+ } else if (n == re->nrunes_ - 1) {
823
+ Rune rune = re->runes_[re->nrunes_ - 1];
824
+ delete[] re->runes_;
825
+ re->runes_ = NULL;
826
+ re->nrunes_ = 0;
827
+ re->rune_ = rune;
828
+ re->op_ = kRegexpLiteral;
829
+ } else {
830
+ re->nrunes_ -= n;
831
+ memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
832
+ }
833
+ }
834
+
835
+ // If re is now empty, concatenations might simplify too.
836
+ while (d > 0) {
837
+ re = stk[--d];
838
+ Regexp** sub = re->sub();
839
+ if (sub[0]->op() == kRegexpEmptyMatch) {
840
+ sub[0]->Decref();
841
+ sub[0] = NULL;
842
+ // Delete first element of concat.
843
+ switch (re->nsub()) {
844
+ case 0:
845
+ case 1:
846
+ // Impossible.
847
+ LOG(DFATAL) << "Concat of " << re->nsub();
848
+ re->submany_ = NULL;
849
+ re->op_ = kRegexpEmptyMatch;
850
+ break;
851
+
852
+ case 2: {
853
+ // Replace re with sub[1].
854
+ Regexp* old = sub[1];
855
+ sub[1] = NULL;
856
+ re->Swap(old);
857
+ old->Decref();
858
+ break;
859
+ }
860
+
861
+ default:
862
+ // Slide down.
863
+ re->nsub_--;
864
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
865
+ break;
866
+ }
867
+ }
868
+ }
869
+ }
870
+
871
+ // In the context of factoring alternations, a Splice is: a factored prefix or
872
+ // merged character class computed by one iteration of one round of factoring;
873
+ // the span of subexpressions of the alternation to be "spliced" (i.e. removed
874
+ // and replaced); and, for a factored prefix, the number of suffixes after any
875
+ // factoring that might have subsequently been performed on them. For a merged
876
+ // character class, there are no suffixes, of course, so the field is ignored.
877
+ struct Splice {
878
+ Splice(Regexp* prefix, Regexp** sub, int nsub)
879
+ : prefix(prefix),
880
+ sub(sub),
881
+ nsub(nsub),
882
+ nsuffix(-1) {}
883
+
884
+ Regexp* prefix;
885
+ Regexp** sub;
886
+ int nsub;
887
+ int nsuffix;
888
+ };
889
+
890
+ // Named so because it is used to implement an explicit stack, a Frame is: the
891
+ // span of subexpressions of the alternation to be factored; the current round
892
+ // of factoring; any Splices computed; and, for a factored prefix, an iterator
893
+ // to the next Splice to be factored (i.e. in another Frame) because suffixes.
894
+ struct Frame {
895
+ Frame(Regexp** sub, int nsub)
896
+ : sub(sub),
897
+ nsub(nsub),
898
+ round(0) {}
899
+
900
+ Regexp** sub;
901
+ int nsub;
902
+ int round;
903
+ std::vector<Splice> splices;
904
+ int spliceidx;
905
+ };
906
+
907
+ // Bundled into a class for friend access to Regexp without needing to declare
908
+ // (or define) Splice in regexp.h.
909
+ class FactorAlternationImpl {
910
+ public:
911
+ static void Round1(Regexp** sub, int nsub,
912
+ Regexp::ParseFlags flags,
913
+ std::vector<Splice>* splices);
914
+ static void Round2(Regexp** sub, int nsub,
915
+ Regexp::ParseFlags flags,
916
+ std::vector<Splice>* splices);
917
+ static void Round3(Regexp** sub, int nsub,
918
+ Regexp::ParseFlags flags,
919
+ std::vector<Splice>* splices);
920
+ };
921
+
922
+ // Factors common prefixes from alternation.
923
+ // For example,
924
+ // ABC|ABD|AEF|BCX|BCY
925
+ // simplifies to
926
+ // A(B(C|D)|EF)|BC(X|Y)
927
+ // and thence to
928
+ // A(B[CD]|EF)|BC[XY]
929
+ //
930
+ // Rewrites sub to contain simplified list to alternate and returns
931
+ // the new length of sub. Adjusts reference counts accordingly
932
+ // (incoming sub[i] decremented, outgoing sub[i] incremented).
933
+ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
934
+ std::vector<Frame> stk;
935
+ stk.emplace_back(sub, nsub);
936
+
937
+ for (;;) {
938
+ auto& sub = stk.back().sub;
939
+ auto& nsub = stk.back().nsub;
940
+ auto& round = stk.back().round;
941
+ auto& splices = stk.back().splices;
942
+ auto& spliceidx = stk.back().spliceidx;
943
+
944
+ if (splices.empty()) {
945
+ // Advance to the next round of factoring. Note that this covers
946
+ // the initialised state: when splices is empty and round is 0.
947
+ round++;
948
+ } else if (spliceidx < static_cast<int>(splices.size())) {
949
+ // We have at least one more Splice to factor. Recurse logically.
950
+ stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
951
+ continue;
952
+ } else {
953
+ // We have no more Splices to factor. Apply them.
954
+ auto iter = splices.begin();
955
+ int out = 0;
956
+ for (int i = 0; i < nsub; ) {
957
+ // Copy until we reach where the next Splice begins.
958
+ while (sub + i < iter->sub)
959
+ sub[out++] = sub[i++];
960
+ switch (round) {
961
+ case 1:
962
+ case 2: {
963
+ // Assemble the Splice prefix and the suffixes.
964
+ Regexp* re[2];
965
+ re[0] = iter->prefix;
966
+ re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags);
967
+ sub[out++] = Regexp::Concat(re, 2, flags);
968
+ i += iter->nsub;
969
+ break;
970
+ }
971
+ case 3:
972
+ // Just use the Splice prefix.
973
+ sub[out++] = iter->prefix;
974
+ i += iter->nsub;
975
+ break;
976
+ default:
977
+ LOG(DFATAL) << "unknown round: " << round;
978
+ break;
979
+ }
980
+ // If we are done, copy until the end of sub.
981
+ if (++iter == splices.end()) {
982
+ while (i < nsub)
983
+ sub[out++] = sub[i++];
984
+ }
985
+ }
986
+ splices.clear();
987
+ nsub = out;
988
+ // Advance to the next round of factoring.
989
+ round++;
990
+ }
991
+
992
+ switch (round) {
993
+ case 1:
994
+ FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
995
+ break;
996
+ case 2:
997
+ FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
998
+ break;
999
+ case 3:
1000
+ FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1001
+ break;
1002
+ case 4:
1003
+ if (stk.size() == 1) {
1004
+ // We are at the top of the stack. Just return.
1005
+ return nsub;
1006
+ } else {
1007
+ // Pop the stack and set the number of suffixes.
1008
+ // (Note that references will be invalidated!)
1009
+ int nsuffix = nsub;
1010
+ stk.pop_back();
1011
+ stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1012
+ ++stk.back().spliceidx;
1013
+ continue;
1014
+ }
1015
+ default:
1016
+ LOG(DFATAL) << "unknown round: " << round;
1017
+ break;
1018
+ }
1019
+
1020
+ // Set spliceidx depending on whether we have Splices to factor.
1021
+ if (splices.empty() || round == 3) {
1022
+ spliceidx = static_cast<int>(splices.size());
1023
+ } else {
1024
+ spliceidx = 0;
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1030
+ Regexp::ParseFlags flags,
1031
+ std::vector<Splice>* splices) {
1032
+ // Round 1: Factor out common literal prefixes.
1033
+ int start = 0;
1034
+ Rune* rune = NULL;
1035
+ int nrune = 0;
1036
+ Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1037
+ for (int i = 0; i <= nsub; i++) {
1038
+ // Invariant: sub[start:i] consists of regexps that all
1039
+ // begin with rune[0:nrune].
1040
+ Rune* rune_i = NULL;
1041
+ int nrune_i = 0;
1042
+ Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1043
+ if (i < nsub) {
1044
+ rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1045
+ if (runeflags_i == runeflags) {
1046
+ int same = 0;
1047
+ while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1048
+ same++;
1049
+ if (same > 0) {
1050
+ // Matches at least one rune in current range. Keep going around.
1051
+ nrune = same;
1052
+ continue;
1053
+ }
1054
+ }
1055
+ }
1056
+
1057
+ // Found end of a run with common leading literal string:
1058
+ // sub[start:i] all begin with rune[0:nrune],
1059
+ // but sub[i] does not even begin with rune[0].
1060
+ if (i == start) {
1061
+ // Nothing to do - first iteration.
1062
+ } else if (i == start+1) {
1063
+ // Just one: don't bother factoring.
1064
+ } else {
1065
+ Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1066
+ for (int j = start; j < i; j++)
1067
+ Regexp::RemoveLeadingString(sub[j], nrune);
1068
+ splices->emplace_back(prefix, sub + start, i - start);
1069
+ }
1070
+
1071
+ // Prepare for next iteration (if there is one).
1072
+ if (i < nsub) {
1073
+ start = i;
1074
+ rune = rune_i;
1075
+ nrune = nrune_i;
1076
+ runeflags = runeflags_i;
1077
+ }
1078
+ }
1079
+ }
1080
+
1081
+ void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1082
+ Regexp::ParseFlags flags,
1083
+ std::vector<Splice>* splices) {
1084
+ // Round 2: Factor out common simple prefixes,
1085
+ // just the first piece of each concatenation.
1086
+ // This will be good enough a lot of the time.
1087
+ //
1088
+ // Complex subexpressions (e.g. involving quantifiers)
1089
+ // are not safe to factor because that collapses their
1090
+ // distinct paths through the automaton, which affects
1091
+ // correctness in some cases.
1092
+ int start = 0;
1093
+ Regexp* first = NULL;
1094
+ for (int i = 0; i <= nsub; i++) {
1095
+ // Invariant: sub[start:i] consists of regexps that all
1096
+ // begin with first.
1097
+ Regexp* first_i = NULL;
1098
+ if (i < nsub) {
1099
+ first_i = Regexp::LeadingRegexp(sub[i]);
1100
+ if (first != NULL &&
1101
+ // first must be an empty-width op
1102
+ // OR a char class, any char or any byte
1103
+ // OR a fixed repeat of a literal, char class, any char or any byte.
1104
+ (first->op() == kRegexpBeginLine ||
1105
+ first->op() == kRegexpEndLine ||
1106
+ first->op() == kRegexpWordBoundary ||
1107
+ first->op() == kRegexpNoWordBoundary ||
1108
+ first->op() == kRegexpBeginText ||
1109
+ first->op() == kRegexpEndText ||
1110
+ first->op() == kRegexpCharClass ||
1111
+ first->op() == kRegexpAnyChar ||
1112
+ first->op() == kRegexpAnyByte ||
1113
+ (first->op() == kRegexpRepeat &&
1114
+ first->min() == first->max() &&
1115
+ (first->sub()[0]->op() == kRegexpLiteral ||
1116
+ first->sub()[0]->op() == kRegexpCharClass ||
1117
+ first->sub()[0]->op() == kRegexpAnyChar ||
1118
+ first->sub()[0]->op() == kRegexpAnyByte))) &&
1119
+ Regexp::Equal(first, first_i))
1120
+ continue;
1121
+ }
1122
+
1123
+ // Found end of a run with common leading regexp:
1124
+ // sub[start:i] all begin with first,
1125
+ // but sub[i] does not.
1126
+ if (i == start) {
1127
+ // Nothing to do - first iteration.
1128
+ } else if (i == start+1) {
1129
+ // Just one: don't bother factoring.
1130
+ } else {
1131
+ Regexp* prefix = first->Incref();
1132
+ for (int j = start; j < i; j++)
1133
+ sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1134
+ splices->emplace_back(prefix, sub + start, i - start);
1135
+ }
1136
+
1137
+ // Prepare for next iteration (if there is one).
1138
+ if (i < nsub) {
1139
+ start = i;
1140
+ first = first_i;
1141
+ }
1142
+ }
1143
+ }
1144
+
1145
+ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1146
+ Regexp::ParseFlags flags,
1147
+ std::vector<Splice>* splices) {
1148
+ // Round 3: Merge runs of literals and/or character classes.
1149
+ int start = 0;
1150
+ Regexp* first = NULL;
1151
+ for (int i = 0; i <= nsub; i++) {
1152
+ // Invariant: sub[start:i] consists of regexps that all
1153
+ // are either literals (i.e. runes) or character classes.
1154
+ Regexp* first_i = NULL;
1155
+ if (i < nsub) {
1156
+ first_i = sub[i];
1157
+ if (first != NULL &&
1158
+ (first->op() == kRegexpLiteral ||
1159
+ first->op() == kRegexpCharClass) &&
1160
+ (first_i->op() == kRegexpLiteral ||
1161
+ first_i->op() == kRegexpCharClass))
1162
+ continue;
1163
+ }
1164
+
1165
+ // Found end of a run of Literal/CharClass:
1166
+ // sub[start:i] all are either one or the other,
1167
+ // but sub[i] is not.
1168
+ if (i == start) {
1169
+ // Nothing to do - first iteration.
1170
+ } else if (i == start+1) {
1171
+ // Just one: don't bother factoring.
1172
+ } else {
1173
+ CharClassBuilder ccb;
1174
+ for (int j = start; j < i; j++) {
1175
+ Regexp* re = sub[j];
1176
+ if (re->op() == kRegexpCharClass) {
1177
+ CharClass* cc = re->cc();
1178
+ for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1179
+ ccb.AddRange(it->lo, it->hi);
1180
+ } else if (re->op() == kRegexpLiteral) {
1181
+ ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1182
+ } else {
1183
+ LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1184
+ << re->ToString();
1185
+ }
1186
+ re->Decref();
1187
+ }
1188
+ Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
1189
+ splices->emplace_back(re, sub + start, i - start);
1190
+ }
1191
+
1192
+ // Prepare for next iteration (if there is one).
1193
+ if (i < nsub) {
1194
+ start = i;
1195
+ first = first_i;
1196
+ }
1197
+ }
1198
+ }
1199
+
1200
+ // Collapse the regexps on top of the stack, down to the
1201
+ // first marker, into a new op node (op == kRegexpAlternate
1202
+ // or op == kRegexpConcat).
1203
+ void Regexp::ParseState::DoCollapse(RegexpOp op) {
1204
+ // Scan backward to marker, counting children of composite.
1205
+ int n = 0;
1206
+ Regexp* next = NULL;
1207
+ Regexp* sub;
1208
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1209
+ next = sub->down_;
1210
+ if (sub->op_ == op)
1211
+ n += sub->nsub_;
1212
+ else
1213
+ n++;
1214
+ }
1215
+
1216
+ // If there's just one child, leave it alone.
1217
+ // (Concat of one thing is that one thing; alternate of one thing is same.)
1218
+ if (stacktop_ != NULL && stacktop_->down_ == next)
1219
+ return;
1220
+
1221
+ // Construct op (alternation or concatenation), flattening op of op.
1222
+ PODArray<Regexp*> subs(n);
1223
+ next = NULL;
1224
+ int i = n;
1225
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1226
+ next = sub->down_;
1227
+ if (sub->op_ == op) {
1228
+ Regexp** sub_subs = sub->sub();
1229
+ for (int k = sub->nsub_ - 1; k >= 0; k--)
1230
+ subs[--i] = sub_subs[k]->Incref();
1231
+ sub->Decref();
1232
+ } else {
1233
+ subs[--i] = FinishRegexp(sub);
1234
+ }
1235
+ }
1236
+
1237
+ Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1238
+ re->simple_ = re->ComputeSimple();
1239
+ re->down_ = next;
1240
+ stacktop_ = re;
1241
+ }
1242
+
1243
+ // Finishes the current concatenation,
1244
+ // collapsing it into a single regexp on the stack.
1245
+ void Regexp::ParseState::DoConcatenation() {
1246
+ Regexp* r1 = stacktop_;
1247
+ if (r1 == NULL || IsMarker(r1->op())) {
1248
+ // empty concatenation is special case
1249
+ Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
1250
+ PushRegexp(re);
1251
+ }
1252
+ DoCollapse(kRegexpConcat);
1253
+ }
1254
+
1255
+ // Finishes the current alternation,
1256
+ // collapsing it to a single regexp on the stack.
1257
+ void Regexp::ParseState::DoAlternation() {
1258
+ DoVerticalBar();
1259
+ // Now stack top is kVerticalBar.
1260
+ Regexp* r1 = stacktop_;
1261
+ stacktop_ = r1->down_;
1262
+ r1->Decref();
1263
+ DoCollapse(kRegexpAlternate);
1264
+ }
1265
+
1266
+ // Incremental conversion of concatenated literals into strings.
1267
+ // If top two elements on stack are both literal or string,
1268
+ // collapse into single string.
1269
+ // Don't walk down the stack -- the parser calls this frequently
1270
+ // enough that below the bottom two is known to be collapsed.
1271
+ // Only called when another regexp is about to be pushed
1272
+ // on the stack, so that the topmost literal is not being considered.
1273
+ // (Otherwise ab* would turn into (ab)*.)
1274
+ // If r >= 0, consider pushing a literal r on the stack.
1275
+ // Return whether that happened.
1276
+ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1277
+ Regexp* re1;
1278
+ Regexp* re2;
1279
+ if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
1280
+ return false;
1281
+
1282
+ if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1283
+ return false;
1284
+ if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1285
+ return false;
1286
+ if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1287
+ return false;
1288
+
1289
+ if (re2->op_ == kRegexpLiteral) {
1290
+ // convert into string
1291
+ Rune rune = re2->rune_;
1292
+ re2->op_ = kRegexpLiteralString;
1293
+ re2->nrunes_ = 0;
1294
+ re2->runes_ = NULL;
1295
+ re2->AddRuneToString(rune);
1296
+ }
1297
+
1298
+ // push re1 into re2.
1299
+ if (re1->op_ == kRegexpLiteral) {
1300
+ re2->AddRuneToString(re1->rune_);
1301
+ } else {
1302
+ for (int i = 0; i < re1->nrunes_; i++)
1303
+ re2->AddRuneToString(re1->runes_[i]);
1304
+ re1->nrunes_ = 0;
1305
+ delete[] re1->runes_;
1306
+ re1->runes_ = NULL;
1307
+ }
1308
+
1309
+ // reuse re1 if possible
1310
+ if (r >= 0) {
1311
+ re1->op_ = kRegexpLiteral;
1312
+ re1->rune_ = r;
1313
+ re1->parse_flags_ = static_cast<uint16_t>(flags);
1314
+ return true;
1315
+ }
1316
+
1317
+ stacktop_ = re2;
1318
+ re1->Decref();
1319
+ return false;
1320
+ }
1321
+
1322
+ // Lexing routines.
1323
+
1324
+ // Parses a decimal integer, storing it in *np.
1325
+ // Sets *s to span the remainder of the string.
1326
+ static bool ParseInteger(StringPiece* s, int* np) {
1327
+ if (s->empty() || !isdigit((*s)[0] & 0xFF))
1328
+ return false;
1329
+ // Disallow leading zeros.
1330
+ if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
1331
+ return false;
1332
+ int n = 0;
1333
+ int c;
1334
+ while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
1335
+ // Avoid overflow.
1336
+ if (n >= 100000000)
1337
+ return false;
1338
+ n = n*10 + c - '0';
1339
+ s->remove_prefix(1); // digit
1340
+ }
1341
+ *np = n;
1342
+ return true;
1343
+ }
1344
+
1345
+ // Parses a repetition suffix like {1,2} or {2} or {2,}.
1346
+ // Sets *s to span the remainder of the string on success.
1347
+ // Sets *lo and *hi to the given range.
1348
+ // In the case of {2,}, the high number is unbounded;
1349
+ // sets *hi to -1 to signify this.
1350
+ // {,2} is NOT a valid suffix.
1351
+ // The Maybe in the name signifies that the regexp parse
1352
+ // doesn't fail even if ParseRepetition does, so the StringPiece
1353
+ // s must NOT be edited unless MaybeParseRepetition returns true.
1354
+ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
1355
+ StringPiece s = *sp;
1356
+ if (s.empty() || s[0] != '{')
1357
+ return false;
1358
+ s.remove_prefix(1); // '{'
1359
+ if (!ParseInteger(&s, lo))
1360
+ return false;
1361
+ if (s.empty())
1362
+ return false;
1363
+ if (s[0] == ',') {
1364
+ s.remove_prefix(1); // ','
1365
+ if (s.empty())
1366
+ return false;
1367
+ if (s[0] == '}') {
1368
+ // {2,} means at least 2
1369
+ *hi = -1;
1370
+ } else {
1371
+ // {2,4} means 2, 3, or 4.
1372
+ if (!ParseInteger(&s, hi))
1373
+ return false;
1374
+ }
1375
+ } else {
1376
+ // {2} means exactly two
1377
+ *hi = *lo;
1378
+ }
1379
+ if (s.empty() || s[0] != '}')
1380
+ return false;
1381
+ s.remove_prefix(1); // '}'
1382
+ *sp = s;
1383
+ return true;
1384
+ }
1385
+
1386
+ // Removes the next Rune from the StringPiece and stores it in *r.
1387
+ // Returns number of bytes removed from sp.
1388
+ // Behaves as though there is a terminating NUL at the end of sp.
1389
+ // Argument order is backwards from usual Google style
1390
+ // but consistent with chartorune.
1391
+ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
1392
+ // fullrune() takes int, not size_t. However, it just looks
1393
+ // at the leading byte and treats any length >= 4 the same.
1394
+ if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
1395
+ int n = chartorune(r, sp->data());
1396
+ // Some copies of chartorune have a bug that accepts
1397
+ // encodings of values in (10FFFF, 1FFFFF] as valid.
1398
+ // Those values break the character class algorithm,
1399
+ // which assumes Runemax is the largest rune.
1400
+ if (*r > Runemax) {
1401
+ n = 1;
1402
+ *r = Runeerror;
1403
+ }
1404
+ if (!(n == 1 && *r == Runeerror)) { // no decoding error
1405
+ sp->remove_prefix(n);
1406
+ return n;
1407
+ }
1408
+ }
1409
+
1410
+ status->set_code(kRegexpBadUTF8);
1411
+ status->set_error_arg(StringPiece());
1412
+ return -1;
1413
+ }
1414
+
1415
+ // Return whether name is valid UTF-8.
1416
+ // If not, set status to kRegexpBadUTF8.
1417
+ static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
1418
+ StringPiece t = s;
1419
+ Rune r;
1420
+ while (!t.empty()) {
1421
+ if (StringPieceToRune(&r, &t, status) < 0)
1422
+ return false;
1423
+ }
1424
+ return true;
1425
+ }
1426
+
1427
// Returns nonzero if c is an ASCII hexadecimal digit (0-9, A-F or a-f),
// and zero otherwise.
static int IsHex(int c) {
  const bool is_decimal = c >= '0' && c <= '9';
  const bool is_upper = c >= 'A' && c <= 'F';
  const bool is_lower = c >= 'a' && c <= 'f';
  return is_decimal || is_upper || is_lower;
}
1433
+
1434
+ // Convert hex digit to value.
1435
+ static int UnHex(int c) {
1436
+ if ('0' <= c && c <= '9')
1437
+ return c - '0';
1438
+ if ('A' <= c && c <= 'F')
1439
+ return c - 'A' + 10;
1440
+ if ('a' <= c && c <= 'f')
1441
+ return c - 'a' + 10;
1442
+ LOG(DFATAL) << "Bad hex digit " << c;
1443
+ return 0;
1444
+ }
1445
+
1446
+ // Parse an escape sequence (e.g., \n, \{).
1447
+ // Sets *s to span the remainder of the string.
1448
+ // Sets *rp to the named character.
1449
+ static bool ParseEscape(StringPiece* s, Rune* rp,
1450
+ RegexpStatus* status, int rune_max) {
1451
+ const char* begin = s->data();
1452
+ if (s->empty() || (*s)[0] != '\\') {
1453
+ // Should not happen - caller always checks.
1454
+ status->set_code(kRegexpInternalError);
1455
+ status->set_error_arg(StringPiece());
1456
+ return false;
1457
+ }
1458
+ if (s->size() == 1) {
1459
+ status->set_code(kRegexpTrailingBackslash);
1460
+ status->set_error_arg(StringPiece());
1461
+ return false;
1462
+ }
1463
+ Rune c, c1;
1464
+ s->remove_prefix(1); // backslash
1465
+ if (StringPieceToRune(&c, s, status) < 0)
1466
+ return false;
1467
+ int code;
1468
+ switch (c) {
1469
+ default:
1470
+ if (c < Runeself && !isalpha(c) && !isdigit(c)) {
1471
+ // Escaped non-word characters are always themselves.
1472
+ // PCRE is not quite so rigorous: it accepts things like
1473
+ // \q, but we don't. We once rejected \_, but too many
1474
+ // programs and people insist on using it, so allow \_.
1475
+ *rp = c;
1476
+ return true;
1477
+ }
1478
+ goto BadEscape;
1479
+
1480
+ // Octal escapes.
1481
+ case '1':
1482
+ case '2':
1483
+ case '3':
1484
+ case '4':
1485
+ case '5':
1486
+ case '6':
1487
+ case '7':
1488
+ // Single non-zero octal digit is a backreference; not supported.
1489
+ if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
1490
+ goto BadEscape;
1491
+ FALLTHROUGH_INTENDED;
1492
+ case '0':
1493
+ // consume up to three octal digits; already have one.
1494
+ code = c - '0';
1495
+ if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
1496
+ code = code * 8 + c - '0';
1497
+ s->remove_prefix(1); // digit
1498
+ if (!s->empty()) {
1499
+ c = (*s)[0];
1500
+ if ('0' <= c && c <= '7') {
1501
+ code = code * 8 + c - '0';
1502
+ s->remove_prefix(1); // digit
1503
+ }
1504
+ }
1505
+ }
1506
+ if (code > rune_max)
1507
+ goto BadEscape;
1508
+ *rp = code;
1509
+ return true;
1510
+
1511
+ // Hexadecimal escapes
1512
+ case 'x':
1513
+ if (s->empty())
1514
+ goto BadEscape;
1515
+ if (StringPieceToRune(&c, s, status) < 0)
1516
+ return false;
1517
+ if (c == '{') {
1518
+ // Any number of digits in braces.
1519
+ // Update n as we consume the string, so that
1520
+ // the whole thing gets shown in the error message.
1521
+ // Perl accepts any text at all; it ignores all text
1522
+ // after the first non-hex digit. We require only hex digits,
1523
+ // and at least one.
1524
+ if (StringPieceToRune(&c, s, status) < 0)
1525
+ return false;
1526
+ int nhex = 0;
1527
+ code = 0;
1528
+ while (IsHex(c)) {
1529
+ nhex++;
1530
+ code = code * 16 + UnHex(c);
1531
+ if (code > rune_max)
1532
+ goto BadEscape;
1533
+ if (s->empty())
1534
+ goto BadEscape;
1535
+ if (StringPieceToRune(&c, s, status) < 0)
1536
+ return false;
1537
+ }
1538
+ if (c != '}' || nhex == 0)
1539
+ goto BadEscape;
1540
+ *rp = code;
1541
+ return true;
1542
+ }
1543
+ // Easy case: two hex digits.
1544
+ if (s->empty())
1545
+ goto BadEscape;
1546
+ if (StringPieceToRune(&c1, s, status) < 0)
1547
+ return false;
1548
+ if (!IsHex(c) || !IsHex(c1))
1549
+ goto BadEscape;
1550
+ *rp = UnHex(c) * 16 + UnHex(c1);
1551
+ return true;
1552
+
1553
+ // C escapes.
1554
+ case 'n':
1555
+ *rp = '\n';
1556
+ return true;
1557
+ case 'r':
1558
+ *rp = '\r';
1559
+ return true;
1560
+ case 't':
1561
+ *rp = '\t';
1562
+ return true;
1563
+
1564
+ // Less common C escapes.
1565
+ case 'a':
1566
+ *rp = '\a';
1567
+ return true;
1568
+ case 'f':
1569
+ *rp = '\f';
1570
+ return true;
1571
+ case 'v':
1572
+ *rp = '\v';
1573
+ return true;
1574
+
1575
+ // This code is disabled to avoid misparsing
1576
+ // the Perl word-boundary \b as a backspace
1577
+ // when in POSIX regexp mode. Surprisingly,
1578
+ // in Perl, \b means word-boundary but [\b]
1579
+ // means backspace. We don't support that:
1580
+ // if you want a backspace embed a literal
1581
+ // backspace character or use \x08.
1582
+ //
1583
+ // case 'b':
1584
+ // *rp = '\b';
1585
+ // return true;
1586
+ }
1587
+
1588
+ LOG(DFATAL) << "Not reached in ParseEscape.";
1589
+
1590
+ BadEscape:
1591
+ // Unrecognized escape sequence.
1592
+ status->set_code(kRegexpBadEscape);
1593
+ status->set_error_arg(
1594
+ StringPiece(begin, static_cast<size_t>(s->data() - begin)));
1595
+ return false;
1596
+ }
1597
+
1598
+ // Add a range to the character class, but exclude newline if asked.
1599
+ // Also handle case folding.
1600
+ void CharClassBuilder::AddRangeFlags(
1601
+ Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1602
+
1603
+ // Take out \n if the flags say so.
1604
+ bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1605
+ (parse_flags & Regexp::NeverNL);
1606
+ if (cutnl && lo <= '\n' && '\n' <= hi) {
1607
+ if (lo < '\n')
1608
+ AddRangeFlags(lo, '\n' - 1, parse_flags);
1609
+ if (hi > '\n')
1610
+ AddRangeFlags('\n' + 1, hi, parse_flags);
1611
+ return;
1612
+ }
1613
+
1614
+ // If folding case, add fold-equivalent characters too.
1615
+ if (parse_flags & Regexp::FoldCase)
1616
+ AddFoldedRange(this, lo, hi, 0);
1617
+ else
1618
+ AddRange(lo, hi);
1619
+ }
1620
+
1621
+ // Look for a group with the given name.
1622
+ static const UGroup* LookupGroup(const StringPiece& name,
1623
+ const UGroup *groups, int ngroups) {
1624
+ // Simple name lookup.
1625
+ for (int i = 0; i < ngroups; i++)
1626
+ if (StringPiece(groups[i].name) == name)
1627
+ return &groups[i];
1628
+ return NULL;
1629
+ }
1630
+
1631
+ // Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1632
+ static const UGroup* LookupPosixGroup(const StringPiece& name) {
1633
+ return LookupGroup(name, posix_groups, num_posix_groups);
1634
+ }
1635
+
1636
+ static const UGroup* LookupPerlGroup(const StringPiece& name) {
1637
+ return LookupGroup(name, perl_groups, num_perl_groups);
1638
+ }
1639
+
1640
+ #if !defined(RE2_USE_ICU)
1641
+ // Fake UGroup containing all Runes
1642
+ static URange16 any16[] = { { 0, 65535 } };
1643
+ static URange32 any32[] = { { 65536, Runemax } };
1644
+ static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
1645
+
1646
+ // Look for a Unicode group with the given name (e.g., "Han")
1647
+ static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
1648
+ // Special case: "Any" means any.
1649
+ if (name == StringPiece("Any"))
1650
+ return &anygroup;
1651
+ return LookupGroup(name, unicode_groups, num_unicode_groups);
1652
+ }
1653
+ #endif
1654
+
1655
+ // Add a UGroup or its negation to the character class.
1656
+ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
1657
+ Regexp::ParseFlags parse_flags) {
1658
+ if (sign == +1) {
1659
+ for (int i = 0; i < g->nr16; i++) {
1660
+ cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1661
+ }
1662
+ for (int i = 0; i < g->nr32; i++) {
1663
+ cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1664
+ }
1665
+ } else {
1666
+ if (parse_flags & Regexp::FoldCase) {
1667
+ // Normally adding a case-folded group means
1668
+ // adding all the extra fold-equivalent runes too.
1669
+ // But if we're adding the negation of the group,
1670
+ // we have to exclude all the runes that are fold-equivalent
1671
+ // to what's already missing. Too hard, so do in two steps.
1672
+ CharClassBuilder ccb1;
1673
+ AddUGroup(&ccb1, g, +1, parse_flags);
1674
+ // If the flags say to take out \n, put it in, so that negating will take it out.
1675
+ // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1676
+ bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1677
+ (parse_flags & Regexp::NeverNL);
1678
+ if (cutnl) {
1679
+ ccb1.AddRange('\n', '\n');
1680
+ }
1681
+ ccb1.Negate();
1682
+ cc->AddCharClass(&ccb1);
1683
+ return;
1684
+ }
1685
+ int next = 0;
1686
+ for (int i = 0; i < g->nr16; i++) {
1687
+ if (next < g->r16[i].lo)
1688
+ cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
1689
+ next = g->r16[i].hi + 1;
1690
+ }
1691
+ for (int i = 0; i < g->nr32; i++) {
1692
+ if (next < g->r32[i].lo)
1693
+ cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
1694
+ next = g->r32[i].hi + 1;
1695
+ }
1696
+ if (next <= Runemax)
1697
+ cc->AddRangeFlags(next, Runemax, parse_flags);
1698
+ }
1699
+ }
1700
+
1701
+ // Maybe parse a Perl character class escape sequence.
1702
+ // Only recognizes the Perl character classes (\d \s \w \D \S \W),
1703
+ // not the Perl empty-string classes (\b \B \A \Z \z).
1704
+ // On success, sets *s to span the remainder of the string
1705
+ // and returns the corresponding UGroup.
1706
+ // The StringPiece must *NOT* be edited unless the call succeeds.
1707
+ const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
1708
+ if (!(parse_flags & Regexp::PerlClasses))
1709
+ return NULL;
1710
+ if (s->size() < 2 || (*s)[0] != '\\')
1711
+ return NULL;
1712
+ // Could use StringPieceToRune, but there aren't
1713
+ // any non-ASCII Perl group names.
1714
+ StringPiece name(s->data(), 2);
1715
+ const UGroup *g = LookupPerlGroup(name);
1716
+ if (g == NULL)
1717
+ return NULL;
1718
+ s->remove_prefix(name.size());
1719
+ return g;
1720
+ }
1721
+
1722
// Outcome of one of the "maybe parse" helpers for character classes.
enum ParseStatus {
  // Did some parsing.
  kParseOk,
  // Found an error.
  kParseError,
  // Decided not to parse.
  kParseNothing,
};
1727
+
1728
+ // Maybe parses a Unicode character group like \p{Han} or \P{Han}
1729
+ // (the latter is a negated group).
1730
+ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
1731
+ CharClassBuilder *cc,
1732
+ RegexpStatus* status) {
1733
+ // Decide whether to parse.
1734
+ if (!(parse_flags & Regexp::UnicodeGroups))
1735
+ return kParseNothing;
1736
+ if (s->size() < 2 || (*s)[0] != '\\')
1737
+ return kParseNothing;
1738
+ Rune c = (*s)[1];
1739
+ if (c != 'p' && c != 'P')
1740
+ return kParseNothing;
1741
+
1742
+ // Committed to parse. Results:
1743
+ int sign = +1; // -1 = negated char class
1744
+ if (c == 'P')
1745
+ sign = -sign;
1746
+ StringPiece seq = *s; // \p{Han} or \pL
1747
+ StringPiece name; // Han or L
1748
+ s->remove_prefix(2); // '\\', 'p'
1749
+
1750
+ if (!StringPieceToRune(&c, s, status))
1751
+ return kParseError;
1752
+ if (c != '{') {
1753
+ // Name is the bit of string we just skipped over for c.
1754
+ const char* p = seq.data() + 2;
1755
+ name = StringPiece(p, static_cast<size_t>(s->data() - p));
1756
+ } else {
1757
+ // Name is in braces. Look for closing }
1758
+ size_t end = s->find('}', 0);
1759
+ if (end == StringPiece::npos) {
1760
+ if (!IsValidUTF8(seq, status))
1761
+ return kParseError;
1762
+ status->set_code(kRegexpBadCharRange);
1763
+ status->set_error_arg(seq);
1764
+ return kParseError;
1765
+ }
1766
+ name = StringPiece(s->data(), end); // without '}'
1767
+ s->remove_prefix(end + 1); // with '}'
1768
+ if (!IsValidUTF8(name, status))
1769
+ return kParseError;
1770
+ }
1771
+
1772
+ // Chop seq where s now begins.
1773
+ seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data()));
1774
+
1775
+ if (!name.empty() && name[0] == '^') {
1776
+ sign = -sign;
1777
+ name.remove_prefix(1); // '^'
1778
+ }
1779
+
1780
+ #if !defined(RE2_USE_ICU)
1781
+ // Look up the group in the RE2 Unicode data.
1782
+ const UGroup *g = LookupUnicodeGroup(name);
1783
+ if (g == NULL) {
1784
+ status->set_code(kRegexpBadCharRange);
1785
+ status->set_error_arg(seq);
1786
+ return kParseError;
1787
+ }
1788
+
1789
+ AddUGroup(cc, g, sign, parse_flags);
1790
+ #else
1791
+ // Look up the group in the ICU Unicode data. Because ICU provides full
1792
+ // Unicode properties support, this could be more than a lookup by name.
1793
+ ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1794
+ std::string("\\p{") + std::string(name) + std::string("}"));
1795
+ UErrorCode uerr = U_ZERO_ERROR;
1796
+ ::icu::UnicodeSet uset(ustr, uerr);
1797
+ if (U_FAILURE(uerr)) {
1798
+ status->set_code(kRegexpBadCharRange);
1799
+ status->set_error_arg(seq);
1800
+ return kParseError;
1801
+ }
1802
+
1803
+ // Convert the UnicodeSet to a URange32 and UGroup that we can add.
1804
+ int nr = uset.getRangeCount();
1805
+ PODArray<URange32> r(nr);
1806
+ for (int i = 0; i < nr; i++) {
1807
+ r[i].lo = uset.getRangeStart(i);
1808
+ r[i].hi = uset.getRangeEnd(i);
1809
+ }
1810
+ UGroup g = {"", +1, 0, 0, r.data(), nr};
1811
+ AddUGroup(cc, &g, sign, parse_flags);
1812
+ #endif
1813
+
1814
+ return kParseOk;
1815
+ }
1816
+
1817
+ // Parses a character class name like [:alnum:].
1818
+ // Sets *s to span the remainder of the string.
1819
+ // Adds the ranges corresponding to the class to ranges.
1820
+ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
1821
+ CharClassBuilder *cc,
1822
+ RegexpStatus* status) {
1823
+ // Check begins with [:
1824
+ const char* p = s->data();
1825
+ const char* ep = s->data() + s->size();
1826
+ if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1827
+ return kParseNothing;
1828
+
1829
+ // Look for closing :].
1830
+ const char* q;
1831
+ for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
1832
+ ;
1833
+
1834
+ // If no closing :], then ignore.
1835
+ if (q > ep-2)
1836
+ return kParseNothing;
1837
+
1838
+ // Got it. Check that it's valid.
1839
+ q += 2;
1840
+ StringPiece name(p, static_cast<size_t>(q - p));
1841
+
1842
+ const UGroup *g = LookupPosixGroup(name);
1843
+ if (g == NULL) {
1844
+ status->set_code(kRegexpBadCharRange);
1845
+ status->set_error_arg(name);
1846
+ return kParseError;
1847
+ }
1848
+
1849
+ s->remove_prefix(name.size());
1850
+ AddUGroup(cc, g, g->sign, parse_flags);
1851
+ return kParseOk;
1852
+ }
1853
+
1854
+ // Parses a character inside a character class.
1855
+ // There are fewer special characters here than in the rest of the regexp.
1856
+ // Sets *s to span the remainder of the string.
1857
+ // Sets *rp to the character.
1858
+ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
1859
+ const StringPiece& whole_class,
1860
+ RegexpStatus* status) {
1861
+ if (s->empty()) {
1862
+ status->set_code(kRegexpMissingBracket);
1863
+ status->set_error_arg(whole_class);
1864
+ return false;
1865
+ }
1866
+
1867
+ // Allow regular escape sequences even though
1868
+ // many need not be escaped in this context.
1869
+ if ((*s)[0] == '\\')
1870
+ return ParseEscape(s, rp, status, rune_max_);
1871
+
1872
+ // Otherwise take the next rune.
1873
+ return StringPieceToRune(rp, s, status) >= 0;
1874
+ }
1875
+
1876
+ // Parses a character class character, or, if the character
1877
+ // is followed by a hyphen, parses a character class range.
1878
+ // For single characters, rr->lo == rr->hi.
1879
+ // Sets *s to span the remainder of the string.
1880
+ // Sets *rp to the character.
1881
+ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
1882
+ const StringPiece& whole_class,
1883
+ RegexpStatus* status) {
1884
+ StringPiece os = *s;
1885
+ if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1886
+ return false;
1887
+ // [a-] means (a|-), so check for final ].
1888
+ if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
1889
+ s->remove_prefix(1); // '-'
1890
+ if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1891
+ return false;
1892
+ if (rr->hi < rr->lo) {
1893
+ status->set_code(kRegexpBadCharRange);
1894
+ status->set_error_arg(
1895
+ StringPiece(os.data(), static_cast<size_t>(s->data() - os.data())));
1896
+ return false;
1897
+ }
1898
+ } else {
1899
+ rr->hi = rr->lo;
1900
+ }
1901
+ return true;
1902
+ }
1903
+
1904
+ // Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1905
+ // Sets *s to span the remainder of the string.
1906
+ // Sets *out_re to the regexp for the class.
1907
+ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
1908
+ Regexp** out_re,
1909
+ RegexpStatus* status) {
1910
+ StringPiece whole_class = *s;
1911
+ if (s->empty() || (*s)[0] != '[') {
1912
+ // Caller checked this.
1913
+ status->set_code(kRegexpInternalError);
1914
+ status->set_error_arg(StringPiece());
1915
+ return false;
1916
+ }
1917
+ bool negated = false;
1918
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1919
+ re->ccb_ = new CharClassBuilder;
1920
+ s->remove_prefix(1); // '['
1921
+ if (!s->empty() && (*s)[0] == '^') {
1922
+ s->remove_prefix(1); // '^'
1923
+ negated = true;
1924
+ if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1925
+ // If NL can't match implicitly, then pretend
1926
+ // negated classes include a leading \n.
1927
+ re->ccb_->AddRange('\n', '\n');
1928
+ }
1929
+ }
1930
+ bool first = true; // ] is okay as first char in class
1931
+ while (!s->empty() && ((*s)[0] != ']' || first)) {
1932
+ // - is only okay unescaped as first or last in class.
1933
+ // Except that Perl allows - anywhere.
1934
+ if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1935
+ (s->size() == 1 || (*s)[1] != ']')) {
1936
+ StringPiece t = *s;
1937
+ t.remove_prefix(1); // '-'
1938
+ Rune r;
1939
+ int n = StringPieceToRune(&r, &t, status);
1940
+ if (n < 0) {
1941
+ re->Decref();
1942
+ return false;
1943
+ }
1944
+ status->set_code(kRegexpBadCharRange);
1945
+ status->set_error_arg(StringPiece(s->data(), 1+n));
1946
+ re->Decref();
1947
+ return false;
1948
+ }
1949
+ first = false;
1950
+
1951
+ // Look for [:alnum:] etc.
1952
+ if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1953
+ switch (ParseCCName(s, flags_, re->ccb_, status)) {
1954
+ case kParseOk:
1955
+ continue;
1956
+ case kParseError:
1957
+ re->Decref();
1958
+ return false;
1959
+ case kParseNothing:
1960
+ break;
1961
+ }
1962
+ }
1963
+
1964
+ // Look for Unicode character group like \p{Han}
1965
+ if (s->size() > 2 &&
1966
+ (*s)[0] == '\\' &&
1967
+ ((*s)[1] == 'p' || (*s)[1] == 'P')) {
1968
+ switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1969
+ case kParseOk:
1970
+ continue;
1971
+ case kParseError:
1972
+ re->Decref();
1973
+ return false;
1974
+ case kParseNothing:
1975
+ break;
1976
+ }
1977
+ }
1978
+
1979
+ // Look for Perl character class symbols (extension).
1980
+ const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
1981
+ if (g != NULL) {
1982
+ AddUGroup(re->ccb_, g, g->sign, flags_);
1983
+ continue;
1984
+ }
1985
+
1986
+ // Otherwise assume single character or simple range.
1987
+ RuneRange rr;
1988
+ if (!ParseCCRange(s, &rr, whole_class, status)) {
1989
+ re->Decref();
1990
+ return false;
1991
+ }
1992
+ // AddRangeFlags is usually called in response to a class like
1993
+ // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1994
+ // Regexp::ClassNL is set. In an explicit range or singleton
1995
+ // like we just parsed, we do not filter \n out, so set ClassNL
1996
+ // in the flags.
1997
+ re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
1998
+ }
1999
+ if (s->empty()) {
2000
+ status->set_code(kRegexpMissingBracket);
2001
+ status->set_error_arg(whole_class);
2002
+ re->Decref();
2003
+ return false;
2004
+ }
2005
+ s->remove_prefix(1); // ']'
2006
+
2007
+ if (negated)
2008
+ re->ccb_->Negate();
2009
+
2010
+ *out_re = re;
2011
+ return true;
2012
+ }
2013
+
2014
+ // Is this a valid capture name? [A-Za-z0-9_]+
2015
+ // PCRE limits names to 32 bytes.
2016
+ // Python rejects names starting with digits.
2017
+ // We don't enforce either of those.
2018
+ static bool IsValidCaptureName(const StringPiece& name) {
2019
+ if (name.empty())
2020
+ return false;
2021
+ for (size_t i = 0; i < name.size(); i++) {
2022
+ int c = name[i];
2023
+ if (('0' <= c && c <= '9') ||
2024
+ ('a' <= c && c <= 'z') ||
2025
+ ('A' <= c && c <= 'Z') ||
2026
+ c == '_')
2027
+ continue;
2028
+ return false;
2029
+ }
2030
+ return true;
2031
+ }
2032
+
2033
+ // Parses a Perl flag setting or non-capturing group or both,
2034
+ // like (?i) or (?: or (?i:. Removes from s, updates parse state.
2035
+ // The caller must check that s begins with "(?".
2036
+ // Returns true on success. If the Perl flag is not
2037
+ // well-formed or not supported, sets status_ and returns false.
2038
+ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
2039
+ StringPiece t = *s;
2040
+
2041
+ // Caller is supposed to check this.
2042
+ if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
2043
+ LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2044
+ status_->set_code(kRegexpInternalError);
2045
+ return false;
2046
+ }
2047
+
2048
+ t.remove_prefix(2); // "(?"
2049
+
2050
+ // Check for named captures, first introduced in Python's regexp library.
2051
+ // As usual, there are three slightly different syntaxes:
2052
+ //
2053
+ // (?P<name>expr) the original, introduced by Python
2054
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
2055
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
2056
+ //
2057
+ // Perl 5.10 gave in and implemented the Python version too,
2058
+ // but they claim that the last two are the preferred forms.
2059
+ // PCRE and languages based on it (specifically, PHP and Ruby)
2060
+ // support all three as well. EcmaScript 4 uses only the Python form.
2061
+ //
2062
+ // In both the open source world (via Code Search) and the
2063
+ // Google source tree, (?P<expr>name) is the dominant form,
2064
+ // so that's the one we implement. One is enough.
2065
+ if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
2066
+ // Pull out name.
2067
+ size_t end = t.find('>', 2);
2068
+ if (end == StringPiece::npos) {
2069
+ if (!IsValidUTF8(*s, status_))
2070
+ return false;
2071
+ status_->set_code(kRegexpBadNamedCapture);
2072
+ status_->set_error_arg(*s);
2073
+ return false;
2074
+ }
2075
+
2076
+ // t is "P<name>...", t[end] == '>'
2077
+ StringPiece capture(t.data()-2, end+3); // "(?P<name>"
2078
+ StringPiece name(t.data()+2, end-2); // "name"
2079
+ if (!IsValidUTF8(name, status_))
2080
+ return false;
2081
+ if (!IsValidCaptureName(name)) {
2082
+ status_->set_code(kRegexpBadNamedCapture);
2083
+ status_->set_error_arg(capture);
2084
+ return false;
2085
+ }
2086
+
2087
+ if (!DoLeftParen(name)) {
2088
+ // DoLeftParen's failure set status_.
2089
+ return false;
2090
+ }
2091
+
2092
+ s->remove_prefix(
2093
+ static_cast<size_t>(capture.data() + capture.size() - s->data()));
2094
+ return true;
2095
+ }
2096
+
2097
+ bool negated = false;
2098
+ bool sawflags = false;
2099
+ int nflags = flags_;
2100
+ Rune c;
2101
+ for (bool done = false; !done; ) {
2102
+ if (t.empty())
2103
+ goto BadPerlOp;
2104
+ if (StringPieceToRune(&c, &t, status_) < 0)
2105
+ return false;
2106
+ switch (c) {
2107
+ default:
2108
+ goto BadPerlOp;
2109
+
2110
+ // Parse flags.
2111
+ case 'i':
2112
+ sawflags = true;
2113
+ if (negated)
2114
+ nflags &= ~FoldCase;
2115
+ else
2116
+ nflags |= FoldCase;
2117
+ break;
2118
+
2119
+ case 'm': // opposite of our OneLine
2120
+ sawflags = true;
2121
+ if (negated)
2122
+ nflags |= OneLine;
2123
+ else
2124
+ nflags &= ~OneLine;
2125
+ break;
2126
+
2127
+ case 's':
2128
+ sawflags = true;
2129
+ if (negated)
2130
+ nflags &= ~DotNL;
2131
+ else
2132
+ nflags |= DotNL;
2133
+ break;
2134
+
2135
+ case 'U':
2136
+ sawflags = true;
2137
+ if (negated)
2138
+ nflags &= ~NonGreedy;
2139
+ else
2140
+ nflags |= NonGreedy;
2141
+ break;
2142
+
2143
+ // Negation
2144
+ case '-':
2145
+ if (negated)
2146
+ goto BadPerlOp;
2147
+ negated = true;
2148
+ sawflags = false;
2149
+ break;
2150
+
2151
+ // Open new group.
2152
+ case ':':
2153
+ if (!DoLeftParenNoCapture()) {
2154
+ // DoLeftParenNoCapture's failure set status_.
2155
+ return false;
2156
+ }
2157
+ done = true;
2158
+ break;
2159
+
2160
+ // Finish flags.
2161
+ case ')':
2162
+ done = true;
2163
+ break;
2164
+ }
2165
+ }
2166
+
2167
+ if (negated && !sawflags)
2168
+ goto BadPerlOp;
2169
+
2170
+ flags_ = static_cast<Regexp::ParseFlags>(nflags);
2171
+ *s = t;
2172
+ return true;
2173
+
2174
+ BadPerlOp:
2175
+ status_->set_code(kRegexpBadPerlOp);
2176
+ status_->set_error_arg(
2177
+ StringPiece(s->data(), static_cast<size_t>(t.data() - s->data())));
2178
+ return false;
2179
+ }
2180
+
2181
+ // Converts latin1 (assumed to be encoded as Latin1 bytes)
2182
+ // into UTF8 encoding in string.
2183
+ // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2184
+ // deprecated and because it rejects code points 0x80-0x9F.
2185
+ void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
2186
+ char buf[UTFmax];
2187
+
2188
+ utf->clear();
2189
+ for (size_t i = 0; i < latin1.size(); i++) {
2190
+ Rune r = latin1[i] & 0xFF;
2191
+ int n = runetochar(buf, &r);
2192
+ utf->append(buf, n);
2193
+ }
2194
+ }
2195
+
2196
+ // Parses the regular expression given by s,
2197
+ // returning the corresponding Regexp tree.
2198
+ // The caller must Decref the return value when done with it.
2199
+ // Returns NULL on error.
2200
+ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
2201
+ RegexpStatus* status) {
2202
+ // Make status non-NULL (easier on everyone else).
2203
+ RegexpStatus xstatus;
2204
+ if (status == NULL)
2205
+ status = &xstatus;
2206
+
2207
+ ParseState ps(global_flags, s, status);
2208
+ StringPiece t = s;
2209
+
2210
+ // Convert regexp to UTF-8 (easier on the rest of the parser).
2211
+ if (global_flags & Latin1) {
2212
+ std::string* tmp = new std::string;
2213
+ ConvertLatin1ToUTF8(t, tmp);
2214
+ status->set_tmp(tmp);
2215
+ t = *tmp;
2216
+ }
2217
+
2218
+ if (global_flags & Literal) {
2219
+ // Special parse loop for literal string.
2220
+ while (!t.empty()) {
2221
+ Rune r;
2222
+ if (StringPieceToRune(&r, &t, status) < 0)
2223
+ return NULL;
2224
+ if (!ps.PushLiteral(r))
2225
+ return NULL;
2226
+ }
2227
+ return ps.DoFinish();
2228
+ }
2229
+
2230
+ StringPiece lastunary = StringPiece();
2231
+ while (!t.empty()) {
2232
+ StringPiece isunary = StringPiece();
2233
+ switch (t[0]) {
2234
+ default: {
2235
+ Rune r;
2236
+ if (StringPieceToRune(&r, &t, status) < 0)
2237
+ return NULL;
2238
+ if (!ps.PushLiteral(r))
2239
+ return NULL;
2240
+ break;
2241
+ }
2242
+
2243
+ case '(':
2244
+ // "(?" introduces Perl escape.
2245
+ if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
2246
+ // Flag changes and non-capturing groups.
2247
+ if (!ps.ParsePerlFlags(&t))
2248
+ return NULL;
2249
+ break;
2250
+ }
2251
+ if (ps.flags() & NeverCapture) {
2252
+ if (!ps.DoLeftParenNoCapture())
2253
+ return NULL;
2254
+ } else {
2255
+ if (!ps.DoLeftParen(StringPiece()))
2256
+ return NULL;
2257
+ }
2258
+ t.remove_prefix(1); // '('
2259
+ break;
2260
+
2261
+ case '|':
2262
+ if (!ps.DoVerticalBar())
2263
+ return NULL;
2264
+ t.remove_prefix(1); // '|'
2265
+ break;
2266
+
2267
+ case ')':
2268
+ if (!ps.DoRightParen())
2269
+ return NULL;
2270
+ t.remove_prefix(1); // ')'
2271
+ break;
2272
+
2273
+ case '^': // Beginning of line.
2274
+ if (!ps.PushCaret())
2275
+ return NULL;
2276
+ t.remove_prefix(1); // '^'
2277
+ break;
2278
+
2279
+ case '$': // End of line.
2280
+ if (!ps.PushDollar())
2281
+ return NULL;
2282
+ t.remove_prefix(1); // '$'
2283
+ break;
2284
+
2285
+ case '.': // Any character (possibly except newline).
2286
+ if (!ps.PushDot())
2287
+ return NULL;
2288
+ t.remove_prefix(1); // '.'
2289
+ break;
2290
+
2291
+ case '[': { // Character class.
2292
+ Regexp* re;
2293
+ if (!ps.ParseCharClass(&t, &re, status))
2294
+ return NULL;
2295
+ if (!ps.PushRegexp(re))
2296
+ return NULL;
2297
+ break;
2298
+ }
2299
+
2300
+ case '*': { // Zero or more.
2301
+ RegexpOp op;
2302
+ op = kRegexpStar;
2303
+ goto Rep;
2304
+ case '+': // One or more.
2305
+ op = kRegexpPlus;
2306
+ goto Rep;
2307
+ case '?': // Zero or one.
2308
+ op = kRegexpQuest;
2309
+ goto Rep;
2310
+ Rep:
2311
+ StringPiece opstr = t;
2312
+ bool nongreedy = false;
2313
+ t.remove_prefix(1); // '*' or '+' or '?'
2314
+ if (ps.flags() & PerlX) {
2315
+ if (!t.empty() && t[0] == '?') {
2316
+ nongreedy = true;
2317
+ t.remove_prefix(1); // '?'
2318
+ }
2319
+ if (!lastunary.empty()) {
2320
+ // In Perl it is not allowed to stack repetition operators:
2321
+ // a** is a syntax error, not a double-star.
2322
+ // (and a++ means something else entirely, which we don't support!)
2323
+ status->set_code(kRegexpRepeatOp);
2324
+ status->set_error_arg(StringPiece(
2325
+ lastunary.data(),
2326
+ static_cast<size_t>(t.data() - lastunary.data())));
2327
+ return NULL;
2328
+ }
2329
+ }
2330
+ opstr = StringPiece(opstr.data(),
2331
+ static_cast<size_t>(t.data() - opstr.data()));
2332
+ if (!ps.PushRepeatOp(op, opstr, nongreedy))
2333
+ return NULL;
2334
+ isunary = opstr;
2335
+ break;
2336
+ }
2337
+
2338
+ case '{': { // Counted repetition.
2339
+ int lo, hi;
2340
+ StringPiece opstr = t;
2341
+ if (!MaybeParseRepetition(&t, &lo, &hi)) {
2342
+ // Treat like a literal.
2343
+ if (!ps.PushLiteral('{'))
2344
+ return NULL;
2345
+ t.remove_prefix(1); // '{'
2346
+ break;
2347
+ }
2348
+ bool nongreedy = false;
2349
+ if (ps.flags() & PerlX) {
2350
+ if (!t.empty() && t[0] == '?') {
2351
+ nongreedy = true;
2352
+ t.remove_prefix(1); // '?'
2353
+ }
2354
+ if (!lastunary.empty()) {
2355
+ // Not allowed to stack repetition operators.
2356
+ status->set_code(kRegexpRepeatOp);
2357
+ status->set_error_arg(StringPiece(
2358
+ lastunary.data(),
2359
+ static_cast<size_t>(t.data() - lastunary.data())));
2360
+ return NULL;
2361
+ }
2362
+ }
2363
+ opstr = StringPiece(opstr.data(),
2364
+ static_cast<size_t>(t.data() - opstr.data()));
2365
+ if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2366
+ return NULL;
2367
+ isunary = opstr;
2368
+ break;
2369
+ }
2370
+
2371
+ case '\\': { // Escaped character or Perl sequence.
2372
+ // \b and \B: word boundary or not
2373
+ if ((ps.flags() & Regexp::PerlB) &&
2374
+ t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
2375
+ if (!ps.PushWordBoundary(t[1] == 'b'))
2376
+ return NULL;
2377
+ t.remove_prefix(2); // '\\', 'b'
2378
+ break;
2379
+ }
2380
+
2381
+ if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
2382
+ if (t[1] == 'A') {
2383
+ if (!ps.PushSimpleOp(kRegexpBeginText))
2384
+ return NULL;
2385
+ t.remove_prefix(2); // '\\', 'A'
2386
+ break;
2387
+ }
2388
+ if (t[1] == 'z') {
2389
+ if (!ps.PushSimpleOp(kRegexpEndText))
2390
+ return NULL;
2391
+ t.remove_prefix(2); // '\\', 'z'
2392
+ break;
2393
+ }
2394
+ // Do not recognize \Z, because this library can't
2395
+ // implement the exact Perl/PCRE semantics.
2396
+ // (This library treats "(?-m)$" as \z, even though
2397
+ // in Perl and PCRE it is equivalent to \Z.)
2398
+
2399
+ if (t[1] == 'C') { // \C: any byte [sic]
2400
+ if (!ps.PushSimpleOp(kRegexpAnyByte))
2401
+ return NULL;
2402
+ t.remove_prefix(2); // '\\', 'C'
2403
+ break;
2404
+ }
2405
+
2406
+ if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
2407
+ t.remove_prefix(2); // '\\', 'Q'
2408
+ while (!t.empty()) {
2409
+ if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
2410
+ t.remove_prefix(2); // '\\', 'E'
2411
+ break;
2412
+ }
2413
+ Rune r;
2414
+ if (StringPieceToRune(&r, &t, status) < 0)
2415
+ return NULL;
2416
+ if (!ps.PushLiteral(r))
2417
+ return NULL;
2418
+ }
2419
+ break;
2420
+ }
2421
+ }
2422
+
2423
+ if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
2424
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2425
+ re->ccb_ = new CharClassBuilder;
2426
+ switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2427
+ case kParseOk:
2428
+ if (!ps.PushRegexp(re))
2429
+ return NULL;
2430
+ goto Break2;
2431
+ case kParseError:
2432
+ re->Decref();
2433
+ return NULL;
2434
+ case kParseNothing:
2435
+ re->Decref();
2436
+ break;
2437
+ }
2438
+ }
2439
+
2440
+ const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
2441
+ if (g != NULL) {
2442
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2443
+ re->ccb_ = new CharClassBuilder;
2444
+ AddUGroup(re->ccb_, g, g->sign, ps.flags());
2445
+ if (!ps.PushRegexp(re))
2446
+ return NULL;
2447
+ break;
2448
+ }
2449
+
2450
+ Rune r;
2451
+ if (!ParseEscape(&t, &r, status, ps.rune_max()))
2452
+ return NULL;
2453
+ if (!ps.PushLiteral(r))
2454
+ return NULL;
2455
+ break;
2456
+ }
2457
+ }
2458
+ Break2:
2459
+ lastunary = isunary;
2460
+ }
2461
+ return ps.DoFinish();
2462
+ }
2463
+
2464
+ } // namespace re2