chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/parse.cc ADDED
@@ -0,0 +1,2202 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression parser.
6
+
7
+ // The parser is a simple precedence-based parser with a
8
+ // manual stack. The parsing work is done by the methods
9
+ // of the ParseState class. The Regexp::Parse function is
10
+ // essentially just a lexer that calls the ParseState method
11
+ // for each token.
12
+
13
+ // The parser recognizes POSIX extended regular expressions
14
+ // excluding backreferences, collating elements, and collating
15
+ // classes. It also allows the empty string as a regular expression
16
+ // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17
+ // See regexp.h for rationale.
18
+
19
+ #include "util/util.h"
20
+ #include "re2/regexp.h"
21
+ #include "re2/stringpiece.h"
22
+ #include "re2/unicode_casefold.h"
23
+ #include "re2/unicode_groups.h"
24
+
25
+ namespace re2 {
26
+
27
+ // Regular expression parse state.
28
+ // The list of parsed regexps so far is maintained as a vector of
29
+ // Regexp pointers called the stack. Left parenthesis and vertical
30
+ // bar markers are also placed on the stack, as Regexps with
31
+ // non-standard opcodes.
32
+ // Scanning a left parenthesis causes the parser to push a left parenthesis
33
+ // marker on the stack.
34
+ // Scanning a vertical bar causes the parser to pop the stack until it finds a
35
+ // vertical bar or left parenthesis marker (not popping the marker),
36
+ // concatenate all the popped results, and push them back on
37
+ // the stack (DoConcatenation).
38
+ // Scanning a right parenthesis causes the parser to act as though it
39
+ // has seen a vertical bar, which then leaves the top of the stack in the
40
+ // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
41
+ // The parser pops all this off the stack and creates an alternation of the
42
+ // regexps (DoAlternation).
43
+
44
+ class Regexp::ParseState {
45
+ public:
46
+ ParseState(ParseFlags flags, const StringPiece& whole_regexp,
47
+ RegexpStatus* status);
48
+ ~ParseState();
49
+
50
+ ParseFlags flags() { return flags_; }
51
+ int rune_max() { return rune_max_; }
52
+
53
+ // Parse methods. All public methods return a bool saying
54
+ // whether parsing should continue. If a method returns
55
+ // false, it has set fields in *status_, and the parser
56
+ // should return NULL.
57
+
58
+ // Pushes the given regular expression onto the stack.
59
+ // Could check for too much memory used here.
60
+ bool PushRegexp(Regexp* re);
61
+
62
+ // Pushes the literal rune r onto the stack.
63
+ bool PushLiteral(Rune r);
64
+
65
+ // Pushes a regexp with the given op (and no args) onto the stack.
66
+ bool PushSimpleOp(RegexpOp op);
67
+
68
+ // Pushes a ^ onto the stack.
69
+ bool PushCarat();
70
+
71
+ // Pushes a \b (word == true) or \B (word == false) onto the stack.
72
+ bool PushWordBoundary(bool word);
73
+
74
+ // Pushes a $ onto the stack.
75
+ bool PushDollar();
76
+
77
+ // Pushes a . onto the stack
78
+ bool PushDot();
79
+
80
+ // Pushes a repeat operator regexp onto the stack.
81
+ // A valid argument for the operator must already be on the stack.
82
+ // s is the name of the operator, for use in error messages.
83
+ bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
84
+
85
+ // Pushes a repetition regexp onto the stack.
86
+ // A valid argument for the operator must already be on the stack.
87
+ bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
88
+
89
+ // Checks whether a particular regexp op is a marker.
90
+ bool IsMarker(RegexpOp op);
91
+
92
+ // Processes a left parenthesis in the input.
93
+ // Pushes a marker onto the stack.
94
+ bool DoLeftParen(const StringPiece& name);
95
+ bool DoLeftParenNoCapture();
96
+
97
+ // Processes a vertical bar in the input.
98
+ bool DoVerticalBar();
99
+
100
+ // Processes a right parenthesis in the input.
101
+ bool DoRightParen();
102
+
103
+ // Processes the end of input, returning the final regexp.
104
+ Regexp* DoFinish();
105
+
106
+ // Finishes the regexp if necessary, preparing it for use
107
+ // in a more complicated expression.
108
+ // If it is a CharClassBuilder, converts into a CharClass.
109
+ Regexp* FinishRegexp(Regexp*);
110
+
111
+ // These routines don't manipulate the parse stack
112
+ // directly, but they do need to look at flags_.
113
+ // ParseCharClass also manipulates the internals of Regexp
114
+ // while creating *out_re.
115
+
116
+ // Parse a character class into *out_re.
117
+ // Removes parsed text from s.
118
+ bool ParseCharClass(StringPiece* s, Regexp** out_re,
119
+ RegexpStatus* status);
120
+
121
+ // Parse a character class character into *rp.
122
+ // Removes parsed text from s.
123
+ bool ParseCCCharacter(StringPiece* s, Rune *rp,
124
+ const StringPiece& whole_class,
125
+ RegexpStatus* status);
126
+
127
+ // Parse a character class range into rr.
128
+ // Removes parsed text from s.
129
+ bool ParseCCRange(StringPiece* s, RuneRange* rr,
130
+ const StringPiece& whole_class,
131
+ RegexpStatus* status);
132
+
133
+ // Parse a Perl flag set or non-capturing group from s.
134
+ bool ParsePerlFlags(StringPiece* s);
135
+
136
+
137
+ // Finishes the current concatenation,
138
+ // collapsing it into a single regexp on the stack.
139
+ void DoConcatenation();
140
+
141
+ // Finishes the current alternation,
142
+ // collapsing it to a single regexp on the stack.
143
+ void DoAlternation();
144
+
145
+ // Generalized DoAlternation/DoConcatenation.
146
+ void DoCollapse(RegexpOp op);
147
+
148
+ // Maybe concatenate Literals into LiteralString.
149
+ bool MaybeConcatString(int r, ParseFlags flags);
150
+
151
+ private:
152
+ ParseFlags flags_;
153
+ StringPiece whole_regexp_;
154
+ RegexpStatus* status_;
155
+ Regexp* stacktop_;
156
+ int ncap_; // number of capturing parens seen
157
+ int rune_max_; // maximum char value for this encoding
158
+
159
+ DISALLOW_EVIL_CONSTRUCTORS(ParseState);
160
+ };
161
+
162
+ // Pseudo-operators - only on parse stack.
163
+ const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
164
+ const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
165
+
166
+ Regexp::ParseState::ParseState(ParseFlags flags,
167
+ const StringPiece& whole_regexp,
168
+ RegexpStatus* status)
169
+ : flags_(flags), whole_regexp_(whole_regexp),
170
+ status_(status), stacktop_(NULL), ncap_(0) {
171
+ if (flags_ & Latin1)
172
+ rune_max_ = 0xFF;
173
+ else
174
+ rune_max_ = Runemax;
175
+ }
176
+
177
+ // Cleans up by freeing all the regexps on the stack.
178
+ Regexp::ParseState::~ParseState() {
179
+ Regexp* next;
180
+ for (Regexp* re = stacktop_; re != NULL; re = next) {
181
+ next = re->down_;
182
+ re->down_ = NULL;
183
+ if (re->op() == kLeftParen)
184
+ delete re->name_;
185
+ re->Decref();
186
+ }
187
+ }
188
+
189
+ // Finishes the regexp if necessary, preparing it for use in
190
+ // a more complex expression.
191
+ // If it is a CharClassBuilder, converts into a CharClass.
192
+ Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
193
+ if (re == NULL)
194
+ return NULL;
195
+ re->down_ = NULL;
196
+
197
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
198
+ CharClassBuilder* ccb = re->ccb_;
199
+ re->ccb_ = NULL;
200
+ re->cc_ = ccb->GetCharClass();
201
+ delete ccb;
202
+ }
203
+
204
+ return re;
205
+ }
206
+
207
+ // Pushes the given regular expression onto the stack.
208
+ // Could check for too much memory used here.
209
+ bool Regexp::ParseState::PushRegexp(Regexp* re) {
210
+ MaybeConcatString(-1, NoParseFlags);
211
+
212
+ // Special case: a character class of one character is just
213
+ // a literal. This is a common idiom for escaping
214
+ // single characters (e.g., [.] instead of \.), and some
215
+ // analysis does better with fewer character classes.
216
+ // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
217
+ if (re->op_ == kRegexpCharClass) {
218
+ if (re->ccb_->size() == 1) {
219
+ Rune r = re->ccb_->begin()->lo;
220
+ re->Decref();
221
+ re = new Regexp(kRegexpLiteral, flags_);
222
+ re->rune_ = r;
223
+ } else if (re->ccb_->size() == 2) {
224
+ Rune r = re->ccb_->begin()->lo;
225
+ if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
226
+ re->Decref();
227
+ re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
228
+ re->rune_ = r + 'a' - 'A';
229
+ }
230
+ }
231
+ }
232
+
233
+ if (!IsMarker(re->op()))
234
+ re->simple_ = re->ComputeSimple();
235
+ re->down_ = stacktop_;
236
+ stacktop_ = re;
237
+ return true;
238
+ }
239
+
240
+ // Searches the case folding tables and returns the CaseFold* that contains r.
241
+ // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
242
+ // If there isn't one, returns NULL.
243
+ CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {
244
+ CaseFold* ef = f + n;
245
+
246
+ // Binary search for entry containing r.
247
+ while (n > 0) {
248
+ int m = n/2;
249
+ if (f[m].lo <= r && r <= f[m].hi)
250
+ return &f[m];
251
+ if (r < f[m].lo) {
252
+ n = m;
253
+ } else {
254
+ f += m+1;
255
+ n -= m+1;
256
+ }
257
+ }
258
+
259
+ // There is no entry that contains r, but f points
260
+ // where it would have been. Unless f points at
261
+ // the end of the array, it points at the next entry
262
+ // after r.
263
+ if (f < ef)
264
+ return f;
265
+
266
+ // No entry contains r; no entry contains runes > r.
267
+ return NULL;
268
+ }
269
+
270
+ // Returns the result of applying the fold f to the rune r.
271
+ Rune ApplyFold(CaseFold *f, Rune r) {
272
+ switch (f->delta) {
273
+ default:
274
+ return r + f->delta;
275
+
276
+ case EvenOddSkip: // even <-> odd but only applies to every other
277
+ if ((r - f->lo) % 2)
278
+ return r;
279
+ // fall through
280
+ case EvenOdd: // even <-> odd
281
+ if (r%2 == 0)
282
+ return r + 1;
283
+ return r - 1;
284
+
285
+ case OddEvenSkip: // odd <-> even but only applies to every other
286
+ if ((r - f->lo) % 2)
287
+ return r;
288
+ // fall through
289
+ case OddEven: // odd <-> even
290
+ if (r%2 == 1)
291
+ return r + 1;
292
+ return r - 1;
293
+ }
294
+ }
295
+
296
+ // Returns the next Rune in r's folding cycle (see unicode_casefold.h).
297
+ // Examples:
298
+ // CycleFoldRune('A') = 'a'
299
+ // CycleFoldRune('a') = 'A'
300
+ //
301
+ // CycleFoldRune('K') = 'k'
302
+ // CycleFoldRune('k') = 0x212A (Kelvin)
303
+ // CycleFoldRune(0x212A) = 'K'
304
+ //
305
+ // CycleFoldRune('?') = '?'
306
+ Rune CycleFoldRune(Rune r) {
307
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
308
+ if (f == NULL || r < f->lo)
309
+ return r;
310
+ return ApplyFold(f, r);
311
+ }
312
+
313
+ // Add lo-hi to the class, along with their fold-equivalent characters.
314
+ // If lo-hi is already in the class, assume that the fold-equivalent
315
+ // chars are there too, so there's no work to do.
316
+ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
317
+ // AddFoldedRange calls itself recursively for each rune in the fold cycle.
318
+ // Most folding cycles are small: there aren't any bigger than four in the
319
+ // current Unicode tables. make_unicode_casefold.py checks that
320
+ // the cycles are not too long, and we double-check here using depth.
321
+ if (depth > 10) {
322
+ LOG(DFATAL) << "AddFoldedRange recurses too much.";
323
+ return;
324
+ }
325
+
326
+ if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
327
+ return;
328
+
329
+ while (lo <= hi) {
330
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
331
+ if (f == NULL) // lo has no fold, nor does anything above lo
332
+ break;
333
+ if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
334
+ lo = f->lo;
335
+ continue;
336
+ }
337
+
338
+ // Add in the result of folding the range lo - f->hi
339
+ // and that range's fold, recursively.
340
+ Rune lo1 = lo;
341
+ Rune hi1 = min<Rune>(hi, f->hi);
342
+ switch (f->delta) {
343
+ default:
344
+ lo1 += f->delta;
345
+ hi1 += f->delta;
346
+ break;
347
+ case EvenOdd:
348
+ if (lo1%2 == 1)
349
+ lo1--;
350
+ if (hi1%2 == 0)
351
+ hi1++;
352
+ break;
353
+ case OddEven:
354
+ if (lo1%2 == 0)
355
+ lo1--;
356
+ if (hi1%2 == 1)
357
+ hi1++;
358
+ break;
359
+ }
360
+ AddFoldedRange(cc, lo1, hi1, depth+1);
361
+
362
+ // Pick up where this fold left off.
363
+ lo = f->hi + 1;
364
+ }
365
+ }
366
+
367
+ // Pushes the literal rune r onto the stack.
368
+ bool Regexp::ParseState::PushLiteral(Rune r) {
369
+ // Do case folding if needed.
370
+ if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
371
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
372
+ re->ccb_ = new CharClassBuilder;
373
+ Rune r1 = r;
374
+ do {
375
+ if (!(flags_ & NeverNL) || r != '\n') {
376
+ re->ccb_->AddRange(r, r);
377
+ }
378
+ r = CycleFoldRune(r);
379
+ } while (r != r1);
380
+ re->ccb_->RemoveAbove(rune_max_);
381
+ return PushRegexp(re);
382
+ }
383
+
384
+ // Exclude newline if applicable.
385
+ if ((flags_ & NeverNL) && r == '\n')
386
+ return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
387
+
388
+ // No fancy stuff worked. Ordinary literal.
389
+ if (MaybeConcatString(r, flags_))
390
+ return true;
391
+
392
+ Regexp* re = new Regexp(kRegexpLiteral, flags_);
393
+ re->rune_ = r;
394
+ return PushRegexp(re);
395
+ }
396
+
397
+ // Pushes a ^ onto the stack.
398
+ bool Regexp::ParseState::PushCarat() {
399
+ if (flags_ & OneLine) {
400
+ return PushSimpleOp(kRegexpBeginText);
401
+ }
402
+ return PushSimpleOp(kRegexpBeginLine);
403
+ }
404
+
405
+ // Pushes a \b or \B onto the stack.
406
+ bool Regexp::ParseState::PushWordBoundary(bool word) {
407
+ if (word)
408
+ return PushSimpleOp(kRegexpWordBoundary);
409
+ return PushSimpleOp(kRegexpNoWordBoundary);
410
+ }
411
+
412
+ // Pushes a $ onto the stack.
413
+ bool Regexp::ParseState::PushDollar() {
414
+ if (flags_ & OneLine) {
415
+ // Clumsy marker so that MimicsPCRE() can tell whether
416
+ // this kRegexpEndText was a $ and not a \z.
417
+ Regexp::ParseFlags oflags = flags_;
418
+ flags_ = flags_ | WasDollar;
419
+ bool ret = PushSimpleOp(kRegexpEndText);
420
+ flags_ = oflags;
421
+ return ret;
422
+ }
423
+ return PushSimpleOp(kRegexpEndLine);
424
+ }
425
+
426
+ // Pushes a . onto the stack.
427
+ bool Regexp::ParseState::PushDot() {
428
+ if ((flags_ & DotNL) && !(flags_ & NeverNL))
429
+ return PushSimpleOp(kRegexpAnyChar);
430
+ // Rewrite . into [^\n]
431
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
432
+ re->ccb_ = new CharClassBuilder;
433
+ re->ccb_->AddRange(0, '\n' - 1);
434
+ re->ccb_->AddRange('\n' + 1, rune_max_);
435
+ return PushRegexp(re);
436
+ }
437
+
438
+ // Pushes a regexp with the given op (and no args) onto the stack.
439
+ bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
440
+ Regexp* re = new Regexp(op, flags_);
441
+ return PushRegexp(re);
442
+ }
443
+
444
+ // Pushes a repeat operator regexp onto the stack.
445
+ // A valid argument for the operator must already be on the stack.
446
+ // The char c is the name of the operator, for use in error messages.
447
+ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
448
+ bool nongreedy) {
449
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
450
+ status_->set_code(kRegexpRepeatArgument);
451
+ status_->set_error_arg(s);
452
+ return false;
453
+ }
454
+ Regexp::ParseFlags fl = flags_;
455
+ if (nongreedy)
456
+ fl = fl ^ NonGreedy;
457
+ Regexp* re = new Regexp(op, fl);
458
+ re->AllocSub(1);
459
+ re->down_ = stacktop_->down_;
460
+ re->sub()[0] = FinishRegexp(stacktop_);
461
+ re->simple_ = re->ComputeSimple();
462
+ stacktop_ = re;
463
+ return true;
464
+ }
465
+
466
+ // Pushes a repetition regexp onto the stack.
467
+ // A valid argument for the operator must already be on the stack.
468
+ bool Regexp::ParseState::PushRepetition(int min, int max,
469
+ const StringPiece& s,
470
+ bool nongreedy) {
471
+ if ((max != -1 && max < min) || min > 1000 || max > 1000) {
472
+ status_->set_code(kRegexpRepeatSize);
473
+ status_->set_error_arg(s);
474
+ return false;
475
+ }
476
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
477
+ status_->set_code(kRegexpRepeatArgument);
478
+ status_->set_error_arg(s);
479
+ return false;
480
+ }
481
+ Regexp::ParseFlags fl = flags_;
482
+ if (nongreedy)
483
+ fl = fl ^ NonGreedy;
484
+ Regexp* re = new Regexp(kRegexpRepeat, fl);
485
+ re->min_ = min;
486
+ re->max_ = max;
487
+ re->AllocSub(1);
488
+ re->down_ = stacktop_->down_;
489
+ re->sub()[0] = FinishRegexp(stacktop_);
490
+ re->simple_ = re->ComputeSimple();
491
+
492
+ stacktop_ = re;
493
+ return true;
494
+ }
495
+
496
+ // Checks whether a particular regexp op is a marker.
497
+ bool Regexp::ParseState::IsMarker(RegexpOp op) {
498
+ return op >= kLeftParen;
499
+ }
500
+
501
+ // Processes a left parenthesis in the input.
502
+ // Pushes a marker onto the stack.
503
+ bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
504
+ Regexp* re = new Regexp(kLeftParen, flags_);
505
+ re->cap_ = ++ncap_;
506
+ if (name.data() != NULL)
507
+ re->name_ = new string(name.as_string());
508
+ return PushRegexp(re);
509
+ }
510
+
511
+ // Pushes a non-capturing marker onto the stack.
512
+ bool Regexp::ParseState::DoLeftParenNoCapture() {
513
+ Regexp* re = new Regexp(kLeftParen, flags_);
514
+ re->cap_ = -1;
515
+ return PushRegexp(re);
516
+ }
517
+
518
+ // Adds r to cc, along with r's upper case if foldascii is set.
519
+ static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
520
+ cc->AddRange(r, r);
521
+ if (foldascii && 'a' <= r && r <= 'z')
522
+ cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
523
+ }
524
+
525
+ // Processes a vertical bar in the input.
526
+ bool Regexp::ParseState::DoVerticalBar() {
527
+ MaybeConcatString(-1, NoParseFlags);
528
+ DoConcatenation();
529
+
530
+ // Below the vertical bar is a list to alternate.
531
+ // Above the vertical bar is a list to concatenate.
532
+ // We just did the concatenation, so either swap
533
+ // the result below the vertical bar or push a new
534
+ // vertical bar on the stack.
535
+ Regexp* r1;
536
+ Regexp* r2;
537
+ if ((r1 = stacktop_) != NULL &&
538
+ (r2 = stacktop_->down_) != NULL &&
539
+ r2->op() == kVerticalBar) {
540
+ // If above and below vertical bar are literal or char class,
541
+ // can merge into a single char class.
542
+ Regexp* r3;
543
+ if ((r1->op() == kRegexpLiteral ||
544
+ r1->op() == kRegexpCharClass ||
545
+ r1->op() == kRegexpAnyChar) &&
546
+ (r3 = r2->down_) != NULL) {
547
+ Rune rune;
548
+ switch (r3->op()) {
549
+ case kRegexpLiteral: // convert to char class
550
+ rune = r3->rune_;
551
+ r3->op_ = kRegexpCharClass;
552
+ r3->cc_ = NULL;
553
+ r3->ccb_ = new CharClassBuilder;
554
+ AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
555
+ // fall through
556
+ case kRegexpCharClass:
557
+ if (r1->op() == kRegexpLiteral)
558
+ AddLiteral(r3->ccb_, r1->rune_,
559
+ r1->parse_flags_ & Regexp::FoldCase);
560
+ else if (r1->op() == kRegexpCharClass)
561
+ r3->ccb_->AddCharClass(r1->ccb_);
562
+ if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
563
+ delete r3->ccb_;
564
+ r3->ccb_ = NULL;
565
+ r3->op_ = kRegexpAnyChar;
566
+ }
567
+ // fall through
568
+ case kRegexpAnyChar:
569
+ // pop r1
570
+ stacktop_ = r2;
571
+ r1->Decref();
572
+ return true;
573
+ default:
574
+ break;
575
+ }
576
+ }
577
+
578
+ // Swap r1 below vertical bar (r2).
579
+ r1->down_ = r2->down_;
580
+ r2->down_ = r1;
581
+ stacktop_ = r2;
582
+ return true;
583
+ }
584
+ return PushSimpleOp(kVerticalBar);
585
+ }
586
+
587
+ // Processes a right parenthesis in the input.
588
+ bool Regexp::ParseState::DoRightParen() {
589
+ // Finish the current concatenation and alternation.
590
+ DoAlternation();
591
+
592
+ // The stack should be: LeftParen regexp
593
+ // Remove the LeftParen, leaving the regexp,
594
+ // parenthesized.
595
+ Regexp* r1;
596
+ Regexp* r2;
597
+ if ((r1 = stacktop_) == NULL ||
598
+ (r2 = r1->down_) == NULL ||
599
+ r2->op() != kLeftParen) {
600
+ status_->set_code(kRegexpMissingParen);
601
+ status_->set_error_arg(whole_regexp_);
602
+ return false;
603
+ }
604
+
605
+ // Pop off r1, r2. Will Decref or reuse below.
606
+ stacktop_ = r2->down_;
607
+
608
+ // Restore flags from when paren opened.
609
+ Regexp* re = r2;
610
+ flags_ = re->parse_flags();
611
+
612
+ // Rewrite LeftParen as capture if needed.
613
+ if (re->cap_ > 0) {
614
+ re->op_ = kRegexpCapture;
615
+ // re->cap_ is already set
616
+ re->AllocSub(1);
617
+ re->sub()[0] = FinishRegexp(r1);
618
+ re->simple_ = re->ComputeSimple();
619
+ } else {
620
+ re->Decref();
621
+ re = r1;
622
+ }
623
+ return PushRegexp(re);
624
+ }
625
+
626
+ // Processes the end of input, returning the final regexp.
627
+ Regexp* Regexp::ParseState::DoFinish() {
628
+ DoAlternation();
629
+ Regexp* re = stacktop_;
630
+ if (re != NULL && re->down_ != NULL) {
631
+ status_->set_code(kRegexpMissingParen);
632
+ status_->set_error_arg(whole_regexp_);
633
+ return NULL;
634
+ }
635
+ stacktop_ = NULL;
636
+ return FinishRegexp(re);
637
+ }
638
+
639
+ // Returns the leading regexp that re starts with.
640
+ // The returned Regexp* points into a piece of re,
641
+ // so it must not be used after the caller calls re->Decref().
642
+ Regexp* Regexp::LeadingRegexp(Regexp* re) {
643
+ if (re->op() == kRegexpEmptyMatch)
644
+ return NULL;
645
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
646
+ Regexp** sub = re->sub();
647
+ if (sub[0]->op() == kRegexpEmptyMatch)
648
+ return NULL;
649
+ return sub[0];
650
+ }
651
+ return re;
652
+ }
653
+
654
+ // Removes LeadingRegexp(re) from re and returns what's left.
655
+ // Consumes the reference to re and may edit it in place.
656
+ // If caller wants to hold on to LeadingRegexp(re),
657
+ // must have already Incref'ed it.
658
+ Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
659
+ if (re->op() == kRegexpEmptyMatch)
660
+ return re;
661
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
662
+ Regexp** sub = re->sub();
663
+ if (sub[0]->op() == kRegexpEmptyMatch)
664
+ return re;
665
+ sub[0]->Decref();
666
+ sub[0] = NULL;
667
+ if (re->nsub() == 2) {
668
+ // Collapse concatenation to single regexp.
669
+ Regexp* nre = sub[1];
670
+ sub[1] = NULL;
671
+ re->Decref();
672
+ return nre;
673
+ }
674
+ // 3 or more -> 2 or more.
675
+ re->nsub_--;
676
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
677
+ return re;
678
+ }
679
+ Regexp::ParseFlags pf = re->parse_flags();
680
+ re->Decref();
681
+ return new Regexp(kRegexpEmptyMatch, pf);
682
+ }
683
+
684
+ // Returns the leading string that re starts with.
685
+ // The returned Rune* points into a piece of re,
686
+ // so it must not be used after the caller calls re->Decref().
687
+ Rune* Regexp::LeadingString(Regexp* re, int *nrune,
688
+ Regexp::ParseFlags *flags) {
689
+ while (re->op() == kRegexpConcat && re->nsub() > 0)
690
+ re = re->sub()[0];
691
+
692
+ *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
693
+
694
+ if (re->op() == kRegexpLiteral) {
695
+ *nrune = 1;
696
+ return &re->rune_;
697
+ }
698
+
699
+ if (re->op() == kRegexpLiteralString) {
700
+ *nrune = re->nrunes_;
701
+ return re->runes_;
702
+ }
703
+
704
+ *nrune = 0;
705
+ return NULL;
706
+ }
707
+
708
+ // Removes the first n leading runes from the beginning of re.
709
+ // Edits re in place.
710
+ void Regexp::RemoveLeadingString(Regexp* re, int n) {
711
+ // Chase down concats to find first string.
712
+ // For regexps generated by parser, nested concats are
713
+ // flattened except when doing so would overflow the 16-bit
714
+ // limit on the size of a concatenation, so we should never
715
+ // see more than two here.
716
+ Regexp* stk[4];
717
+ int d = 0;
718
+ while (re->op() == kRegexpConcat) {
719
+ if (d < arraysize(stk))
720
+ stk[d++] = re;
721
+ re = re->sub()[0];
722
+ }
723
+
724
+ // Remove leading string from re.
725
+ if (re->op() == kRegexpLiteral) {
726
+ re->rune_ = 0;
727
+ re->op_ = kRegexpEmptyMatch;
728
+ } else if (re->op() == kRegexpLiteralString) {
729
+ if (n >= re->nrunes_) {
730
+ delete[] re->runes_;
731
+ re->runes_ = NULL;
732
+ re->nrunes_ = 0;
733
+ re->op_ = kRegexpEmptyMatch;
734
+ } else if (n == re->nrunes_ - 1) {
735
+ Rune rune = re->runes_[re->nrunes_ - 1];
736
+ delete[] re->runes_;
737
+ re->runes_ = NULL;
738
+ re->nrunes_ = 0;
739
+ re->rune_ = rune;
740
+ re->op_ = kRegexpLiteral;
741
+ } else {
742
+ re->nrunes_ -= n;
743
+ memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
744
+ }
745
+ }
746
+
747
+ // If re is now empty, concatenations might simplify too.
748
+ while (d-- > 0) {
749
+ re = stk[d];
750
+ Regexp** sub = re->sub();
751
+ if (sub[0]->op() == kRegexpEmptyMatch) {
752
+ sub[0]->Decref();
753
+ sub[0] = NULL;
754
+ // Delete first element of concat.
755
+ switch (re->nsub()) {
756
+ case 0:
757
+ case 1:
758
+ // Impossible.
759
+ LOG(DFATAL) << "Concat of " << re->nsub();
760
+ re->submany_ = NULL;
761
+ re->op_ = kRegexpEmptyMatch;
762
+ break;
763
+
764
+ case 2: {
765
+ // Replace re with sub[1].
766
+ Regexp* old = sub[1];
767
+ sub[1] = NULL;
768
+ re->Swap(old);
769
+ old->Decref();
770
+ break;
771
+ }
772
+
773
+ default:
774
+ // Slide down.
775
+ re->nsub_--;
776
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
777
+ break;
778
+ }
779
+ }
780
+ }
781
+ }
782
+
783
+ // Factors common prefixes from alternation.
784
+ // For example,
785
+ // ABC|ABD|AEF|BCX|BCY
786
+ // simplifies to
787
+ // A(B(C|D)|EF)|BC(X|Y)
788
+ // which the normal parse state routines will further simplify to
789
+ // A(B[CD]|EF)|BC[XY]
790
+ //
791
+ // Rewrites sub to contain simplified list to alternate and returns
792
+ // the new length of sub. Adjusts reference counts accordingly
793
+ // (incoming sub[i] decremented, outgoing sub[i] incremented).
794
+
795
+ // It's too much of a pain to write this code with an explicit stack,
796
+ // so instead we let the caller specify a maximum depth and
797
+ // don't simplify beyond that. There are around 15 words of local
798
+ // variables and parameters in the frame, so allowing 8 levels
799
+ // on a 64-bit machine is still less than a kilobyte of stack and
800
+ // probably enough benefit for practical uses.
801
+ const int kFactorAlternationMaxDepth = 8;
802
+
803
+ int Regexp::FactorAlternation(
804
+ Regexp** sub, int n,
805
+ Regexp::ParseFlags altflags) {
806
+ return FactorAlternationRecursive(sub, n, altflags,
807
+ kFactorAlternationMaxDepth);
808
+ }
809
+
810
+ int Regexp::FactorAlternationRecursive(
811
+ Regexp** sub, int n,
812
+ Regexp::ParseFlags altflags,
813
+ int maxdepth) {
814
+
815
+ if (maxdepth <= 0)
816
+ return n;
817
+
818
+ // Round 1: Factor out common literal prefixes.
819
+ Rune *rune = NULL;
820
+ int nrune = 0;
821
+ Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
822
+ int start = 0;
823
+ int out = 0;
824
+ for (int i = 0; i <= n; i++) {
825
+ // Invariant: what was in sub[0:start] has been Decref'ed
826
+ // and that space has been reused for sub[0:out] (out <= start).
827
+ //
828
+ // Invariant: sub[start:i] consists of regexps that all begin
829
+ // with the string rune[0:nrune].
830
+
831
+ Rune* rune_i = NULL;
832
+ int nrune_i = 0;
833
+ Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
834
+ if (i < n) {
835
+ rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i);
836
+ if (runeflags_i == runeflags) {
837
+ int same = 0;
838
+ while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
839
+ same++;
840
+ if (same > 0) {
841
+ // Matches at least one rune in current range. Keep going around.
842
+ nrune = same;
843
+ continue;
844
+ }
845
+ }
846
+ }
847
+
848
+ // Found end of a run with common leading literal string:
849
+ // sub[start:i] all begin with rune[0:nrune] but sub[i]
850
+ // does not even begin with rune[0].
851
+ //
852
+ // Factor out common string and append factored expression to sub[0:out].
853
+ if (i == start) {
854
+ // Nothing to do - first iteration.
855
+ } else if (i == start+1) {
856
+ // Just one: don't bother factoring.
857
+ sub[out++] = sub[start];
858
+ } else {
859
+ // Construct factored form: prefix(suffix1|suffix2|...)
860
+ Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|...
861
+ x[0] = LiteralString(rune, nrune, runeflags);
862
+ for (int j = start; j < i; j++)
863
+ RemoveLeadingString(sub[j], nrune);
864
+ int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
865
+ maxdepth - 1);
866
+ x[1] = AlternateNoFactor(sub + start, nn, altflags);
867
+ sub[out++] = Concat(x, 2, altflags);
868
+ }
869
+
870
+ // Prepare for next round (if there is one).
871
+ if (i < n) {
872
+ start = i;
873
+ rune = rune_i;
874
+ nrune = nrune_i;
875
+ runeflags = runeflags_i;
876
+ }
877
+ }
878
+ n = out;
879
+
880
+ // Round 2: Factor out common complex prefixes,
881
+ // just the first piece of each concatenation,
882
+ // whatever it is. This is good enough a lot of the time.
883
+ start = 0;
884
+ out = 0;
885
+ Regexp* first = NULL;
886
+ for (int i = 0; i <= n; i++) {
887
+ // Invariant: what was in sub[0:start] has been Decref'ed
888
+ // and that space has been reused for sub[0:out] (out <= start).
889
+ //
890
+ // Invariant: sub[start:i] consists of regexps that all begin with first.
891
+
892
+ Regexp* first_i = NULL;
893
+ if (i < n) {
894
+ first_i = LeadingRegexp(sub[i]);
895
+ if (first != NULL && Regexp::Equal(first, first_i)) {
896
+ continue;
897
+ }
898
+ }
899
+
900
+ // Found end of a run with common leading regexp:
901
+ // sub[start:i] all begin with first but sub[i] does not.
902
+ //
903
+ // Factor out common regexp and append factored expression to sub[0:out].
904
+ if (i == start) {
905
+ // Nothing to do - first iteration.
906
+ } else if (i == start+1) {
907
+ // Just one: don't bother factoring.
908
+ sub[out++] = sub[start];
909
+ } else {
910
+ // Construct factored form: prefix(suffix1|suffix2|...)
911
+ Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|...
912
+ x[0] = first->Incref();
913
+ for (int j = start; j < i; j++)
914
+ sub[j] = RemoveLeadingRegexp(sub[j]);
915
+ int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
916
+ maxdepth - 1);
917
+ x[1] = AlternateNoFactor(sub + start, nn, altflags);
918
+ sub[out++] = Concat(x, 2, altflags);
919
+ }
920
+
921
+ // Prepare for next round (if there is one).
922
+ if (i < n) {
923
+ start = i;
924
+ first = first_i;
925
+ }
926
+ }
927
+ n = out;
928
+
929
+ // Round 3: Collapse runs of single literals into character classes.
930
+ start = 0;
931
+ out = 0;
932
+ for (int i = 0; i <= n; i++) {
933
+ // Invariant: what was in sub[0:start] has been Decref'ed
934
+ // and that space has been reused for sub[0:out] (out <= start).
935
+ //
936
+ // Invariant: sub[start:i] consists of regexps that are either
937
+ // literal runes or character classes.
938
+
939
+ if (i < n &&
940
+ (sub[i]->op() == kRegexpLiteral ||
941
+ sub[i]->op() == kRegexpCharClass))
942
+ continue;
943
+
944
+ // sub[i] is not a char or char class;
945
+ // emit char class for sub[start:i]...
946
+ if (i == start) {
947
+ // Nothing to do.
948
+ } else if (i == start+1) {
949
+ sub[out++] = sub[start];
950
+ } else {
951
+ // Make new char class.
952
+ CharClassBuilder ccb;
953
+ for (int j = start; j < i; j++) {
954
+ Regexp* re = sub[j];
955
+ if (re->op() == kRegexpCharClass) {
956
+ CharClass* cc = re->cc();
957
+ for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
958
+ ccb.AddRange(it->lo, it->hi);
959
+ } else if (re->op() == kRegexpLiteral) {
960
+ ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
961
+ } else {
962
+ LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
963
+ << re->ToString();
964
+ }
965
+ re->Decref();
966
+ }
967
+ sub[out++] = NewCharClass(ccb.GetCharClass(), altflags);
968
+ }
969
+
970
+ // ... and then emit sub[i].
971
+ if (i < n)
972
+ sub[out++] = sub[i];
973
+ start = i+1;
974
+ }
975
+ n = out;
976
+
977
+ // Round 4: Collapse runs of empty matches into single empty match.
978
+ start = 0;
979
+ out = 0;
980
+ for (int i = 0; i < n; i++) {
981
+ if (i + 1 < n &&
982
+ sub[i]->op() == kRegexpEmptyMatch &&
983
+ sub[i+1]->op() == kRegexpEmptyMatch) {
984
+ sub[i]->Decref();
985
+ continue;
986
+ }
987
+ sub[out++] = sub[i];
988
+ }
989
+ n = out;
990
+
991
+ return n;
992
+ }
993
+
994
+ // Collapse the regexps on top of the stack, down to the
995
+ // first marker, into a new op node (op == kRegexpAlternate
996
+ // or op == kRegexpConcat).
997
+ void Regexp::ParseState::DoCollapse(RegexpOp op) {
998
+ // Scan backward to marker, counting children of composite.
999
+ int n = 0;
1000
+ Regexp* next = NULL;
1001
+ Regexp* sub;
1002
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1003
+ next = sub->down_;
1004
+ if (sub->op_ == op)
1005
+ n += sub->nsub_;
1006
+ else
1007
+ n++;
1008
+ }
1009
+
1010
+ // If there's just one child, leave it alone.
1011
+ // (Concat of one thing is that one thing; alternate of one thing is same.)
1012
+ if (stacktop_ != NULL && stacktop_->down_ == next)
1013
+ return;
1014
+
1015
+ // Construct op (alternation or concatenation), flattening op of op.
1016
+ Regexp** subs = new Regexp*[n];
1017
+ next = NULL;
1018
+ int i = n;
1019
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1020
+ next = sub->down_;
1021
+ if (sub->op_ == op) {
1022
+ Regexp** sub_subs = sub->sub();
1023
+ for (int k = sub->nsub_ - 1; k >= 0; k--)
1024
+ subs[--i] = sub_subs[k]->Incref();
1025
+ sub->Decref();
1026
+ } else {
1027
+ subs[--i] = FinishRegexp(sub);
1028
+ }
1029
+ }
1030
+
1031
+ Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true);
1032
+ delete[] subs;
1033
+ re->simple_ = re->ComputeSimple();
1034
+ re->down_ = next;
1035
+ stacktop_ = re;
1036
+ }
1037
+
1038
+ // Finishes the current concatenation,
1039
+ // collapsing it into a single regexp on the stack.
1040
+ void Regexp::ParseState::DoConcatenation() {
1041
+ Regexp* r1 = stacktop_;
1042
+ if (r1 == NULL || IsMarker(r1->op())) {
1043
+ // empty concatenation is special case
1044
+ Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
1045
+ PushRegexp(re);
1046
+ }
1047
+ DoCollapse(kRegexpConcat);
1048
+ }
1049
+
1050
+ // Finishes the current alternation,
1051
+ // collapsing it to a single regexp on the stack.
1052
+ void Regexp::ParseState::DoAlternation() {
1053
+ DoVerticalBar();
1054
+ // Now stack top is kVerticalBar.
1055
+ Regexp* r1 = stacktop_;
1056
+ stacktop_ = r1->down_;
1057
+ r1->Decref();
1058
+ DoCollapse(kRegexpAlternate);
1059
+ }
1060
+
1061
+ // Incremental conversion of concatenated literals into strings.
1062
+ // If top two elements on stack are both literal or string,
1063
+ // collapse into single string.
1064
+ // Don't walk down the stack -- the parser calls this frequently
1065
+ // enough that below the bottom two is known to be collapsed.
1066
+ // Only called when another regexp is about to be pushed
1067
+ // on the stack, so that the topmost literal is not being considered.
1068
+ // (Otherwise ab* would turn into (ab)*.)
1069
+ // If r >= 0, consider pushing a literal r on the stack.
1070
+ // Return whether that happened.
1071
+ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1072
+ Regexp* re1;
1073
+ Regexp* re2;
1074
+ if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
1075
+ return false;
1076
+
1077
+ if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1078
+ return false;
1079
+ if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1080
+ return false;
1081
+ if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1082
+ return false;
1083
+
1084
+ if (re2->op_ == kRegexpLiteral) {
1085
+ // convert into string
1086
+ Rune rune = re2->rune_;
1087
+ re2->op_ = kRegexpLiteralString;
1088
+ re2->nrunes_ = 0;
1089
+ re2->runes_ = NULL;
1090
+ re2->AddRuneToString(rune);
1091
+ }
1092
+
1093
+ // push re1 into re2.
1094
+ if (re1->op_ == kRegexpLiteral) {
1095
+ re2->AddRuneToString(re1->rune_);
1096
+ } else {
1097
+ for (int i = 0; i < re1->nrunes_; i++)
1098
+ re2->AddRuneToString(re1->runes_[i]);
1099
+ re1->nrunes_ = 0;
1100
+ delete[] re1->runes_;
1101
+ re1->runes_ = NULL;
1102
+ }
1103
+
1104
+ // reuse re1 if possible
1105
+ if (r >= 0) {
1106
+ re1->op_ = kRegexpLiteral;
1107
+ re1->rune_ = r;
1108
+ re1->parse_flags_ = flags;
1109
+ return true;
1110
+ }
1111
+
1112
+ stacktop_ = re2;
1113
+ re1->Decref();
1114
+ return false;
1115
+ }
1116
+
1117
+ // Lexing routines.
1118
+
1119
+ // Parses a decimal integer, storing it in *n.
1120
+ // Sets *s to span the remainder of the string.
1121
+ // Sets *out_re to the regexp for the class.
1122
+ static bool ParseInteger(StringPiece* s, int* np) {
1123
+ if (s->size() == 0 || !isdigit((*s)[0] & 0xFF))
1124
+ return false;
1125
+ // Disallow leading zeros.
1126
+ if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
1127
+ return false;
1128
+ int n = 0;
1129
+ int c;
1130
+ while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) {
1131
+ // Avoid overflow.
1132
+ if (n >= 100000000)
1133
+ return false;
1134
+ n = n*10 + c - '0';
1135
+ s->remove_prefix(1); // digit
1136
+ }
1137
+ *np = n;
1138
+ return true;
1139
+ }
1140
+
1141
+ // Parses a repetition suffix like {1,2} or {2} or {2,}.
1142
+ // Sets *s to span the remainder of the string on success.
1143
+ // Sets *lo and *hi to the given range.
1144
+ // In the case of {2,}, the high number is unbounded;
1145
+ // sets *hi to -1 to signify this.
1146
+ // {,2} is NOT a valid suffix.
1147
+ // The Maybe in the name signifies that the regexp parse
1148
+ // doesn't fail even if ParseRepetition does, so the StringPiece
1149
+ // s must NOT be edited unless MaybeParseRepetition returns true.
1150
+ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
1151
+ StringPiece s = *sp;
1152
+ if (s.size() == 0 || s[0] != '{')
1153
+ return false;
1154
+ s.remove_prefix(1); // '{'
1155
+ if (!ParseInteger(&s, lo))
1156
+ return false;
1157
+ if (s.size() == 0)
1158
+ return false;
1159
+ if (s[0] == ',') {
1160
+ s.remove_prefix(1); // ','
1161
+ if (s.size() == 0)
1162
+ return false;
1163
+ if (s[0] == '}') {
1164
+ // {2,} means at least 2
1165
+ *hi = -1;
1166
+ } else {
1167
+ // {2,4} means 2, 3, or 4.
1168
+ if (!ParseInteger(&s, hi))
1169
+ return false;
1170
+ }
1171
+ } else {
1172
+ // {2} means exactly two
1173
+ *hi = *lo;
1174
+ }
1175
+ if (s.size() == 0 || s[0] != '}')
1176
+ return false;
1177
+ s.remove_prefix(1); // '}'
1178
+ *sp = s;
1179
+ return true;
1180
+ }
1181
+
1182
+ // Removes the next Rune from the StringPiece and stores it in *r.
1183
+ // Returns number of bytes removed from sp.
1184
+ // Behaves as though there is a terminating NUL at the end of sp.
1185
+ // Argument order is backwards from usual Google style
1186
+ // but consistent with chartorune.
1187
+ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
1188
+ int n;
1189
+ if (fullrune(sp->data(), sp->size())) {
1190
+ n = chartorune(r, sp->data());
1191
+ if (!(n == 1 && *r == Runeerror)) { // no decoding error
1192
+ sp->remove_prefix(n);
1193
+ return n;
1194
+ }
1195
+ }
1196
+
1197
+ status->set_code(kRegexpBadUTF8);
1198
+ status->set_error_arg(NULL);
1199
+ return -1;
1200
+ }
1201
+
1202
+ // Return whether name is valid UTF-8.
1203
+ // If not, set status to kRegexpBadUTF8.
1204
+ static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
1205
+ StringPiece t = s;
1206
+ Rune r;
1207
+ while (t.size() > 0) {
1208
+ if (StringPieceToRune(&r, &t, status) < 0)
1209
+ return false;
1210
+ }
1211
+ return true;
1212
+ }
1213
+
1214
// Returns 1 if c is an ASCII hexadecimal digit (0-9, A-F, a-f), else 0.
static int IsHex(int c) {
  if (c >= '0' && c <= '9')
    return 1;
  if (c >= 'A' && c <= 'F')
    return 1;
  if (c >= 'a' && c <= 'f')
    return 1;
  return 0;
}
1220
+
1221
+ // Convert hex digit to value.
1222
+ static int UnHex(int c) {
1223
+ if ('0' <= c && c <= '9')
1224
+ return c - '0';
1225
+ if ('A' <= c && c <= 'F')
1226
+ return c - 'A' + 10;
1227
+ if ('a' <= c && c <= 'f')
1228
+ return c - 'a' + 10;
1229
+ LOG(DFATAL) << "Bad hex digit " << c;
1230
+ return 0;
1231
+ }
1232
+
1233
+ // Parse an escape sequence (e.g., \n, \{).
1234
+ // Sets *s to span the remainder of the string.
1235
+ // Sets *rp to the named character.
1236
+ static bool ParseEscape(StringPiece* s, Rune* rp,
1237
+ RegexpStatus* status, int rune_max) {
1238
+ const char* begin = s->begin();
1239
+ if (s->size() < 1 || (*s)[0] != '\\') {
1240
+ // Should not happen - caller always checks.
1241
+ status->set_code(kRegexpInternalError);
1242
+ status->set_error_arg(NULL);
1243
+ return false;
1244
+ }
1245
+ if (s->size() < 2) {
1246
+ status->set_code(kRegexpTrailingBackslash);
1247
+ status->set_error_arg(NULL);
1248
+ return false;
1249
+ }
1250
+ Rune c, c1;
1251
+ s->remove_prefix(1); // backslash
1252
+ if (StringPieceToRune(&c, s, status) < 0)
1253
+ return false;
1254
+ int code;
1255
+ switch (c) {
1256
+ default:
1257
+ if (c < Runeself && !isalpha(c) && !isdigit(c)) {
1258
+ // Escaped non-word characters are always themselves.
1259
+ // PCRE is not quite so rigorous: it accepts things like
1260
+ // \q, but we don't. We once rejected \_, but too many
1261
+ // programs and people insist on using it, so allow \_.
1262
+ *rp = c;
1263
+ return true;
1264
+ }
1265
+ goto BadEscape;
1266
+
1267
+ // Octal escapes.
1268
+ case '1':
1269
+ case '2':
1270
+ case '3':
1271
+ case '4':
1272
+ case '5':
1273
+ case '6':
1274
+ case '7':
1275
+ // Single non-zero octal digit is a backreference; not supported.
1276
+ if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7')
1277
+ goto BadEscape;
1278
+ // fall through
1279
+ case '0':
1280
+ // consume up to three octal digits; already have one.
1281
+ code = c - '0';
1282
+ if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {
1283
+ code = code * 8 + c - '0';
1284
+ s->remove_prefix(1); // digit
1285
+ if (s->size() > 0) {
1286
+ c = (*s)[0];
1287
+ if ('0' <= c && c <= '7') {
1288
+ code = code * 8 + c - '0';
1289
+ s->remove_prefix(1); // digit
1290
+ }
1291
+ }
1292
+ }
1293
+ *rp = code;
1294
+ return true;
1295
+
1296
+ // Hexadecimal escapes
1297
+ case 'x':
1298
+ if (s->size() == 0)
1299
+ goto BadEscape;
1300
+ if (StringPieceToRune(&c, s, status) < 0)
1301
+ return false;
1302
+ if (c == '{') {
1303
+ // Any number of digits in braces.
1304
+ // Update n as we consume the string, so that
1305
+ // the whole thing gets shown in the error message.
1306
+ // Perl accepts any text at all; it ignores all text
1307
+ // after the first non-hex digit. We require only hex digits,
1308
+ // and at least one.
1309
+ if (StringPieceToRune(&c, s, status) < 0)
1310
+ return false;
1311
+ int nhex = 0;
1312
+ code = 0;
1313
+ while (IsHex(c)) {
1314
+ nhex++;
1315
+ code = code * 16 + UnHex(c);
1316
+ if (code > rune_max)
1317
+ goto BadEscape;
1318
+ if (s->size() == 0)
1319
+ goto BadEscape;
1320
+ if (StringPieceToRune(&c, s, status) < 0)
1321
+ return false;
1322
+ }
1323
+ if (c != '}' || nhex == 0)
1324
+ goto BadEscape;
1325
+ *rp = code;
1326
+ return true;
1327
+ }
1328
+ // Easy case: two hex digits.
1329
+ if (s->size() == 0)
1330
+ goto BadEscape;
1331
+ if (StringPieceToRune(&c1, s, status) < 0)
1332
+ return false;
1333
+ if (!IsHex(c) || !IsHex(c1))
1334
+ goto BadEscape;
1335
+ *rp = UnHex(c) * 16 + UnHex(c1);
1336
+ return true;
1337
+
1338
+ // C escapes.
1339
+ case 'n':
1340
+ *rp = '\n';
1341
+ return true;
1342
+ case 'r':
1343
+ *rp = '\r';
1344
+ return true;
1345
+ case 't':
1346
+ *rp = '\t';
1347
+ return true;
1348
+
1349
+ // Less common C escapes.
1350
+ case 'a':
1351
+ *rp = '\a';
1352
+ return true;
1353
+ case 'f':
1354
+ *rp = '\f';
1355
+ return true;
1356
+ case 'v':
1357
+ *rp = '\v';
1358
+ return true;
1359
+
1360
+ // This code is disabled to avoid misparsing
1361
+ // the Perl word-boundary \b as a backspace
1362
+ // when in POSIX regexp mode. Surprisingly,
1363
+ // in Perl, \b means word-boundary but [\b]
1364
+ // means backspace. We don't support that:
1365
+ // if you want a backspace embed a literal
1366
+ // backspace character or use \x08.
1367
+ //
1368
+ // case 'b':
1369
+ // *rp = '\b';
1370
+ // return true;
1371
+ }
1372
+
1373
+ LOG(DFATAL) << "Not reached in ParseEscape.";
1374
+
1375
+ BadEscape:
1376
+ // Unrecognized escape sequence.
1377
+ status->set_code(kRegexpBadEscape);
1378
+ status->set_error_arg(StringPiece(begin, s->data() - begin));
1379
+ return false;
1380
+ }
1381
+
1382
+ // Add a range to the character class, but exclude newline if asked.
1383
+ // Also handle case folding.
1384
+ void CharClassBuilder::AddRangeFlags(
1385
+ Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1386
+
1387
+ // Take out \n if the flags say so.
1388
+ bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1389
+ (parse_flags & Regexp::NeverNL);
1390
+ if (cutnl && lo <= '\n' && '\n' <= hi) {
1391
+ if (lo < '\n')
1392
+ AddRangeFlags(lo, '\n' - 1, parse_flags);
1393
+ if (hi > '\n')
1394
+ AddRangeFlags('\n' + 1, hi, parse_flags);
1395
+ return;
1396
+ }
1397
+
1398
+ // If folding case, add fold-equivalent characters too.
1399
+ if (parse_flags & Regexp::FoldCase)
1400
+ AddFoldedRange(this, lo, hi, 0);
1401
+ else
1402
+ AddRange(lo, hi);
1403
+ }
1404
+
1405
+ // Look for a group with the given name.
1406
+ static UGroup* LookupGroup(const StringPiece& name,
1407
+ UGroup *groups, int ngroups) {
1408
+ // Simple name lookup.
1409
+ for (int i = 0; i < ngroups; i++)
1410
+ if (StringPiece(groups[i].name) == name)
1411
+ return &groups[i];
1412
+ return NULL;
1413
+ }
1414
+
1415
+ // Fake UGroup containing all Runes
1416
+ static URange16 any16[] = { { 0, 65535 } };
1417
+ static URange32 any32[] = { { 65536, Runemax } };
1418
+ static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
1419
+
1420
+ // Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1421
+ static UGroup* LookupPosixGroup(const StringPiece& name) {
1422
+ return LookupGroup(name, posix_groups, num_posix_groups);
1423
+ }
1424
+
1425
+ static UGroup* LookupPerlGroup(const StringPiece& name) {
1426
+ return LookupGroup(name, perl_groups, num_perl_groups);
1427
+ }
1428
+
1429
+ // Look for a Unicode group with the given name (e.g., "Han")
1430
+ static UGroup* LookupUnicodeGroup(const StringPiece& name) {
1431
+ // Special case: "Any" means any.
1432
+ if (name == StringPiece("Any"))
1433
+ return &anygroup;
1434
+ return LookupGroup(name, unicode_groups, num_unicode_groups);
1435
+ }
1436
+
1437
+ // Add a UGroup or its negation to the character class.
1438
+ static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign,
1439
+ Regexp::ParseFlags parse_flags) {
1440
+ if (sign == +1) {
1441
+ for (int i = 0; i < g->nr16; i++) {
1442
+ cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1443
+ }
1444
+ for (int i = 0; i < g->nr32; i++) {
1445
+ cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1446
+ }
1447
+ } else {
1448
+ if (parse_flags & Regexp::FoldCase) {
1449
+ // Normally adding a case-folded group means
1450
+ // adding all the extra fold-equivalent runes too.
1451
+ // But if we're adding the negation of the group,
1452
+ // we have to exclude all the runes that are fold-equivalent
1453
+ // to what's already missing. Too hard, so do in two steps.
1454
+ CharClassBuilder ccb1;
1455
+ AddUGroup(&ccb1, g, +1, parse_flags);
1456
+ ccb1.Negate();
1457
+ cc->AddCharClass(&ccb1);
1458
+ return;
1459
+ }
1460
+ int next = 0;
1461
+ for (int i = 0; i < g->nr16; i++) {
1462
+ if (next < g->r16[i].lo)
1463
+ cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
1464
+ next = g->r16[i].hi + 1;
1465
+ }
1466
+ for (int i = 0; i < g->nr32; i++) {
1467
+ if (next < g->r32[i].lo)
1468
+ cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
1469
+ next = g->r32[i].hi + 1;
1470
+ }
1471
+ if (next <= Runemax)
1472
+ cc->AddRangeFlags(next, Runemax, parse_flags);
1473
+ }
1474
+ }
1475
+
1476
+ // Maybe parse a Perl character class escape sequence.
1477
+ // Only recognizes the Perl character classes (\d \s \w \D \S \W),
1478
+ // not the Perl empty-string classes (\b \B \A \Z \z).
1479
+ // On success, sets *s to span the remainder of the string
1480
+ // and returns the corresponding UGroup.
1481
+ // The StringPiece must *NOT* be edited unless the call succeeds.
1482
+ UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
1483
+ if (!(parse_flags & Regexp::PerlClasses))
1484
+ return NULL;
1485
+ if (s->size() < 2 || (*s)[0] != '\\')
1486
+ return NULL;
1487
+ // Could use StringPieceToRune, but there aren't
1488
+ // any non-ASCII Perl group names.
1489
+ StringPiece name(s->begin(), 2);
1490
+ UGroup *g = LookupPerlGroup(name);
1491
+ if (g == NULL)
1492
+ return NULL;
1493
+ s->remove_prefix(name.size());
1494
+ return g;
1495
+ }
1496
+
1497
// Three-way outcome of the "maybe parse" helpers for character classes.
enum ParseStatus {
  kParseOk,       // Input was recognized and consumed.
  kParseError,    // Input was recognized but is invalid.
  kParseNothing,  // Input was not recognized; nothing was parsed.
};
1502
+
1503
+ // Maybe parses a Unicode character group like \p{Han} or \P{Han}
1504
+ // (the latter is a negated group).
1505
+ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
1506
+ CharClassBuilder *cc,
1507
+ RegexpStatus* status) {
1508
+ // Decide whether to parse.
1509
+ if (!(parse_flags & Regexp::UnicodeGroups))
1510
+ return kParseNothing;
1511
+ if (s->size() < 2 || (*s)[0] != '\\')
1512
+ return kParseNothing;
1513
+ Rune c = (*s)[1];
1514
+ if (c != 'p' && c != 'P')
1515
+ return kParseNothing;
1516
+
1517
+ // Committed to parse. Results:
1518
+ int sign = +1; // -1 = negated char class
1519
+ if (c == 'P')
1520
+ sign = -1;
1521
+ StringPiece seq = *s; // \p{Han} or \pL
1522
+ StringPiece name; // Han or L
1523
+ s->remove_prefix(2); // '\\', 'p'
1524
+
1525
+ if (!StringPieceToRune(&c, s, status))
1526
+ return kParseError;
1527
+ if (c != '{') {
1528
+ // Name is the bit of string we just skipped over for c.
1529
+ const char* p = seq.begin() + 2;
1530
+ name = StringPiece(p, s->begin() - p);
1531
+ } else {
1532
+ // Name is in braces. Look for closing }
1533
+ int end = s->find('}', 0);
1534
+ if (end == s->npos) {
1535
+ if (!IsValidUTF8(seq, status))
1536
+ return kParseError;
1537
+ status->set_code(kRegexpBadCharRange);
1538
+ status->set_error_arg(seq);
1539
+ return kParseError;
1540
+ }
1541
+ name = StringPiece(s->begin(), end); // without '}'
1542
+ s->remove_prefix(end + 1); // with '}'
1543
+ if (!IsValidUTF8(name, status))
1544
+ return kParseError;
1545
+ }
1546
+
1547
+ // Chop seq where s now begins.
1548
+ seq = StringPiece(seq.begin(), s->begin() - seq.begin());
1549
+
1550
+ // Look up group
1551
+ if (name.size() > 0 && name[0] == '^') {
1552
+ sign = -sign;
1553
+ name.remove_prefix(1); // '^'
1554
+ }
1555
+ UGroup *g = LookupUnicodeGroup(name);
1556
+ if (g == NULL) {
1557
+ status->set_code(kRegexpBadCharRange);
1558
+ status->set_error_arg(seq);
1559
+ return kParseError;
1560
+ }
1561
+
1562
+ AddUGroup(cc, g, sign, parse_flags);
1563
+ return kParseOk;
1564
+ }
1565
+
1566
+ // Parses a character class name like [:alnum:].
1567
+ // Sets *s to span the remainder of the string.
1568
+ // Adds the ranges corresponding to the class to ranges.
1569
+ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
1570
+ CharClassBuilder *cc,
1571
+ RegexpStatus* status) {
1572
+ // Check begins with [:
1573
+ const char* p = s->data();
1574
+ const char* ep = s->data() + s->size();
1575
+ if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1576
+ return kParseNothing;
1577
+
1578
+ // Look for closing :].
1579
+ const char* q;
1580
+ for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
1581
+ ;
1582
+
1583
+ // If no closing :], then ignore.
1584
+ if (q > ep-2)
1585
+ return kParseNothing;
1586
+
1587
+ // Got it. Check that it's valid.
1588
+ q += 2;
1589
+ StringPiece name(p, q-p);
1590
+
1591
+ UGroup *g = LookupPosixGroup(name);
1592
+ if (g == NULL) {
1593
+ status->set_code(kRegexpBadCharRange);
1594
+ status->set_error_arg(name);
1595
+ return kParseError;
1596
+ }
1597
+
1598
+ s->remove_prefix(name.size());
1599
+ AddUGroup(cc, g, g->sign, parse_flags);
1600
+ return kParseOk;
1601
+ }
1602
+
1603
+ // Parses a character inside a character class.
1604
+ // There are fewer special characters here than in the rest of the regexp.
1605
+ // Sets *s to span the remainder of the string.
1606
+ // Sets *rp to the character.
1607
+ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
1608
+ const StringPiece& whole_class,
1609
+ RegexpStatus* status) {
1610
+ if (s->size() == 0) {
1611
+ status->set_code(kRegexpMissingBracket);
1612
+ status->set_error_arg(whole_class);
1613
+ return false;
1614
+ }
1615
+
1616
+ // Allow regular escape sequences even though
1617
+ // many need not be escaped in this context.
1618
+ if (s->size() >= 1 && (*s)[0] == '\\')
1619
+ return ParseEscape(s, rp, status, rune_max_);
1620
+
1621
+ // Otherwise take the next rune.
1622
+ return StringPieceToRune(rp, s, status) >= 0;
1623
+ }
1624
+
1625
+ // Parses a character class character, or, if the character
1626
+ // is followed by a hyphen, parses a character class range.
1627
+ // For single characters, rr->lo == rr->hi.
1628
+ // Sets *s to span the remainder of the string.
1629
+ // Sets *rp to the character.
1630
+ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
1631
+ const StringPiece& whole_class,
1632
+ RegexpStatus* status) {
1633
+ StringPiece os = *s;
1634
+ if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1635
+ return false;
1636
+ // [a-] means (a|-), so check for final ].
1637
+ if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
1638
+ s->remove_prefix(1); // '-'
1639
+ if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1640
+ return false;
1641
+ if (rr->hi < rr->lo) {
1642
+ status->set_code(kRegexpBadCharRange);
1643
+ status->set_error_arg(StringPiece(os.data(), s->data() - os.data()));
1644
+ return false;
1645
+ }
1646
+ } else {
1647
+ rr->hi = rr->lo;
1648
+ }
1649
+ return true;
1650
+ }
1651
+
1652
+ // Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1653
+ // Sets *s to span the remainder of the string.
1654
+ // Sets *out_re to the regexp for the class.
1655
+ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
1656
+ Regexp** out_re,
1657
+ RegexpStatus* status) {
1658
+ StringPiece whole_class = *s;
1659
+ if (s->size() == 0 || (*s)[0] != '[') {
1660
+ // Caller checked this.
1661
+ status->set_code(kRegexpInternalError);
1662
+ status->set_error_arg(NULL);
1663
+ return false;
1664
+ }
1665
+ bool negated = false;
1666
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1667
+ re->ccb_ = new CharClassBuilder;
1668
+ s->remove_prefix(1); // '['
1669
+ if (s->size() > 0 && (*s)[0] == '^') {
1670
+ s->remove_prefix(1); // '^'
1671
+ negated = true;
1672
+ if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1673
+ // If NL can't match implicitly, then pretend
1674
+ // negated classes include a leading \n.
1675
+ re->ccb_->AddRange('\n', '\n');
1676
+ }
1677
+ }
1678
+ bool first = true; // ] is okay as first char in class
1679
+ while (s->size() > 0 && ((*s)[0] != ']' || first)) {
1680
+ // - is only okay unescaped as first or last in class.
1681
+ // Except that Perl allows - anywhere.
1682
+ if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1683
+ (s->size() == 1 || (*s)[1] != ']')) {
1684
+ StringPiece t = *s;
1685
+ t.remove_prefix(1); // '-'
1686
+ Rune r;
1687
+ int n = StringPieceToRune(&r, &t, status);
1688
+ if (n < 0) {
1689
+ re->Decref();
1690
+ return false;
1691
+ }
1692
+ status->set_code(kRegexpBadCharRange);
1693
+ status->set_error_arg(StringPiece(s->data(), 1+n));
1694
+ re->Decref();
1695
+ return false;
1696
+ }
1697
+ first = false;
1698
+
1699
+ // Look for [:alnum:] etc.
1700
+ if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1701
+ switch (ParseCCName(s, flags_, re->ccb_, status)) {
1702
+ case kParseOk:
1703
+ continue;
1704
+ case kParseError:
1705
+ re->Decref();
1706
+ return false;
1707
+ case kParseNothing:
1708
+ break;
1709
+ }
1710
+ }
1711
+
1712
+ // Look for Unicode character group like \p{Han}
1713
+ if (s->size() > 2 &&
1714
+ (*s)[0] == '\\' &&
1715
+ ((*s)[1] == 'p' || (*s)[1] == 'P')) {
1716
+ switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1717
+ case kParseOk:
1718
+ continue;
1719
+ case kParseError:
1720
+ re->Decref();
1721
+ return false;
1722
+ case kParseNothing:
1723
+ break;
1724
+ }
1725
+ }
1726
+
1727
+ // Look for Perl character class symbols (extension).
1728
+ UGroup *g = MaybeParsePerlCCEscape(s, flags_);
1729
+ if (g != NULL) {
1730
+ AddUGroup(re->ccb_, g, g->sign, flags_);
1731
+ continue;
1732
+ }
1733
+
1734
+ // Otherwise assume single character or simple range.
1735
+ RuneRange rr;
1736
+ if (!ParseCCRange(s, &rr, whole_class, status)) {
1737
+ re->Decref();
1738
+ return false;
1739
+ }
1740
+ // AddRangeFlags is usually called in response to a class like
1741
+ // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1742
+ // Regexp::ClassNL is set. In an explicit range or singleton
1743
+ // like we just parsed, we do not filter \n out, so set ClassNL
1744
+ // in the flags.
1745
+ re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
1746
+ }
1747
+ if (s->size() == 0) {
1748
+ status->set_code(kRegexpMissingBracket);
1749
+ status->set_error_arg(whole_class);
1750
+ re->Decref();
1751
+ return false;
1752
+ }
1753
+ s->remove_prefix(1); // ']'
1754
+
1755
+ if (negated)
1756
+ re->ccb_->Negate();
1757
+ re->ccb_->RemoveAbove(rune_max_);
1758
+
1759
+ *out_re = re;
1760
+ return true;
1761
+ }
1762
+
1763
+ // Is this a valid capture name? [A-Za-z0-9_]+
1764
+ // PCRE limits names to 32 bytes.
1765
+ // Python rejects names starting with digits.
1766
+ // We don't enforce either of those.
1767
+ static bool IsValidCaptureName(const StringPiece& name) {
1768
+ if (name.size() == 0)
1769
+ return false;
1770
+ for (int i = 0; i < name.size(); i++) {
1771
+ int c = name[i];
1772
+ if (('0' <= c && c <= '9') ||
1773
+ ('a' <= c && c <= 'z') ||
1774
+ ('A' <= c && c <= 'Z') ||
1775
+ c == '_')
1776
+ continue;
1777
+ return false;
1778
+ }
1779
+ return true;
1780
+ }
1781
+
1782
+ // Parses a Perl flag setting or non-capturing group or both,
1783
+ // like (?i) or (?: or (?i:. Removes from s, updates parse state.
1784
+ // The caller must check that s begins with "(?".
1785
+ // Returns true on success. If the Perl flag is not
1786
+ // well-formed or not supported, sets status_ and returns false.
1787
+ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
1788
+ StringPiece t = *s;
1789
+
1790
+ // Caller is supposed to check this.
1791
+ if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
1792
+ LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
1793
+ status_->set_code(kRegexpInternalError);
1794
+ return false;
1795
+ }
1796
+
1797
+ t.remove_prefix(2); // "(?"
1798
+
1799
+ // Check for named captures, first introduced in Python's regexp library.
1800
+ // As usual, there are three slightly different syntaxes:
1801
+ //
1802
+ // (?P<name>expr) the original, introduced by Python
1803
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
1804
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
1805
+ //
1806
+ // Perl 5.10 gave in and implemented the Python version too,
1807
+ // but they claim that the last two are the preferred forms.
1808
+ // PCRE and languages based on it (specifically, PHP and Ruby)
1809
+ // support all three as well. EcmaScript 4 uses only the Python form.
1810
+ //
1811
+ // In both the open source world (via Code Search) and the
1812
+ // Google source tree, (?P<expr>name) is the dominant form,
1813
+ // so that's the one we implement. One is enough.
1814
+ if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
1815
+ // Pull out name.
1816
+ int end = t.find('>', 2);
1817
+ if (end == t.npos) {
1818
+ if (!IsValidUTF8(*s, status_))
1819
+ return false;
1820
+ status_->set_code(kRegexpBadNamedCapture);
1821
+ status_->set_error_arg(*s);
1822
+ return false;
1823
+ }
1824
+
1825
+ // t is "P<name>...", t[end] == '>'
1826
+ StringPiece capture(t.begin()-2, end+3); // "(?P<name>"
1827
+ StringPiece name(t.begin()+2, end-2); // "name"
1828
+ if (!IsValidUTF8(name, status_))
1829
+ return false;
1830
+ if (!IsValidCaptureName(name)) {
1831
+ status_->set_code(kRegexpBadNamedCapture);
1832
+ status_->set_error_arg(capture);
1833
+ return false;
1834
+ }
1835
+
1836
+ if (!DoLeftParen(name)) {
1837
+ // DoLeftParen's failure set status_.
1838
+ return false;
1839
+ }
1840
+
1841
+ s->remove_prefix(capture.end() - s->begin());
1842
+ return true;
1843
+ }
1844
+
1845
+ bool negated = false;
1846
+ bool sawflags = false;
1847
+ int nflags = flags_;
1848
+ Rune c;
1849
+ for (bool done = false; !done; ) {
1850
+ if (t.size() == 0)
1851
+ goto BadPerlOp;
1852
+ if (StringPieceToRune(&c, &t, status_) < 0)
1853
+ return false;
1854
+ switch (c) {
1855
+ default:
1856
+ goto BadPerlOp;
1857
+
1858
+ // Parse flags.
1859
+ case 'i':
1860
+ sawflags = true;
1861
+ if (negated)
1862
+ nflags &= ~FoldCase;
1863
+ else
1864
+ nflags |= FoldCase;
1865
+ break;
1866
+
1867
+ case 'm': // opposite of our OneLine
1868
+ sawflags = true;
1869
+ if (negated)
1870
+ nflags |= OneLine;
1871
+ else
1872
+ nflags &= ~OneLine;
1873
+ break;
1874
+
1875
+ case 's':
1876
+ sawflags = true;
1877
+ if (negated)
1878
+ nflags &= ~DotNL;
1879
+ else
1880
+ nflags |= DotNL;
1881
+ break;
1882
+
1883
+ case 'U':
1884
+ sawflags = true;
1885
+ if (negated)
1886
+ nflags &= ~NonGreedy;
1887
+ else
1888
+ nflags |= NonGreedy;
1889
+ break;
1890
+
1891
+ // Negation
1892
+ case '-':
1893
+ if (negated)
1894
+ goto BadPerlOp;
1895
+ negated = true;
1896
+ sawflags = false;
1897
+ break;
1898
+
1899
+ // Open new group.
1900
+ case ':':
1901
+ if (!DoLeftParenNoCapture()) {
1902
+ // DoLeftParenNoCapture's failure set status_.
1903
+ return false;
1904
+ }
1905
+ done = true;
1906
+ break;
1907
+
1908
+ // Finish flags.
1909
+ case ')':
1910
+ done = true;
1911
+ break;
1912
+ }
1913
+ }
1914
+
1915
+ if (negated && !sawflags)
1916
+ goto BadPerlOp;
1917
+
1918
+ flags_ = static_cast<Regexp::ParseFlags>(nflags);
1919
+ *s = t;
1920
+ return true;
1921
+
1922
+ BadPerlOp:
1923
+ status_->set_code(kRegexpBadPerlOp);
1924
+ status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin()));
1925
+ return false;
1926
+ }
1927
+
1928
+ // Converts latin1 (assumed to be encoded as Latin1 bytes)
1929
+ // into UTF8 encoding in string.
1930
+ // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
1931
+ // deprecated and because it rejects code points 0x80-0x9F.
1932
+ void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) {
1933
+ char buf[UTFmax];
1934
+
1935
+ utf->clear();
1936
+ for (int i = 0; i < latin1.size(); i++) {
1937
+ Rune r = latin1[i] & 0xFF;
1938
+ int n = runetochar(buf, &r);
1939
+ utf->append(buf, n);
1940
+ }
1941
+ }
1942
+
1943
+ // Parses the regular expression given by s,
1944
+ // returning the corresponding Regexp tree.
1945
+ // The caller must Decref the return value when done with it.
1946
+ // Returns NULL on error.
1947
+ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
1948
+ RegexpStatus* status) {
1949
+ // Make status non-NULL (easier on everyone else).
1950
+ RegexpStatus xstatus;
1951
+ if (status == NULL)
1952
+ status = &xstatus;
1953
+
1954
+ ParseState ps(global_flags, s, status);
1955
+ StringPiece t = s;
1956
+
1957
+ // Convert regexp to UTF-8 (easier on the rest of the parser).
1958
+ if (global_flags & Latin1) {
1959
+ string* tmp = new string;
1960
+ ConvertLatin1ToUTF8(t, tmp);
1961
+ status->set_tmp(tmp);
1962
+ t = *tmp;
1963
+ }
1964
+
1965
+ if (global_flags & Literal) {
1966
+ // Special parse loop for literal string.
1967
+ while (t.size() > 0) {
1968
+ Rune r;
1969
+ if (StringPieceToRune(&r, &t, status) < 0)
1970
+ return NULL;
1971
+ if (!ps.PushLiteral(r))
1972
+ return NULL;
1973
+ }
1974
+ return ps.DoFinish();
1975
+ }
1976
+
1977
+ StringPiece lastunary = NULL;
1978
+ while (t.size() > 0) {
1979
+ StringPiece isunary = NULL;
1980
+ switch (t[0]) {
1981
+ default: {
1982
+ Rune r;
1983
+ if (StringPieceToRune(&r, &t, status) < 0)
1984
+ return NULL;
1985
+ if (!ps.PushLiteral(r))
1986
+ return NULL;
1987
+ break;
1988
+ }
1989
+
1990
+ case '(':
1991
+ // "(?" introduces Perl escape.
1992
+ if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
1993
+ // Flag changes and non-capturing groups.
1994
+ if (!ps.ParsePerlFlags(&t))
1995
+ return NULL;
1996
+ break;
1997
+ }
1998
+ if (!ps.DoLeftParen(NULL))
1999
+ return NULL;
2000
+ t.remove_prefix(1); // '('
2001
+ break;
2002
+
2003
+ case '|':
2004
+ if (!ps.DoVerticalBar())
2005
+ return NULL;
2006
+ t.remove_prefix(1); // '|'
2007
+ break;
2008
+
2009
+ case ')':
2010
+ if (!ps.DoRightParen())
2011
+ return NULL;
2012
+ t.remove_prefix(1); // ')'
2013
+ break;
2014
+
2015
+ case '^': // Beginning of line.
2016
+ if (!ps.PushCarat())
2017
+ return NULL;
2018
+ t.remove_prefix(1); // '^'
2019
+ break;
2020
+
2021
+ case '$': // End of line.
2022
+ if (!ps.PushDollar())
2023
+ return NULL;
2024
+ t.remove_prefix(1); // '$'
2025
+ break;
2026
+
2027
+ case '.': // Any character (possibly except newline).
2028
+ if (!ps.PushDot())
2029
+ return NULL;
2030
+ t.remove_prefix(1); // '.'
2031
+ break;
2032
+
2033
+ case '[': { // Character class.
2034
+ Regexp* re;
2035
+ if (!ps.ParseCharClass(&t, &re, status))
2036
+ return NULL;
2037
+ if (!ps.PushRegexp(re))
2038
+ return NULL;
2039
+ break;
2040
+ }
2041
+
2042
+ case '*': { // Zero or more.
2043
+ RegexpOp op;
2044
+ op = kRegexpStar;
2045
+ goto Rep;
2046
+ case '+': // One or more.
2047
+ op = kRegexpPlus;
2048
+ goto Rep;
2049
+ case '?': // Zero or one.
2050
+ op = kRegexpQuest;
2051
+ goto Rep;
2052
+ Rep:
2053
+ StringPiece opstr = t;
2054
+ bool nongreedy = false;
2055
+ t.remove_prefix(1); // '*' or '+' or '?'
2056
+ if (ps.flags() & PerlX) {
2057
+ if (t.size() > 0 && t[0] == '?') {
2058
+ nongreedy = true;
2059
+ t.remove_prefix(1); // '?'
2060
+ }
2061
+ if (lastunary.size() > 0) {
2062
+ // In Perl it is not allowed to stack repetition operators:
2063
+ // a** is a syntax error, not a double-star.
2064
+ // (and a++ means something else entirely, which we don't support!)
2065
+ status->set_code(kRegexpRepeatOp);
2066
+ status->set_error_arg(StringPiece(lastunary.begin(),
2067
+ t.begin() - lastunary.begin()));
2068
+ return NULL;
2069
+ }
2070
+ }
2071
+ opstr.set(opstr.data(), t.data() - opstr.data());
2072
+ if (!ps.PushRepeatOp(op, opstr, nongreedy))
2073
+ return NULL;
2074
+ isunary = opstr;
2075
+ break;
2076
+ }
2077
+
2078
+ case '{': { // Counted repetition.
2079
+ int lo, hi;
2080
+ StringPiece opstr = t;
2081
+ if (!MaybeParseRepetition(&t, &lo, &hi)) {
2082
+ // Treat like a literal.
2083
+ if (!ps.PushLiteral('{'))
2084
+ return NULL;
2085
+ t.remove_prefix(1); // '{'
2086
+ break;
2087
+ }
2088
+ bool nongreedy = false;
2089
+ if (ps.flags() & PerlX) {
2090
+ if (t.size() > 0 && t[0] == '?') {
2091
+ nongreedy = true;
2092
+ t.remove_prefix(1); // '?'
2093
+ }
2094
+ if (lastunary.size() > 0) {
2095
+ // Not allowed to stack repetition operators.
2096
+ status->set_code(kRegexpRepeatOp);
2097
+ status->set_error_arg(StringPiece(lastunary.begin(),
2098
+ t.begin() - lastunary.begin()));
2099
+ return NULL;
2100
+ }
2101
+ }
2102
+ opstr.set(opstr.data(), t.data() - opstr.data());
2103
+ if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2104
+ return NULL;
2105
+ isunary = opstr;
2106
+ break;
2107
+ }
2108
+
2109
+ case '\\': { // Escaped character or Perl sequence.
2110
+ // \b and \B: word boundary or not
2111
+ if ((ps.flags() & Regexp::PerlB) &&
2112
+ t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
2113
+ if (!ps.PushWordBoundary(t[1] == 'b'))
2114
+ return NULL;
2115
+ t.remove_prefix(2); // '\\', 'b'
2116
+ break;
2117
+ }
2118
+
2119
+ if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
2120
+ if (t[1] == 'A') {
2121
+ if (!ps.PushSimpleOp(kRegexpBeginText))
2122
+ return NULL;
2123
+ t.remove_prefix(2); // '\\', 'A'
2124
+ break;
2125
+ }
2126
+ if (t[1] == 'z') {
2127
+ if (!ps.PushSimpleOp(kRegexpEndText))
2128
+ return NULL;
2129
+ t.remove_prefix(2); // '\\', 'z'
2130
+ break;
2131
+ }
2132
+ // Do not recognize \Z, because this library can't
2133
+ // implement the exact Perl/PCRE semantics.
2134
+ // (This library treats "(?-m)$" as \z, even though
2135
+ // in Perl and PCRE it is equivalent to \Z.)
2136
+
2137
+ if (t[1] == 'C') { // \C: any byte [sic]
2138
+ if (!ps.PushSimpleOp(kRegexpAnyByte))
2139
+ return NULL;
2140
+ t.remove_prefix(2); // '\\', 'C'
2141
+ break;
2142
+ }
2143
+
2144
+ if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
2145
+ t.remove_prefix(2); // '\\', 'Q'
2146
+ while (t.size() > 0) {
2147
+ if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
2148
+ t.remove_prefix(2); // '\\', 'E'
2149
+ break;
2150
+ }
2151
+ Rune r;
2152
+ if (StringPieceToRune(&r, &t, status) < 0)
2153
+ return NULL;
2154
+ if (!ps.PushLiteral(r))
2155
+ return NULL;
2156
+ }
2157
+ break;
2158
+ }
2159
+ }
2160
+
2161
+ if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
2162
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2163
+ re->ccb_ = new CharClassBuilder;
2164
+ switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2165
+ case kParseOk:
2166
+ if (!ps.PushRegexp(re))
2167
+ return NULL;
2168
+ goto Break2;
2169
+ case kParseError:
2170
+ re->Decref();
2171
+ return NULL;
2172
+ case kParseNothing:
2173
+ re->Decref();
2174
+ break;
2175
+ }
2176
+ }
2177
+
2178
+ UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
2179
+ if (g != NULL) {
2180
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2181
+ re->ccb_ = new CharClassBuilder;
2182
+ AddUGroup(re->ccb_, g, g->sign, ps.flags());
2183
+ if (!ps.PushRegexp(re))
2184
+ return NULL;
2185
+ break;
2186
+ }
2187
+
2188
+ Rune r;
2189
+ if (!ParseEscape(&t, &r, status, ps.rune_max()))
2190
+ return NULL;
2191
+ if (!ps.PushLiteral(r))
2192
+ return NULL;
2193
+ break;
2194
+ }
2195
+ }
2196
+ Break2:
2197
+ lastunary = isunary;
2198
+ }
2199
+ return ps.DoFinish();
2200
+ }
2201
+
2202
+ } // namespace re2