chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/regexp.cc ADDED
@@ -0,0 +1,920 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression representation.
6
+ // Tested by parse_test.cc
7
+
8
+ #include "util/util.h"
9
+ #include "re2/regexp.h"
10
+ #include "re2/stringpiece.h"
11
+ #include "re2/walker-inl.h"
12
+
13
+ namespace re2 {
14
+
15
+ // Constructor. Allocates vectors as appropriate for operator.
16
+ Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17
+ : op_(op),
18
+ simple_(false),
19
+ parse_flags_(static_cast<uint16>(parse_flags)),
20
+ ref_(1),
21
+ nsub_(0),
22
+ down_(NULL) {
23
+ subone_ = NULL;
24
+ memset(the_union_, 0, sizeof the_union_);
25
+ }
26
+
27
+ // Destructor. Assumes already cleaned up children.
28
+ // Private: use Decref() instead of delete to destroy Regexps.
29
+ // Can't call Decref on the sub-Regexps here because
30
+ // that could cause arbitrarily deep recursion, so
31
+ // required Decref() to have handled them for us.
32
+ Regexp::~Regexp() {
33
+ if (nsub_ > 0)
34
+ LOG(DFATAL) << "Regexp not destroyed.";
35
+
36
+ switch (op_) {
37
+ default:
38
+ break;
39
+ case kRegexpCapture:
40
+ delete name_;
41
+ break;
42
+ case kRegexpLiteralString:
43
+ delete[] runes_;
44
+ break;
45
+ case kRegexpCharClass:
46
+ cc_->Delete();
47
+ delete ccb_;
48
+ break;
49
+ }
50
+ }
51
+
52
+ // If it's possible to destroy this regexp without recurring,
53
+ // do so and return true. Else return false.
54
+ bool Regexp::QuickDestroy() {
55
+ if (nsub_ == 0) {
56
+ delete this;
57
+ return true;
58
+ }
59
+ return false;
60
+ }
61
+
62
+ static map<Regexp*, int> ref_map;
63
+ static Mutex ref_mutex;
64
+
65
+ int Regexp::Ref() {
66
+ if (ref_ < kMaxRef)
67
+ return ref_;
68
+
69
+ MutexLock l(&ref_mutex);
70
+ return ref_map[this];
71
+ }
72
+
73
+ // Increments reference count, returns object as convenience.
74
+ Regexp* Regexp::Incref() {
75
+ if (ref_ >= kMaxRef-1) {
76
+ // Store ref count in overflow map.
77
+ MutexLock l(&ref_mutex);
78
+ if (ref_ == kMaxRef) { // already overflowed
79
+ ref_map[this]++;
80
+ return this;
81
+ }
82
+ // overflowing now
83
+ ref_map[this] = kMaxRef;
84
+ ref_ = kMaxRef;
85
+ return this;
86
+ }
87
+
88
+ ref_++;
89
+ return this;
90
+ }
91
+
92
+ // Decrements reference count and deletes this object if count reaches 0.
93
+ void Regexp::Decref() {
94
+ if (ref_ == kMaxRef) {
95
+ // Ref count is stored in overflow map.
96
+ MutexLock l(&ref_mutex);
97
+ int r = ref_map[this] - 1;
98
+ if (r < kMaxRef) {
99
+ ref_ = r;
100
+ ref_map.erase(this);
101
+ } else {
102
+ ref_map[this] = r;
103
+ }
104
+ return;
105
+ }
106
+ ref_--;
107
+ if (ref_ == 0)
108
+ Destroy();
109
+ }
110
+
111
+ // Deletes this object; ref count has count reached 0.
112
+ void Regexp::Destroy() {
113
+ if (QuickDestroy())
114
+ return;
115
+
116
+ // Handle recursive Destroy with explicit stack
117
+ // to avoid arbitrarily deep recursion on process stack [sigh].
118
+ down_ = NULL;
119
+ Regexp* stack = this;
120
+ while (stack != NULL) {
121
+ Regexp* re = stack;
122
+ stack = re->down_;
123
+ if (re->ref_ != 0)
124
+ LOG(DFATAL) << "Bad reference count " << re->ref_;
125
+ if (re->nsub_ > 0) {
126
+ Regexp** subs = re->sub();
127
+ for (int i = 0; i < re->nsub_; i++) {
128
+ Regexp* sub = subs[i];
129
+ if (sub == NULL)
130
+ continue;
131
+ if (sub->ref_ == kMaxRef)
132
+ sub->Decref();
133
+ else
134
+ --sub->ref_;
135
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
136
+ sub->down_ = stack;
137
+ stack = sub;
138
+ }
139
+ }
140
+ if (re->nsub_ > 1)
141
+ delete[] subs;
142
+ re->nsub_ = 0;
143
+ }
144
+ delete re;
145
+ }
146
+ }
147
+
148
+ void Regexp::AddRuneToString(Rune r) {
149
+ DCHECK(op_ == kRegexpLiteralString);
150
+ if (nrunes_ == 0) {
151
+ // start with 8
152
+ runes_ = new Rune[8];
153
+ } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
154
+ // double on powers of two
155
+ Rune *old = runes_;
156
+ runes_ = new Rune[nrunes_ * 2];
157
+ for (int i = 0; i < nrunes_; i++)
158
+ runes_[i] = old[i];
159
+ delete[] old;
160
+ }
161
+
162
+ runes_[nrunes_++] = r;
163
+ }
164
+
165
+ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
166
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
167
+ re->match_id_ = match_id;
168
+ return re;
169
+ }
170
+
171
+ Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
172
+ if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
173
+ return sub;
174
+ Regexp* re = new Regexp(kRegexpPlus, flags);
175
+ re->AllocSub(1);
176
+ re->sub()[0] = sub;
177
+ return re;
178
+ }
179
+
180
+ Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
181
+ if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
182
+ return sub;
183
+ Regexp* re = new Regexp(kRegexpStar, flags);
184
+ re->AllocSub(1);
185
+ re->sub()[0] = sub;
186
+ return re;
187
+ }
188
+
189
+ Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
190
+ if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
191
+ return sub;
192
+ Regexp* re = new Regexp(kRegexpQuest, flags);
193
+ re->AllocSub(1);
194
+ re->sub()[0] = sub;
195
+ return re;
196
+ }
197
+
198
+ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
199
+ ParseFlags flags, bool can_factor) {
200
+ if (nsub == 1)
201
+ return sub[0];
202
+
203
+ Regexp** subcopy = NULL;
204
+ if (op == kRegexpAlternate && can_factor) {
205
+ // Going to edit sub; make a copy so we don't step on caller.
206
+ subcopy = new Regexp*[nsub];
207
+ memmove(subcopy, sub, nsub * sizeof sub[0]);
208
+ sub = subcopy;
209
+ nsub = FactorAlternation(sub, nsub, flags);
210
+ if (nsub == 1) {
211
+ Regexp* re = sub[0];
212
+ delete[] subcopy;
213
+ return re;
214
+ }
215
+ }
216
+
217
+ if (nsub > kMaxNsub) {
218
+ // Too many subexpressions to fit in a single Regexp.
219
+ // Make a two-level tree. Two levels gets us to 65535^2.
220
+ int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
221
+ Regexp* re = new Regexp(op, flags);
222
+ re->AllocSub(nbigsub);
223
+ Regexp** subs = re->sub();
224
+ for (int i = 0; i < nbigsub - 1; i++)
225
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
226
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
227
+ nsub - (nbigsub-1)*kMaxNsub, flags,
228
+ false);
229
+ delete[] subcopy;
230
+ return re;
231
+ }
232
+
233
+ Regexp* re = new Regexp(op, flags);
234
+ re->AllocSub(nsub);
235
+ Regexp** subs = re->sub();
236
+ for (int i = 0; i < nsub; i++)
237
+ subs[i] = sub[i];
238
+
239
+ delete[] subcopy;
240
+ return re;
241
+ }
242
+
243
+ Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
244
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
245
+ }
246
+
247
+ Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
248
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
249
+ }
250
+
251
+ Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
252
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
253
+ }
254
+
255
+ Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
256
+ Regexp* re = new Regexp(kRegexpCapture, flags);
257
+ re->AllocSub(1);
258
+ re->sub()[0] = sub;
259
+ re->cap_ = cap;
260
+ return re;
261
+ }
262
+
263
+ Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
264
+ Regexp* re = new Regexp(kRegexpRepeat, flags);
265
+ re->AllocSub(1);
266
+ re->sub()[0] = sub;
267
+ re->min_ = min;
268
+ re->max_ = max;
269
+ return re;
270
+ }
271
+
272
+ Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
273
+ Regexp* re = new Regexp(kRegexpLiteral, flags);
274
+ re->rune_ = rune;
275
+ return re;
276
+ }
277
+
278
+ Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
279
+ if (nrunes <= 0)
280
+ return new Regexp(kRegexpEmptyMatch, flags);
281
+ if (nrunes == 1)
282
+ return NewLiteral(runes[0], flags);
283
+ Regexp* re = new Regexp(kRegexpLiteralString, flags);
284
+ for (int i = 0; i < nrunes; i++)
285
+ re->AddRuneToString(runes[i]);
286
+ return re;
287
+ }
288
+
289
+ Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
290
+ Regexp* re = new Regexp(kRegexpCharClass, flags);
291
+ re->cc_ = cc;
292
+ return re;
293
+ }
294
+
295
+ // Swaps this and that in place.
296
+ void Regexp::Swap(Regexp* that) {
297
+ // Can use memmove because Regexp is just a struct (no vtable).
298
+ char tmp[sizeof *this];
299
+ memmove(tmp, this, sizeof tmp);
300
+ memmove(this, that, sizeof tmp);
301
+ memmove(that, tmp, sizeof tmp);
302
+ }
303
+
304
+ // Tests equality of all top-level structure but not subregexps.
305
+ static bool TopEqual(Regexp* a, Regexp* b) {
306
+ if (a->op() != b->op())
307
+ return false;
308
+
309
+ switch (a->op()) {
310
+ case kRegexpNoMatch:
311
+ case kRegexpEmptyMatch:
312
+ case kRegexpAnyChar:
313
+ case kRegexpAnyByte:
314
+ case kRegexpBeginLine:
315
+ case kRegexpEndLine:
316
+ case kRegexpWordBoundary:
317
+ case kRegexpNoWordBoundary:
318
+ case kRegexpBeginText:
319
+ return true;
320
+
321
+ case kRegexpEndText:
322
+ // The parse flags remember whether it's \z or (?-m:$),
323
+ // which matters when testing against PCRE.
324
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
325
+
326
+ case kRegexpLiteral:
327
+ return a->rune() == b->rune() &&
328
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
329
+
330
+ case kRegexpLiteralString:
331
+ return a->nrunes() == b->nrunes() &&
332
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
333
+ memcmp(a->runes(), b->runes(),
334
+ a->nrunes() * sizeof a->runes()[0]) == 0;
335
+
336
+ case kRegexpAlternate:
337
+ case kRegexpConcat:
338
+ return a->nsub() == b->nsub();
339
+
340
+ case kRegexpStar:
341
+ case kRegexpPlus:
342
+ case kRegexpQuest:
343
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
344
+
345
+ case kRegexpRepeat:
346
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
347
+ a->min() == b->min() &&
348
+ a->max() == b->max();
349
+
350
+ case kRegexpCapture:
351
+ return a->cap() == b->cap() && a->name() == b->name();
352
+
353
+ case kRegexpHaveMatch:
354
+ return a->match_id() == b->match_id();
355
+
356
+ case kRegexpCharClass: {
357
+ CharClass* acc = a->cc();
358
+ CharClass* bcc = b->cc();
359
+ return acc->size() == bcc->size() &&
360
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
361
+ memcmp(acc->begin(), bcc->begin(),
362
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
363
+ }
364
+ }
365
+
366
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
367
+ return 0;
368
+ }
369
+
370
+ bool Regexp::Equal(Regexp* a, Regexp* b) {
371
+ if (a == NULL || b == NULL)
372
+ return a == b;
373
+
374
+ if (!TopEqual(a, b))
375
+ return false;
376
+
377
+ // Fast path:
378
+ // return without allocating vector if there are no subregexps.
379
+ switch (a->op()) {
380
+ case kRegexpAlternate:
381
+ case kRegexpConcat:
382
+ case kRegexpStar:
383
+ case kRegexpPlus:
384
+ case kRegexpQuest:
385
+ case kRegexpRepeat:
386
+ case kRegexpCapture:
387
+ break;
388
+
389
+ default:
390
+ return true;
391
+ }
392
+
393
+ // Committed to doing real work.
394
+ // The stack (vector) has pairs of regexps waiting to
395
+ // be compared. The regexps are only equal if
396
+ // all the pairs end up being equal.
397
+ vector<Regexp*> stk;
398
+
399
+ for (;;) {
400
+ // Invariant: TopEqual(a, b) == true.
401
+ Regexp* a2;
402
+ Regexp* b2;
403
+ switch (a->op()) {
404
+ default:
405
+ break;
406
+ case kRegexpAlternate:
407
+ case kRegexpConcat:
408
+ for (int i = 0; i < a->nsub(); i++) {
409
+ a2 = a->sub()[i];
410
+ b2 = b->sub()[i];
411
+ if (!TopEqual(a2, b2))
412
+ return false;
413
+ stk.push_back(a2);
414
+ stk.push_back(b2);
415
+ }
416
+ break;
417
+
418
+ case kRegexpStar:
419
+ case kRegexpPlus:
420
+ case kRegexpQuest:
421
+ case kRegexpRepeat:
422
+ case kRegexpCapture:
423
+ a2 = a->sub()[0];
424
+ b2 = b->sub()[0];
425
+ if (!TopEqual(a2, b2))
426
+ return false;
427
+ // Really:
428
+ // stk.push_back(a2);
429
+ // stk.push_back(b2);
430
+ // break;
431
+ // but faster to assign directly and loop.
432
+ a = a2;
433
+ b = b2;
434
+ continue;
435
+ }
436
+
437
+ int n = stk.size();
438
+ if (n == 0)
439
+ break;
440
+
441
+ a = stk[n-2];
442
+ b = stk[n-1];
443
+ stk.resize(n-2);
444
+ }
445
+
446
+ return true;
447
+ }
448
+
449
+ // Keep in sync with enum RegexpStatusCode in regexp.h
450
+ static const string kErrorStrings[] = {
451
+ "no error",
452
+ "unexpected error",
453
+ "invalid escape sequence",
454
+ "invalid character class",
455
+ "invalid character class range",
456
+ "missing ]",
457
+ "missing )",
458
+ "trailing \\",
459
+ "no argument for repetition operator",
460
+ "invalid repetition size",
461
+ "bad repetition operator",
462
+ "invalid perl operator",
463
+ "invalid UTF-8",
464
+ "invalid named capture group",
465
+ };
466
+
467
+ const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
468
+ if (code < 0 || code >= arraysize(kErrorStrings))
469
+ code = kRegexpInternalError;
470
+ return kErrorStrings[code];
471
+ }
472
+
473
+ string RegexpStatus::Text() const {
474
+ if (error_arg_.empty())
475
+ return CodeText(code_);
476
+ string s;
477
+ s.append(CodeText(code_));
478
+ s.append(": ");
479
+ s.append(error_arg_.data(), error_arg_.size());
480
+ return s;
481
+ }
482
+
483
+ void RegexpStatus::Copy(const RegexpStatus& status) {
484
+ code_ = status.code_;
485
+ error_arg_ = status.error_arg_;
486
+ }
487
+
488
+ typedef int Ignored; // Walker<void> doesn't exist
489
+
490
+ // Walker subclass to count capturing parens in regexp.
491
+ class NumCapturesWalker : public Regexp::Walker<Ignored> {
492
+ public:
493
+ NumCapturesWalker() : ncapture_(0) {}
494
+ int ncapture() { return ncapture_; }
495
+
496
+ virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
497
+ if (re->op() == kRegexpCapture)
498
+ ncapture_++;
499
+ return ignored;
500
+ }
501
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
502
+ // Should never be called: we use Walk not WalkExponential.
503
+ LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
504
+ return ignored;
505
+ }
506
+
507
+ private:
508
+ int ncapture_;
509
+ DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
510
+ };
511
+
512
+ int Regexp::NumCaptures() {
513
+ NumCapturesWalker w;
514
+ w.Walk(this, 0);
515
+ return w.ncapture();
516
+ }
517
+
518
+ // Walker class to build map of named capture groups and their indices.
519
+ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
520
+ public:
521
+ NamedCapturesWalker() : map_(NULL) {}
522
+ ~NamedCapturesWalker() { delete map_; }
523
+
524
+ map<string, int>* TakeMap() {
525
+ map<string, int>* m = map_;
526
+ map_ = NULL;
527
+ return m;
528
+ }
529
+
530
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
531
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
532
+ // Allocate map once we find a name.
533
+ if (map_ == NULL)
534
+ map_ = new map<string, int>;
535
+
536
+ // Record first occurrence of each name.
537
+ // (The rule is that if you have the same name
538
+ // multiple times, only the leftmost one counts.)
539
+ if (map_->find(*re->name()) == map_->end())
540
+ (*map_)[*re->name()] = re->cap();
541
+ }
542
+ return ignored;
543
+ }
544
+
545
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546
+ // Should never be called: we use Walk not WalkExponential.
547
+ LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
548
+ return ignored;
549
+ }
550
+
551
+ private:
552
+ map<string, int>* map_;
553
+ DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
554
+ };
555
+
556
+ map<string, int>* Regexp::NamedCaptures() {
557
+ NamedCapturesWalker w;
558
+ w.Walk(this, 0);
559
+ return w.TakeMap();
560
+ }
561
+
562
+ // Walker class to build map from capture group indices to their names.
563
+ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
564
+ public:
565
+ CaptureNamesWalker() : map_(NULL) {}
566
+ ~CaptureNamesWalker() { delete map_; }
567
+
568
+ map<int, string>* TakeMap() {
569
+ map<int, string>* m = map_;
570
+ map_ = NULL;
571
+ return m;
572
+ }
573
+
574
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
575
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
576
+ // Allocate map once we find a name.
577
+ if (map_ == NULL)
578
+ map_ = new map<int, string>;
579
+
580
+ (*map_)[re->cap()] = *re->name();
581
+ }
582
+ return ignored;
583
+ }
584
+
585
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
586
+ // Should never be called: we use Walk not WalkExponential.
587
+ LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
588
+ return ignored;
589
+ }
590
+
591
+ private:
592
+ map<int, string>* map_;
593
+ DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
594
+ };
595
+
596
+ map<int, string>* Regexp::CaptureNames() {
597
+ CaptureNamesWalker w;
598
+ w.Walk(this, 0);
599
+ return w.TakeMap();
600
+ }
601
+
602
+ // Determines whether regexp matches must be anchored
603
+ // with a fixed string prefix. If so, returns the prefix and
604
+ // the regexp that remains after the prefix. The prefix might
605
+ // be ASCII case-insensitive.
606
+ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
607
+ // No need for a walker: the regexp must be of the form
608
+ // 1. some number of ^ anchors
609
+ // 2. a literal char or string
610
+ // 3. the rest
611
+ prefix->clear();
612
+ *foldcase = false;
613
+ *suffix = NULL;
614
+ if (op_ != kRegexpConcat)
615
+ return false;
616
+
617
+ // Some number of anchors, then a literal or concatenation.
618
+ int i = 0;
619
+ Regexp** sub = this->sub();
620
+ while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
621
+ i++;
622
+ if (i == 0 || i >= nsub_)
623
+ return false;
624
+
625
+ Regexp* re = sub[i];
626
+ switch (re->op_) {
627
+ default:
628
+ return false;
629
+
630
+ case kRegexpLiteralString:
631
+ // Convert to string in proper encoding.
632
+ if (re->parse_flags() & Latin1) {
633
+ prefix->resize(re->nrunes_);
634
+ for (int j = 0; j < re->nrunes_; j++)
635
+ (*prefix)[j] = re->runes_[j];
636
+ } else {
637
+ // Convert to UTF-8 in place.
638
+ // Assume worst-case space and then trim.
639
+ prefix->resize(re->nrunes_ * UTFmax);
640
+ char *p = &(*prefix)[0];
641
+ for (int j = 0; j < re->nrunes_; j++) {
642
+ Rune r = re->runes_[j];
643
+ if (r < Runeself)
644
+ *p++ = r;
645
+ else
646
+ p += runetochar(p, &r);
647
+ }
648
+ prefix->resize(p - &(*prefix)[0]);
649
+ }
650
+ break;
651
+
652
+ case kRegexpLiteral:
653
+ if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
654
+ prefix->append(1, re->rune_);
655
+ } else {
656
+ char buf[UTFmax];
657
+ prefix->append(buf, runetochar(buf, &re->rune_));
658
+ }
659
+ break;
660
+ }
661
+ *foldcase = (sub[i]->parse_flags() & FoldCase);
662
+ i++;
663
+
664
+ // The rest.
665
+ if (i < nsub_) {
666
+ for (int j = i; j < nsub_; j++)
667
+ sub[j]->Incref();
668
+ re = Concat(sub + i, nsub_ - i, parse_flags());
669
+ } else {
670
+ re = new Regexp(kRegexpEmptyMatch, parse_flags());
671
+ }
672
+ *suffix = re;
673
+ return true;
674
+ }
675
+
676
+ // Character class builder is a balanced binary tree (STL set)
677
+ // containing non-overlapping, non-abutting RuneRanges.
678
+ // The less-than operator used in the tree treats two
679
+ // ranges as equal if they overlap at all, so that
680
+ // lookups for a particular Rune are possible.
681
+
682
+ CharClassBuilder::CharClassBuilder() {
683
+ nrunes_ = 0;
684
+ upper_ = 0;
685
+ lower_ = 0;
686
+ }
687
+
688
+ // Add lo-hi to the class; return whether class got bigger.
689
+ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
690
+ if (hi < lo)
691
+ return false;
692
+
693
+ if (lo <= 'z' && hi >= 'A') {
694
+ // Overlaps some alpha, maybe not all.
695
+ // Update bitmaps telling which ASCII letters are in the set.
696
+ Rune lo1 = max<Rune>(lo, 'A');
697
+ Rune hi1 = min<Rune>(hi, 'Z');
698
+ if (lo1 <= hi1)
699
+ upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
700
+
701
+ lo1 = max<Rune>(lo, 'a');
702
+ hi1 = min<Rune>(hi, 'z');
703
+ if (lo1 <= hi1)
704
+ lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
705
+ }
706
+
707
+ { // Check whether lo, hi is already in the class.
708
+ iterator it = ranges_.find(RuneRange(lo, lo));
709
+ if (it != end() && it->lo <= lo && hi <= it->hi)
710
+ return false;
711
+ }
712
+
713
+ // Look for a range abutting lo on the left.
714
+ // If it exists, take it out and increase our range.
715
+ if (lo > 0) {
716
+ iterator it = ranges_.find(RuneRange(lo-1, lo-1));
717
+ if (it != end()) {
718
+ lo = it->lo;
719
+ if (it->hi > hi)
720
+ hi = it->hi;
721
+ nrunes_ -= it->hi - it->lo + 1;
722
+ ranges_.erase(it);
723
+ }
724
+ }
725
+
726
+ // Look for a range abutting hi on the right.
727
+ // If it exists, take it out and increase our range.
728
+ if (hi < Runemax) {
729
+ iterator it = ranges_.find(RuneRange(hi+1, hi+1));
730
+ if (it != end()) {
731
+ hi = it->hi;
732
+ nrunes_ -= it->hi - it->lo + 1;
733
+ ranges_.erase(it);
734
+ }
735
+ }
736
+
737
+ // Look for ranges between lo and hi. Take them out.
738
+ // This is only safe because the set has no overlapping ranges.
739
+ // We've already removed any ranges abutting lo and hi, so
740
+ // any that overlap [lo, hi] must be contained within it.
741
+ for (;;) {
742
+ iterator it = ranges_.find(RuneRange(lo, hi));
743
+ if (it == end())
744
+ break;
745
+ nrunes_ -= it->hi - it->lo + 1;
746
+ ranges_.erase(it);
747
+ }
748
+
749
+ // Finally, add [lo, hi].
750
+ nrunes_ += hi - lo + 1;
751
+ ranges_.insert(RuneRange(lo, hi));
752
+ return true;
753
+ }
754
+
755
+ void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
756
+ for (iterator it = cc->begin(); it != cc->end(); ++it)
757
+ AddRange(it->lo, it->hi);
758
+ }
759
+
760
+ bool CharClassBuilder::Contains(Rune r) {
761
+ return ranges_.find(RuneRange(r, r)) != end();
762
+ }
763
+
764
+ // Does the character class behave the same on A-Z as on a-z?
765
+ bool CharClassBuilder::FoldsASCII() {
766
+ return ((upper_ ^ lower_) & AlphaMask) == 0;
767
+ }
768
+
769
+ CharClassBuilder* CharClassBuilder::Copy() {
770
+ CharClassBuilder* cc = new CharClassBuilder;
771
+ for (iterator it = begin(); it != end(); ++it)
772
+ cc->ranges_.insert(RuneRange(it->lo, it->hi));
773
+ cc->upper_ = upper_;
774
+ cc->lower_ = lower_;
775
+ cc->nrunes_ = nrunes_;
776
+ return cc;
777
+ }
778
+
779
+
780
+
781
+ void CharClassBuilder::RemoveAbove(Rune r) {
782
+ if (r >= Runemax)
783
+ return;
784
+
785
+ if (r < 'z') {
786
+ if (r < 'a')
787
+ lower_ = 0;
788
+ else
789
+ lower_ &= AlphaMask >> ('z' - r);
790
+ }
791
+
792
+ if (r < 'Z') {
793
+ if (r < 'A')
794
+ upper_ = 0;
795
+ else
796
+ upper_ &= AlphaMask >> ('Z' - r);
797
+ }
798
+
799
+ for (;;) {
800
+
801
+ iterator it = ranges_.find(RuneRange(r + 1, Runemax));
802
+ if (it == end())
803
+ break;
804
+ RuneRange rr = *it;
805
+ ranges_.erase(it);
806
+ nrunes_ -= rr.hi - rr.lo + 1;
807
+ if (rr.lo <= r) {
808
+ rr.hi = r;
809
+ ranges_.insert(rr);
810
+ nrunes_ += rr.hi - rr.lo + 1;
811
+ }
812
+ }
813
+ }
814
+
815
+ void CharClassBuilder::Negate() {
816
+ // Build up negation and then copy in.
817
+ // Could edit ranges in place, but C++ won't let me.
818
+ vector<RuneRange> v;
819
+ v.reserve(ranges_.size() + 1);
820
+
821
+ // In negation, first range begins at 0, unless
822
+ // the current class begins at 0.
823
+ iterator it = begin();
824
+ if (it == end()) {
825
+ v.push_back(RuneRange(0, Runemax));
826
+ } else {
827
+ int nextlo = 0;
828
+ if (it->lo == 0) {
829
+ nextlo = it->hi + 1;
830
+ ++it;
831
+ }
832
+ for (; it != end(); ++it) {
833
+ v.push_back(RuneRange(nextlo, it->lo - 1));
834
+ nextlo = it->hi + 1;
835
+ }
836
+ if (nextlo <= Runemax)
837
+ v.push_back(RuneRange(nextlo, Runemax));
838
+ }
839
+
840
+ ranges_.clear();
841
+ for (int i = 0; i < v.size(); i++)
842
+ ranges_.insert(v[i]);
843
+
844
+ upper_ = AlphaMask & ~upper_;
845
+ lower_ = AlphaMask & ~lower_;
846
+ nrunes_ = Runemax+1 - nrunes_;
847
+ }
848
+
849
+ // Character class is a sorted list of ranges.
850
+ // The ranges are allocated in the same block as the header,
851
+ // necessitating a special allocator and Delete method.
852
+
853
+ CharClass* CharClass::New(int maxranges) {
854
+ CharClass* cc;
855
+ uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
856
+ cc = reinterpret_cast<CharClass*>(data);
857
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
858
+ cc->nranges_ = 0;
859
+ cc->folds_ascii_ = false;
860
+ cc->nrunes_ = 0;
861
+ return cc;
862
+ }
863
+
864
+ void CharClass::Delete() {
865
+ if (this == NULL)
866
+ return;
867
+ uint8 *data = reinterpret_cast<uint8*>(this);
868
+ delete[] data;
869
+ }
870
+
871
+ CharClass* CharClass::Negate() {
872
+ CharClass* cc = CharClass::New(nranges_+1);
873
+ cc->folds_ascii_ = folds_ascii_;
874
+ cc->nrunes_ = Runemax + 1 - nrunes_;
875
+ int n = 0;
876
+ int nextlo = 0;
877
+ for (CharClass::iterator it = begin(); it != end(); ++it) {
878
+ if (it->lo == nextlo) {
879
+ nextlo = it->hi + 1;
880
+ } else {
881
+ cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
882
+ nextlo = it->hi + 1;
883
+ }
884
+ }
885
+ if (nextlo <= Runemax)
886
+ cc->ranges_[n++] = RuneRange(nextlo, Runemax);
887
+ cc->nranges_ = n;
888
+ return cc;
889
+ }
890
+
891
+ bool CharClass::Contains(Rune r) {
892
+ RuneRange* rr = ranges_;
893
+ int n = nranges_;
894
+ while (n > 0) {
895
+ int m = n/2;
896
+ if (rr[m].hi < r) {
897
+ rr += m+1;
898
+ n -= m+1;
899
+ } else if (r < rr[m].lo) {
900
+ n = m;
901
+ } else { // rr[m].lo <= r && r <= rr[m].hi
902
+ return true;
903
+ }
904
+ }
905
+ return false;
906
+ }
907
+
908
+ CharClass* CharClassBuilder::GetCharClass() {
909
+ CharClass* cc = CharClass::New(ranges_.size());
910
+ int n = 0;
911
+ for (iterator it = begin(); it != end(); ++it)
912
+ cc->ranges_[n++] = *it;
913
+ cc->nranges_ = n;
914
+ DCHECK_LE(n, ranges_.size());
915
+ cc->nrunes_ = nrunes_;
916
+ cc->folds_ascii_ = FoldsASCII();
917
+ return cc;
918
+ }
919
+
920
+ } // namespace re2