chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/regexp.cc ADDED
@@ -0,0 +1,920 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression representation.
6
+ // Tested by parse_test.cc
7
+
8
+ #include "util/util.h"
9
+ #include "re2/regexp.h"
10
+ #include "re2/stringpiece.h"
11
+ #include "re2/walker-inl.h"
12
+
13
+ namespace re2 {
14
+
15
+ // Constructor. Allocates vectors as appropriate for operator.
16
+ Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17
+ : op_(op),
18
+ simple_(false),
19
+ parse_flags_(static_cast<uint16>(parse_flags)),
20
+ ref_(1),
21
+ nsub_(0),
22
+ down_(NULL) {
23
+ subone_ = NULL;
24
+ memset(the_union_, 0, sizeof the_union_);
25
+ }
26
+
27
+ // Destructor. Assumes already cleaned up children.
28
+ // Private: use Decref() instead of delete to destroy Regexps.
29
+ // Can't call Decref on the sub-Regexps here because
30
+ // that could cause arbitrarily deep recursion, so
31
+ // required Decref() to have handled them for us.
32
+ Regexp::~Regexp() {
33
+ if (nsub_ > 0)
34
+ LOG(DFATAL) << "Regexp not destroyed.";
35
+
36
+ switch (op_) {
37
+ default:
38
+ break;
39
+ case kRegexpCapture:
40
+ delete name_;
41
+ break;
42
+ case kRegexpLiteralString:
43
+ delete[] runes_;
44
+ break;
45
+ case kRegexpCharClass:
46
+ cc_->Delete();
47
+ delete ccb_;
48
+ break;
49
+ }
50
+ }
51
+
52
+ // If it's possible to destroy this regexp without recurring,
53
+ // do so and return true. Else return false.
54
+ bool Regexp::QuickDestroy() {
55
+ if (nsub_ == 0) {
56
+ delete this;
57
+ return true;
58
+ }
59
+ return false;
60
+ }
61
+
62
+ static map<Regexp*, int> ref_map;
63
+ static Mutex ref_mutex;
64
+
65
+ int Regexp::Ref() {
66
+ if (ref_ < kMaxRef)
67
+ return ref_;
68
+
69
+ MutexLock l(&ref_mutex);
70
+ return ref_map[this];
71
+ }
72
+
73
+ // Increments reference count, returns object as convenience.
74
+ Regexp* Regexp::Incref() {
75
+ if (ref_ >= kMaxRef-1) {
76
+ // Store ref count in overflow map.
77
+ MutexLock l(&ref_mutex);
78
+ if (ref_ == kMaxRef) { // already overflowed
79
+ ref_map[this]++;
80
+ return this;
81
+ }
82
+ // overflowing now
83
+ ref_map[this] = kMaxRef;
84
+ ref_ = kMaxRef;
85
+ return this;
86
+ }
87
+
88
+ ref_++;
89
+ return this;
90
+ }
91
+
92
+ // Decrements reference count and deletes this object if count reaches 0.
93
+ void Regexp::Decref() {
94
+ if (ref_ == kMaxRef) {
95
+ // Ref count is stored in overflow map.
96
+ MutexLock l(&ref_mutex);
97
+ int r = ref_map[this] - 1;
98
+ if (r < kMaxRef) {
99
+ ref_ = r;
100
+ ref_map.erase(this);
101
+ } else {
102
+ ref_map[this] = r;
103
+ }
104
+ return;
105
+ }
106
+ ref_--;
107
+ if (ref_ == 0)
108
+ Destroy();
109
+ }
110
+
111
+ // Deletes this object; ref count has count reached 0.
112
+ void Regexp::Destroy() {
113
+ if (QuickDestroy())
114
+ return;
115
+
116
+ // Handle recursive Destroy with explicit stack
117
+ // to avoid arbitrarily deep recursion on process stack [sigh].
118
+ down_ = NULL;
119
+ Regexp* stack = this;
120
+ while (stack != NULL) {
121
+ Regexp* re = stack;
122
+ stack = re->down_;
123
+ if (re->ref_ != 0)
124
+ LOG(DFATAL) << "Bad reference count " << re->ref_;
125
+ if (re->nsub_ > 0) {
126
+ Regexp** subs = re->sub();
127
+ for (int i = 0; i < re->nsub_; i++) {
128
+ Regexp* sub = subs[i];
129
+ if (sub == NULL)
130
+ continue;
131
+ if (sub->ref_ == kMaxRef)
132
+ sub->Decref();
133
+ else
134
+ --sub->ref_;
135
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
136
+ sub->down_ = stack;
137
+ stack = sub;
138
+ }
139
+ }
140
+ if (re->nsub_ > 1)
141
+ delete[] subs;
142
+ re->nsub_ = 0;
143
+ }
144
+ delete re;
145
+ }
146
+ }
147
+
148
+ void Regexp::AddRuneToString(Rune r) {
149
+ DCHECK(op_ == kRegexpLiteralString);
150
+ if (nrunes_ == 0) {
151
+ // start with 8
152
+ runes_ = new Rune[8];
153
+ } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
154
+ // double on powers of two
155
+ Rune *old = runes_;
156
+ runes_ = new Rune[nrunes_ * 2];
157
+ for (int i = 0; i < nrunes_; i++)
158
+ runes_[i] = old[i];
159
+ delete[] old;
160
+ }
161
+
162
+ runes_[nrunes_++] = r;
163
+ }
164
+
165
+ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
166
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
167
+ re->match_id_ = match_id;
168
+ return re;
169
+ }
170
+
171
+ Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
172
+ if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
173
+ return sub;
174
+ Regexp* re = new Regexp(kRegexpPlus, flags);
175
+ re->AllocSub(1);
176
+ re->sub()[0] = sub;
177
+ return re;
178
+ }
179
+
180
+ Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
181
+ if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
182
+ return sub;
183
+ Regexp* re = new Regexp(kRegexpStar, flags);
184
+ re->AllocSub(1);
185
+ re->sub()[0] = sub;
186
+ return re;
187
+ }
188
+
189
+ Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
190
+ if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
191
+ return sub;
192
+ Regexp* re = new Regexp(kRegexpQuest, flags);
193
+ re->AllocSub(1);
194
+ re->sub()[0] = sub;
195
+ return re;
196
+ }
197
+
198
+ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
199
+ ParseFlags flags, bool can_factor) {
200
+ if (nsub == 1)
201
+ return sub[0];
202
+
203
+ Regexp** subcopy = NULL;
204
+ if (op == kRegexpAlternate && can_factor) {
205
+ // Going to edit sub; make a copy so we don't step on caller.
206
+ subcopy = new Regexp*[nsub];
207
+ memmove(subcopy, sub, nsub * sizeof sub[0]);
208
+ sub = subcopy;
209
+ nsub = FactorAlternation(sub, nsub, flags);
210
+ if (nsub == 1) {
211
+ Regexp* re = sub[0];
212
+ delete[] subcopy;
213
+ return re;
214
+ }
215
+ }
216
+
217
+ if (nsub > kMaxNsub) {
218
+ // Too many subexpressions to fit in a single Regexp.
219
+ // Make a two-level tree. Two levels gets us to 65535^2.
220
+ int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
221
+ Regexp* re = new Regexp(op, flags);
222
+ re->AllocSub(nbigsub);
223
+ Regexp** subs = re->sub();
224
+ for (int i = 0; i < nbigsub - 1; i++)
225
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
226
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
227
+ nsub - (nbigsub-1)*kMaxNsub, flags,
228
+ false);
229
+ delete[] subcopy;
230
+ return re;
231
+ }
232
+
233
+ Regexp* re = new Regexp(op, flags);
234
+ re->AllocSub(nsub);
235
+ Regexp** subs = re->sub();
236
+ for (int i = 0; i < nsub; i++)
237
+ subs[i] = sub[i];
238
+
239
+ delete[] subcopy;
240
+ return re;
241
+ }
242
+
243
+ Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
244
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
245
+ }
246
+
247
+ Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
248
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
249
+ }
250
+
251
+ Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
252
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
253
+ }
254
+
255
+ Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
256
+ Regexp* re = new Regexp(kRegexpCapture, flags);
257
+ re->AllocSub(1);
258
+ re->sub()[0] = sub;
259
+ re->cap_ = cap;
260
+ return re;
261
+ }
262
+
263
+ Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
264
+ Regexp* re = new Regexp(kRegexpRepeat, flags);
265
+ re->AllocSub(1);
266
+ re->sub()[0] = sub;
267
+ re->min_ = min;
268
+ re->max_ = max;
269
+ return re;
270
+ }
271
+
272
+ Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
273
+ Regexp* re = new Regexp(kRegexpLiteral, flags);
274
+ re->rune_ = rune;
275
+ return re;
276
+ }
277
+
278
+ Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
279
+ if (nrunes <= 0)
280
+ return new Regexp(kRegexpEmptyMatch, flags);
281
+ if (nrunes == 1)
282
+ return NewLiteral(runes[0], flags);
283
+ Regexp* re = new Regexp(kRegexpLiteralString, flags);
284
+ for (int i = 0; i < nrunes; i++)
285
+ re->AddRuneToString(runes[i]);
286
+ return re;
287
+ }
288
+
289
+ Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
290
+ Regexp* re = new Regexp(kRegexpCharClass, flags);
291
+ re->cc_ = cc;
292
+ return re;
293
+ }
294
+
295
+ // Swaps this and that in place.
296
+ void Regexp::Swap(Regexp* that) {
297
+ // Can use memmove because Regexp is just a struct (no vtable).
298
+ char tmp[sizeof *this];
299
+ memmove(tmp, this, sizeof tmp);
300
+ memmove(this, that, sizeof tmp);
301
+ memmove(that, tmp, sizeof tmp);
302
+ }
303
+
304
+ // Tests equality of all top-level structure but not subregexps.
305
+ static bool TopEqual(Regexp* a, Regexp* b) {
306
+ if (a->op() != b->op())
307
+ return false;
308
+
309
+ switch (a->op()) {
310
+ case kRegexpNoMatch:
311
+ case kRegexpEmptyMatch:
312
+ case kRegexpAnyChar:
313
+ case kRegexpAnyByte:
314
+ case kRegexpBeginLine:
315
+ case kRegexpEndLine:
316
+ case kRegexpWordBoundary:
317
+ case kRegexpNoWordBoundary:
318
+ case kRegexpBeginText:
319
+ return true;
320
+
321
+ case kRegexpEndText:
322
+ // The parse flags remember whether it's \z or (?-m:$),
323
+ // which matters when testing against PCRE.
324
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
325
+
326
+ case kRegexpLiteral:
327
+ return a->rune() == b->rune() &&
328
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
329
+
330
+ case kRegexpLiteralString:
331
+ return a->nrunes() == b->nrunes() &&
332
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
333
+ memcmp(a->runes(), b->runes(),
334
+ a->nrunes() * sizeof a->runes()[0]) == 0;
335
+
336
+ case kRegexpAlternate:
337
+ case kRegexpConcat:
338
+ return a->nsub() == b->nsub();
339
+
340
+ case kRegexpStar:
341
+ case kRegexpPlus:
342
+ case kRegexpQuest:
343
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
344
+
345
+ case kRegexpRepeat:
346
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
347
+ a->min() == b->min() &&
348
+ a->max() == b->max();
349
+
350
+ case kRegexpCapture:
351
+ return a->cap() == b->cap() && a->name() == b->name();
352
+
353
+ case kRegexpHaveMatch:
354
+ return a->match_id() == b->match_id();
355
+
356
+ case kRegexpCharClass: {
357
+ CharClass* acc = a->cc();
358
+ CharClass* bcc = b->cc();
359
+ return acc->size() == bcc->size() &&
360
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
361
+ memcmp(acc->begin(), bcc->begin(),
362
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
363
+ }
364
+ }
365
+
366
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
367
+ return 0;
368
+ }
369
+
370
+ bool Regexp::Equal(Regexp* a, Regexp* b) {
371
+ if (a == NULL || b == NULL)
372
+ return a == b;
373
+
374
+ if (!TopEqual(a, b))
375
+ return false;
376
+
377
+ // Fast path:
378
+ // return without allocating vector if there are no subregexps.
379
+ switch (a->op()) {
380
+ case kRegexpAlternate:
381
+ case kRegexpConcat:
382
+ case kRegexpStar:
383
+ case kRegexpPlus:
384
+ case kRegexpQuest:
385
+ case kRegexpRepeat:
386
+ case kRegexpCapture:
387
+ break;
388
+
389
+ default:
390
+ return true;
391
+ }
392
+
393
+ // Committed to doing real work.
394
+ // The stack (vector) has pairs of regexps waiting to
395
+ // be compared. The regexps are only equal if
396
+ // all the pairs end up being equal.
397
+ vector<Regexp*> stk;
398
+
399
+ for (;;) {
400
+ // Invariant: TopEqual(a, b) == true.
401
+ Regexp* a2;
402
+ Regexp* b2;
403
+ switch (a->op()) {
404
+ default:
405
+ break;
406
+ case kRegexpAlternate:
407
+ case kRegexpConcat:
408
+ for (int i = 0; i < a->nsub(); i++) {
409
+ a2 = a->sub()[i];
410
+ b2 = b->sub()[i];
411
+ if (!TopEqual(a2, b2))
412
+ return false;
413
+ stk.push_back(a2);
414
+ stk.push_back(b2);
415
+ }
416
+ break;
417
+
418
+ case kRegexpStar:
419
+ case kRegexpPlus:
420
+ case kRegexpQuest:
421
+ case kRegexpRepeat:
422
+ case kRegexpCapture:
423
+ a2 = a->sub()[0];
424
+ b2 = b->sub()[0];
425
+ if (!TopEqual(a2, b2))
426
+ return false;
427
+ // Really:
428
+ // stk.push_back(a2);
429
+ // stk.push_back(b2);
430
+ // break;
431
+ // but faster to assign directly and loop.
432
+ a = a2;
433
+ b = b2;
434
+ continue;
435
+ }
436
+
437
+ int n = stk.size();
438
+ if (n == 0)
439
+ break;
440
+
441
+ a = stk[n-2];
442
+ b = stk[n-1];
443
+ stk.resize(n-2);
444
+ }
445
+
446
+ return true;
447
+ }
448
+
449
+ // Keep in sync with enum RegexpStatusCode in regexp.h
450
+ static const string kErrorStrings[] = {
451
+ "no error",
452
+ "unexpected error",
453
+ "invalid escape sequence",
454
+ "invalid character class",
455
+ "invalid character class range",
456
+ "missing ]",
457
+ "missing )",
458
+ "trailing \\",
459
+ "no argument for repetition operator",
460
+ "invalid repetition size",
461
+ "bad repetition operator",
462
+ "invalid perl operator",
463
+ "invalid UTF-8",
464
+ "invalid named capture group",
465
+ };
466
+
467
+ const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
468
+ if (code < 0 || code >= arraysize(kErrorStrings))
469
+ code = kRegexpInternalError;
470
+ return kErrorStrings[code];
471
+ }
472
+
473
+ string RegexpStatus::Text() const {
474
+ if (error_arg_.empty())
475
+ return CodeText(code_);
476
+ string s;
477
+ s.append(CodeText(code_));
478
+ s.append(": ");
479
+ s.append(error_arg_.data(), error_arg_.size());
480
+ return s;
481
+ }
482
+
483
+ void RegexpStatus::Copy(const RegexpStatus& status) {
484
+ code_ = status.code_;
485
+ error_arg_ = status.error_arg_;
486
+ }
487
+
488
+ typedef int Ignored; // Walker<void> doesn't exist
489
+
490
+ // Walker subclass to count capturing parens in regexp.
491
+ class NumCapturesWalker : public Regexp::Walker<Ignored> {
492
+ public:
493
+ NumCapturesWalker() : ncapture_(0) {}
494
+ int ncapture() { return ncapture_; }
495
+
496
+ virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
497
+ if (re->op() == kRegexpCapture)
498
+ ncapture_++;
499
+ return ignored;
500
+ }
501
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
502
+ // Should never be called: we use Walk not WalkExponential.
503
+ LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
504
+ return ignored;
505
+ }
506
+
507
+ private:
508
+ int ncapture_;
509
+ DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
510
+ };
511
+
512
+ int Regexp::NumCaptures() {
513
+ NumCapturesWalker w;
514
+ w.Walk(this, 0);
515
+ return w.ncapture();
516
+ }
517
+
518
+ // Walker class to build map of named capture groups and their indices.
519
+ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
520
+ public:
521
+ NamedCapturesWalker() : map_(NULL) {}
522
+ ~NamedCapturesWalker() { delete map_; }
523
+
524
+ map<string, int>* TakeMap() {
525
+ map<string, int>* m = map_;
526
+ map_ = NULL;
527
+ return m;
528
+ }
529
+
530
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
531
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
532
+ // Allocate map once we find a name.
533
+ if (map_ == NULL)
534
+ map_ = new map<string, int>;
535
+
536
+ // Record first occurrence of each name.
537
+ // (The rule is that if you have the same name
538
+ // multiple times, only the leftmost one counts.)
539
+ if (map_->find(*re->name()) == map_->end())
540
+ (*map_)[*re->name()] = re->cap();
541
+ }
542
+ return ignored;
543
+ }
544
+
545
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546
+ // Should never be called: we use Walk not WalkExponential.
547
+ LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
548
+ return ignored;
549
+ }
550
+
551
+ private:
552
+ map<string, int>* map_;
553
+ DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
554
+ };
555
+
556
+ map<string, int>* Regexp::NamedCaptures() {
557
+ NamedCapturesWalker w;
558
+ w.Walk(this, 0);
559
+ return w.TakeMap();
560
+ }
561
+
562
+ // Walker class to build map from capture group indices to their names.
563
+ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
564
+ public:
565
+ CaptureNamesWalker() : map_(NULL) {}
566
+ ~CaptureNamesWalker() { delete map_; }
567
+
568
+ map<int, string>* TakeMap() {
569
+ map<int, string>* m = map_;
570
+ map_ = NULL;
571
+ return m;
572
+ }
573
+
574
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
575
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
576
+ // Allocate map once we find a name.
577
+ if (map_ == NULL)
578
+ map_ = new map<int, string>;
579
+
580
+ (*map_)[re->cap()] = *re->name();
581
+ }
582
+ return ignored;
583
+ }
584
+
585
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
586
+ // Should never be called: we use Walk not WalkExponential.
587
+ LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
588
+ return ignored;
589
+ }
590
+
591
+ private:
592
+ map<int, string>* map_;
593
+ DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
594
+ };
595
+
596
+ map<int, string>* Regexp::CaptureNames() {
597
+ CaptureNamesWalker w;
598
+ w.Walk(this, 0);
599
+ return w.TakeMap();
600
+ }
601
+
602
+ // Determines whether regexp matches must be anchored
603
+ // with a fixed string prefix. If so, returns the prefix and
604
+ // the regexp that remains after the prefix. The prefix might
605
+ // be ASCII case-insensitive.
606
+ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
607
+ // No need for a walker: the regexp must be of the form
608
+ // 1. some number of ^ anchors
609
+ // 2. a literal char or string
610
+ // 3. the rest
611
+ prefix->clear();
612
+ *foldcase = false;
613
+ *suffix = NULL;
614
+ if (op_ != kRegexpConcat)
615
+ return false;
616
+
617
+ // Some number of anchors, then a literal or concatenation.
618
+ int i = 0;
619
+ Regexp** sub = this->sub();
620
+ while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
621
+ i++;
622
+ if (i == 0 || i >= nsub_)
623
+ return false;
624
+
625
+ Regexp* re = sub[i];
626
+ switch (re->op_) {
627
+ default:
628
+ return false;
629
+
630
+ case kRegexpLiteralString:
631
+ // Convert to string in proper encoding.
632
+ if (re->parse_flags() & Latin1) {
633
+ prefix->resize(re->nrunes_);
634
+ for (int j = 0; j < re->nrunes_; j++)
635
+ (*prefix)[j] = re->runes_[j];
636
+ } else {
637
+ // Convert to UTF-8 in place.
638
+ // Assume worst-case space and then trim.
639
+ prefix->resize(re->nrunes_ * UTFmax);
640
+ char *p = &(*prefix)[0];
641
+ for (int j = 0; j < re->nrunes_; j++) {
642
+ Rune r = re->runes_[j];
643
+ if (r < Runeself)
644
+ *p++ = r;
645
+ else
646
+ p += runetochar(p, &r);
647
+ }
648
+ prefix->resize(p - &(*prefix)[0]);
649
+ }
650
+ break;
651
+
652
+ case kRegexpLiteral:
653
+ if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
654
+ prefix->append(1, re->rune_);
655
+ } else {
656
+ char buf[UTFmax];
657
+ prefix->append(buf, runetochar(buf, &re->rune_));
658
+ }
659
+ break;
660
+ }
661
+ *foldcase = (sub[i]->parse_flags() & FoldCase);
662
+ i++;
663
+
664
+ // The rest.
665
+ if (i < nsub_) {
666
+ for (int j = i; j < nsub_; j++)
667
+ sub[j]->Incref();
668
+ re = Concat(sub + i, nsub_ - i, parse_flags());
669
+ } else {
670
+ re = new Regexp(kRegexpEmptyMatch, parse_flags());
671
+ }
672
+ *suffix = re;
673
+ return true;
674
+ }
675
+
676
+ // Character class builder is a balanced binary tree (STL set)
677
+ // containing non-overlapping, non-abutting RuneRanges.
678
+ // The less-than operator used in the tree treats two
679
+ // ranges as equal if they overlap at all, so that
680
+ // lookups for a particular Rune are possible.
681
+
682
+ CharClassBuilder::CharClassBuilder() {
683
+ nrunes_ = 0;
684
+ upper_ = 0;
685
+ lower_ = 0;
686
+ }
687
+
688
+ // Add lo-hi to the class; return whether class got bigger.
689
+ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
690
+ if (hi < lo)
691
+ return false;
692
+
693
+ if (lo <= 'z' && hi >= 'A') {
694
+ // Overlaps some alpha, maybe not all.
695
+ // Update bitmaps telling which ASCII letters are in the set.
696
+ Rune lo1 = max<Rune>(lo, 'A');
697
+ Rune hi1 = min<Rune>(hi, 'Z');
698
+ if (lo1 <= hi1)
699
+ upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
700
+
701
+ lo1 = max<Rune>(lo, 'a');
702
+ hi1 = min<Rune>(hi, 'z');
703
+ if (lo1 <= hi1)
704
+ lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
705
+ }
706
+
707
+ { // Check whether lo, hi is already in the class.
708
+ iterator it = ranges_.find(RuneRange(lo, lo));
709
+ if (it != end() && it->lo <= lo && hi <= it->hi)
710
+ return false;
711
+ }
712
+
713
+ // Look for a range abutting lo on the left.
714
+ // If it exists, take it out and increase our range.
715
+ if (lo > 0) {
716
+ iterator it = ranges_.find(RuneRange(lo-1, lo-1));
717
+ if (it != end()) {
718
+ lo = it->lo;
719
+ if (it->hi > hi)
720
+ hi = it->hi;
721
+ nrunes_ -= it->hi - it->lo + 1;
722
+ ranges_.erase(it);
723
+ }
724
+ }
725
+
726
+ // Look for a range abutting hi on the right.
727
+ // If it exists, take it out and increase our range.
728
+ if (hi < Runemax) {
729
+ iterator it = ranges_.find(RuneRange(hi+1, hi+1));
730
+ if (it != end()) {
731
+ hi = it->hi;
732
+ nrunes_ -= it->hi - it->lo + 1;
733
+ ranges_.erase(it);
734
+ }
735
+ }
736
+
737
+ // Look for ranges between lo and hi. Take them out.
738
+ // This is only safe because the set has no overlapping ranges.
739
+ // We've already removed any ranges abutting lo and hi, so
740
+ // any that overlap [lo, hi] must be contained within it.
741
+ for (;;) {
742
+ iterator it = ranges_.find(RuneRange(lo, hi));
743
+ if (it == end())
744
+ break;
745
+ nrunes_ -= it->hi - it->lo + 1;
746
+ ranges_.erase(it);
747
+ }
748
+
749
+ // Finally, add [lo, hi].
750
+ nrunes_ += hi - lo + 1;
751
+ ranges_.insert(RuneRange(lo, hi));
752
+ return true;
753
+ }
754
+
755
+ void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
756
+ for (iterator it = cc->begin(); it != cc->end(); ++it)
757
+ AddRange(it->lo, it->hi);
758
+ }
759
+
760
+ bool CharClassBuilder::Contains(Rune r) {
761
+ return ranges_.find(RuneRange(r, r)) != end();
762
+ }
763
+
764
+ // Does the character class behave the same on A-Z as on a-z?
765
+ bool CharClassBuilder::FoldsASCII() {
766
+ return ((upper_ ^ lower_) & AlphaMask) == 0;
767
+ }
768
+
769
+ CharClassBuilder* CharClassBuilder::Copy() {
770
+ CharClassBuilder* cc = new CharClassBuilder;
771
+ for (iterator it = begin(); it != end(); ++it)
772
+ cc->ranges_.insert(RuneRange(it->lo, it->hi));
773
+ cc->upper_ = upper_;
774
+ cc->lower_ = lower_;
775
+ cc->nrunes_ = nrunes_;
776
+ return cc;
777
+ }
778
+
779
+
780
+
781
+ void CharClassBuilder::RemoveAbove(Rune r) {
782
+ if (r >= Runemax)
783
+ return;
784
+
785
+ if (r < 'z') {
786
+ if (r < 'a')
787
+ lower_ = 0;
788
+ else
789
+ lower_ &= AlphaMask >> ('z' - r);
790
+ }
791
+
792
+ if (r < 'Z') {
793
+ if (r < 'A')
794
+ upper_ = 0;
795
+ else
796
+ upper_ &= AlphaMask >> ('Z' - r);
797
+ }
798
+
799
+ for (;;) {
800
+
801
+ iterator it = ranges_.find(RuneRange(r + 1, Runemax));
802
+ if (it == end())
803
+ break;
804
+ RuneRange rr = *it;
805
+ ranges_.erase(it);
806
+ nrunes_ -= rr.hi - rr.lo + 1;
807
+ if (rr.lo <= r) {
808
+ rr.hi = r;
809
+ ranges_.insert(rr);
810
+ nrunes_ += rr.hi - rr.lo + 1;
811
+ }
812
+ }
813
+ }
814
+
815
+ void CharClassBuilder::Negate() {
816
+ // Build up negation and then copy in.
817
+ // Could edit ranges in place, but C++ won't let me.
818
+ vector<RuneRange> v;
819
+ v.reserve(ranges_.size() + 1);
820
+
821
+ // In negation, first range begins at 0, unless
822
+ // the current class begins at 0.
823
+ iterator it = begin();
824
+ if (it == end()) {
825
+ v.push_back(RuneRange(0, Runemax));
826
+ } else {
827
+ int nextlo = 0;
828
+ if (it->lo == 0) {
829
+ nextlo = it->hi + 1;
830
+ ++it;
831
+ }
832
+ for (; it != end(); ++it) {
833
+ v.push_back(RuneRange(nextlo, it->lo - 1));
834
+ nextlo = it->hi + 1;
835
+ }
836
+ if (nextlo <= Runemax)
837
+ v.push_back(RuneRange(nextlo, Runemax));
838
+ }
839
+
840
+ ranges_.clear();
841
+ for (int i = 0; i < v.size(); i++)
842
+ ranges_.insert(v[i]);
843
+
844
+ upper_ = AlphaMask & ~upper_;
845
+ lower_ = AlphaMask & ~lower_;
846
+ nrunes_ = Runemax+1 - nrunes_;
847
+ }
848
+
849
+ // Character class is a sorted list of ranges.
850
+ // The ranges are allocated in the same block as the header,
851
+ // necessitating a special allocator and Delete method.
852
+
853
+ CharClass* CharClass::New(int maxranges) {
854
+ CharClass* cc;
855
+ uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
856
+ cc = reinterpret_cast<CharClass*>(data);
857
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
858
+ cc->nranges_ = 0;
859
+ cc->folds_ascii_ = false;
860
+ cc->nrunes_ = 0;
861
+ return cc;
862
+ }
863
+
864
+ void CharClass::Delete() {
865
+ if (this == NULL)
866
+ return;
867
+ uint8 *data = reinterpret_cast<uint8*>(this);
868
+ delete[] data;
869
+ }
870
+
871
+ CharClass* CharClass::Negate() {
872
+ CharClass* cc = CharClass::New(nranges_+1);
873
+ cc->folds_ascii_ = folds_ascii_;
874
+ cc->nrunes_ = Runemax + 1 - nrunes_;
875
+ int n = 0;
876
+ int nextlo = 0;
877
+ for (CharClass::iterator it = begin(); it != end(); ++it) {
878
+ if (it->lo == nextlo) {
879
+ nextlo = it->hi + 1;
880
+ } else {
881
+ cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
882
+ nextlo = it->hi + 1;
883
+ }
884
+ }
885
+ if (nextlo <= Runemax)
886
+ cc->ranges_[n++] = RuneRange(nextlo, Runemax);
887
+ cc->nranges_ = n;
888
+ return cc;
889
+ }
890
+
891
+ bool CharClass::Contains(Rune r) {
892
+ RuneRange* rr = ranges_;
893
+ int n = nranges_;
894
+ while (n > 0) {
895
+ int m = n/2;
896
+ if (rr[m].hi < r) {
897
+ rr += m+1;
898
+ n -= m+1;
899
+ } else if (r < rr[m].lo) {
900
+ n = m;
901
+ } else { // rr[m].lo <= r && r <= rr[m].hi
902
+ return true;
903
+ }
904
+ }
905
+ return false;
906
+ }
907
+
908
+ CharClass* CharClassBuilder::GetCharClass() {
909
+ CharClass* cc = CharClass::New(ranges_.size());
910
+ int n = 0;
911
+ for (iterator it = begin(); it != end(); ++it)
912
+ cc->ranges_[n++] = *it;
913
+ cc->nranges_ = n;
914
+ DCHECK_LE(n, ranges_.size());
915
+ cc->nrunes_ = nrunes_;
916
+ cc->folds_ascii_ = FoldsASCII();
917
+ return cc;
918
+ }
919
+
920
+ } // namespace re2