chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,671 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "util/util.h"
6
+ #include "re2/prefilter.h"
7
+ #include "re2/re2.h"
8
+ #include "re2/unicode_casefold.h"
9
+ #include "re2/walker-inl.h"
10
+
11
+ namespace re2 {
12
+
13
+ static const int Trace = false;
14
+
15
+ typedef set<string>::iterator SSIter;
16
+ typedef set<string>::const_iterator ConstSSIter;
17
+
18
+ static int alloc_id = 100000; // Used for debugging.
19
+ // Initializes a Prefilter, allocating subs_ as necessary.
20
+ Prefilter::Prefilter(Op op) {
21
+ op_ = op;
22
+ subs_ = NULL;
23
+ if (op_ == AND || op_ == OR)
24
+ subs_ = new vector<Prefilter*>;
25
+
26
+ alloc_id_ = alloc_id++;
27
+ VLOG(10) << "alloc_id: " << alloc_id_;
28
+ }
29
+
30
+ // Destroys a Prefilter.
31
+ Prefilter::~Prefilter() {
32
+ VLOG(10) << "Deleted: " << alloc_id_;
33
+ if (subs_) {
34
+ for (int i = 0; i < subs_->size(); i++)
35
+ delete (*subs_)[i];
36
+ delete subs_;
37
+ subs_ = NULL;
38
+ }
39
+ }
40
+
41
+ // Simplify if the node is an empty Or or And.
42
+ Prefilter* Prefilter::Simplify() {
43
+ if (op_ != AND && op_ != OR) {
44
+ return this;
45
+ }
46
+
47
+ // Nothing left in the AND/OR.
48
+ if (subs_->size() == 0) {
49
+ if (op_ == AND)
50
+ op_ = ALL; // AND of nothing is true
51
+ else
52
+ op_ = NONE; // OR of nothing is false
53
+
54
+ return this;
55
+ }
56
+
57
+ // Just one subnode: throw away wrapper.
58
+ if (subs_->size() == 1) {
59
+ Prefilter* a = (*subs_)[0];
60
+ subs_->clear();
61
+ delete this;
62
+ return a->Simplify();
63
+ }
64
+
65
+ return this;
66
+ }
67
+
68
+ // Combines two Prefilters together to create an "op" (AND or OR).
69
+ // The passed Prefilters will be part of the returned Prefilter or deleted.
70
+ // Does lots of work to avoid creating unnecessarily complicated structures.
71
+ Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72
+ // If a, b can be rewritten as op, do so.
73
+ a = a->Simplify();
74
+ b = b->Simplify();
75
+
76
+ // Canonicalize: a->op <= b->op.
77
+ if (a->op() > b->op()) {
78
+ Prefilter* t = a;
79
+ a = b;
80
+ b = t;
81
+ }
82
+
83
+ // Trivial cases.
84
+ // ALL AND b = b
85
+ // NONE OR b = b
86
+ // ALL OR b = ALL
87
+ // NONE AND b = NONE
88
+ // Don't need to look at b, because of canonicalization above.
89
+ // ALL and NONE are smallest opcodes.
90
+ if (a->op() == ALL || a->op() == NONE) {
91
+ if ((a->op() == ALL && op == AND) ||
92
+ (a->op() == NONE && op == OR)) {
93
+ delete a;
94
+ return b;
95
+ } else {
96
+ delete b;
97
+ return a;
98
+ }
99
+ }
100
+
101
+ // If a and b match op, merge their contents.
102
+ if (a->op() == op && b->op() == op) {
103
+ for (int i = 0; i < b->subs()->size(); i++) {
104
+ Prefilter* bb = (*b->subs())[i];
105
+ a->subs()->push_back(bb);
106
+ }
107
+ b->subs()->clear();
108
+ delete b;
109
+ return a;
110
+ }
111
+
112
+ // If a already has the same op as the op that is under construction
113
+ // add in b (similarly if b already has the same op, add in a).
114
+ if (b->op() == op) {
115
+ Prefilter* t = a;
116
+ a = b;
117
+ b = t;
118
+ }
119
+ if (a->op() == op) {
120
+ a->subs()->push_back(b);
121
+ return a;
122
+ }
123
+
124
+ // Otherwise just return the op.
125
+ Prefilter* c = new Prefilter(op);
126
+ c->subs()->push_back(a);
127
+ c->subs()->push_back(b);
128
+ return c;
129
+ }
130
+
131
+ Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132
+ return AndOr(AND, a, b);
133
+ }
134
+
135
+ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136
+ return AndOr(OR, a, b);
137
+ }
138
+
139
// Drops redundant strings from *ss.  If "ab" is in the set, then "abc" adds
// nothing: any text containing "abc" also contains "ab", so a search for
// "ab" already selects the regexp as a candidate.  Each string is compared
// only against the strings that follow it in the set's ordering, and a later
// string is erased when it contains the earlier one.
static void SimplifyStringSet(set<string> *ss) {
  for (SSIter kept = ss->begin(); kept != ss->end(); ++kept) {
    SSIter probe = kept;
    for (++probe; probe != ss->end(); ) {
      // Step past the candidate before it is possibly erased; erasing a set
      // element invalidates only iterators to that element.
      const SSIter candidate = probe++;
      const bool redundant = candidate->find(*kept) != string::npos;
      if (redundant)
        ss->erase(candidate);
    }
  }
}
159
+
160
+ Prefilter* Prefilter::OrStrings(set<string>* ss) {
161
+ SimplifyStringSet(ss);
162
+ Prefilter* or_prefilter = NULL;
163
+ if (!ss->empty()) {
164
+ or_prefilter = new Prefilter(NONE);
165
+ for (SSIter i = ss->begin(); i != ss->end(); ++i)
166
+ or_prefilter = Or(or_prefilter, FromString(*i));
167
+ }
168
+ return or_prefilter;
169
+ }
170
+
171
+ static Rune ToLowerRune(Rune r) {
172
+ if (r < Runeself) {
173
+ if ('A' <= r && r <= 'Z')
174
+ r += 'a' - 'A';
175
+ return r;
176
+ }
177
+
178
+ CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179
+ if (f == NULL || r < f->lo)
180
+ return r;
181
+ return ApplyFold(f, r);
182
+ }
183
+
184
+ Prefilter* Prefilter::FromString(const string& str) {
185
+ Prefilter* m = new Prefilter(Prefilter::ATOM);
186
+ m->atom_ = str;
187
+ return m;
188
+ }
189
+
190
+ // Information about a regexp used during computation of Prefilter.
191
+ // Can be thought of as information about the set of strings matching
192
+ // the given regular expression.
193
+ class Prefilter::Info {
194
+ public:
195
+ Info();
196
+ ~Info();
197
+
198
+ // More constructors. They delete their Info* arguments.
199
+ static Info* Alt(Info* a, Info* b);
200
+ static Info* Concat(Info* a, Info* b);
201
+ static Info* And(Info* a, Info* b);
202
+ static Info* Star(Info* a);
203
+ static Info* Plus(Info* a);
204
+ static Info* Quest(Info* a);
205
+ static Info* EmptyString();
206
+ static Info* NoMatch();
207
+ static Info* AnyChar();
208
+ static Info* CClass(CharClass* cc);
209
+ static Info* Literal(Rune r);
210
+ static Info* AnyMatch();
211
+
212
+ // Format Info as a string.
213
+ string ToString();
214
+
215
+ // Caller takes ownership of the Prefilter.
216
+ Prefilter* TakeMatch();
217
+
218
+ set<string>& exact() { return exact_; }
219
+
220
+ bool is_exact() const { return is_exact_; }
221
+
222
+ class Walker;
223
+
224
+ private:
225
+ set<string> exact_;
226
+
227
+ // When is_exact_ is true, the strings that match
228
+ // are placed in exact_. When it is no longer an exact
229
+ // set of strings that match this RE, then is_exact_
230
+ // is false and the match_ contains the required match
231
+ // criteria.
232
+ bool is_exact_;
233
+
234
+ // Accumulated Prefilter query that any
235
+ // match for this regexp is guaranteed to match.
236
+ Prefilter* match_;
237
+ };
238
+
239
+
240
+ Prefilter::Info::Info()
241
+ : is_exact_(false),
242
+ match_(NULL) {
243
+ }
244
+
245
+ Prefilter::Info::~Info() {
246
+ delete match_;
247
+ }
248
+
249
+ Prefilter* Prefilter::Info::TakeMatch() {
250
+ if (is_exact_) {
251
+ match_ = Prefilter::OrStrings(&exact_);
252
+ is_exact_ = false;
253
+ }
254
+ Prefilter* m = match_;
255
+ match_ = NULL;
256
+ return m;
257
+ }
258
+
259
+ // Format a Info in string form.
260
+ string Prefilter::Info::ToString() {
261
+ if (this == NULL) {
262
+ // Sometimes when iterating on children of a node,
263
+ // some children might have NULL Info. Adding
264
+ // the check here for NULL to take care of cases where
265
+ // the caller is not checking.
266
+ return "";
267
+ }
268
+
269
+ if (is_exact_) {
270
+ int n = 0;
271
+ string s;
272
+ for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
273
+ if (n++ > 0)
274
+ s += ",";
275
+ s += *i;
276
+ }
277
+ return s;
278
+ }
279
+
280
+ if (match_)
281
+ return match_->DebugString();
282
+
283
+ return "";
284
+ }
285
+
286
// Adds every string in src to *dst.  Strings already present in *dst are
// left as they are; src is not modified.
static void CopyIn(const set<string>& src, set<string>* dst) {
  // The range overload of insert replaces the hand-written per-element loop.
  dst->insert(src.begin(), src.end());
}
291
+
292
// Inserts into *dst the concatenation x + y for every x in a and y in b.
// Anything already in *dst is kept.
static void CrossProduct(const set<string>& a,
                         const set<string>& b,
                         set<string>* dst) {
  for (ConstSSIter prefix = a.begin(); prefix != a.end(); ++prefix) {
    for (ConstSSIter suffix = b.begin(); suffix != b.end(); ++suffix) {
      string joined(*prefix);
      joined += *suffix;
      dst->insert(joined);
    }
  }
}
301
+
302
+ // Concats a and b. Requires that both are exact sets.
303
+ // Forms an exact set that is a crossproduct of a and b.
304
+ Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
305
+ if (a == NULL)
306
+ return b;
307
+ DCHECK(a->is_exact_);
308
+ DCHECK(b && b->is_exact_);
309
+ Info *ab = new Info();
310
+
311
+ CrossProduct(a->exact_, b->exact_, &ab->exact_);
312
+ ab->is_exact_ = true;
313
+
314
+ delete a;
315
+ delete b;
316
+ return ab;
317
+ }
318
+
319
+ // Constructs an inexact Info for ab given a and b.
320
+ // Used only when a or b is not exact or when the
321
+ // exact cross product is likely to be too big.
322
+ Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
323
+ if (a == NULL)
324
+ return b;
325
+ if (b == NULL)
326
+ return a;
327
+
328
+ Info *ab = new Info();
329
+
330
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
331
+ ab->is_exact_ = false;
332
+ delete a;
333
+ delete b;
334
+ return ab;
335
+ }
336
+
337
+ // Constructs Info for a|b given a and b.
338
+ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
339
+ Info *ab = new Info();
340
+
341
+ if (a->is_exact_ && b->is_exact_) {
342
+ CopyIn(a->exact_, &ab->exact_);
343
+ CopyIn(b->exact_, &ab->exact_);
344
+ ab->is_exact_ = true;
345
+ } else {
346
+ // Either a or b has is_exact_ = false. If the other
347
+ // one has is_exact_ = true, we move it to match_ and
348
+ // then create a OR of a,b. The resulting Info has
349
+ // is_exact_ = false.
350
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
351
+ ab->is_exact_ = false;
352
+ }
353
+
354
+ delete a;
355
+ delete b;
356
+ return ab;
357
+ }
358
+
359
+ // Constructs Info for a? given a.
360
+ Prefilter::Info* Prefilter::Info::Quest(Info *a) {
361
+ Info *ab = new Info();
362
+
363
+ ab->is_exact_ = false;
364
+ ab->match_ = new Prefilter(ALL);
365
+ delete a;
366
+ return ab;
367
+ }
368
+
369
+ // Constructs Info for a* given a.
370
+ // Same as a? -- not much to do.
371
+ Prefilter::Info* Prefilter::Info::Star(Info *a) {
372
+ return Quest(a);
373
+ }
374
+
375
+ // Constructs Info for a+ given a. If a was exact set, it isn't
376
+ // anymore.
377
+ Prefilter::Info* Prefilter::Info::Plus(Info *a) {
378
+ Info *ab = new Info();
379
+
380
+ ab->match_ = a->TakeMatch();
381
+ ab->is_exact_ = false;
382
+
383
+ delete a;
384
+ return ab;
385
+ }
386
+
387
+ static string RuneToString(Rune r) {
388
+ char buf[UTFmax];
389
+ int n = runetochar(buf, &r);
390
+ return string(buf, n);
391
+ }
392
+
393
+ // Constructs Info for literal rune.
394
+ Prefilter::Info* Prefilter::Info::Literal(Rune r) {
395
+ Info* info = new Info();
396
+ info->exact_.insert(RuneToString(ToLowerRune(r)));
397
+ info->is_exact_ = true;
398
+ return info;
399
+ }
400
+
401
+ // Constructs Info for dot (any character).
402
+ Prefilter::Info* Prefilter::Info::AnyChar() {
403
+ Prefilter::Info* info = new Prefilter::Info();
404
+ info->match_ = new Prefilter(ALL);
405
+ return info;
406
+ }
407
+
408
+ // Constructs Prefilter::Info for no possible match.
409
+ Prefilter::Info* Prefilter::Info::NoMatch() {
410
+ Prefilter::Info* info = new Prefilter::Info();
411
+ info->match_ = new Prefilter(NONE);
412
+ return info;
413
+ }
414
+
415
+ // Constructs Prefilter::Info for any possible match.
416
+ // This Prefilter::Info is valid for any regular expression,
417
+ // since it makes no assertions whatsoever about the
418
+ // strings being matched.
419
+ Prefilter::Info* Prefilter::Info::AnyMatch() {
420
+ Prefilter::Info *info = new Prefilter::Info();
421
+ info->match_ = new Prefilter(ALL);
422
+ return info;
423
+ }
424
+
425
+ // Constructs Prefilter::Info for just the empty string.
426
+ Prefilter::Info* Prefilter::Info::EmptyString() {
427
+ Prefilter::Info* info = new Prefilter::Info();
428
+ info->is_exact_ = true;
429
+ info->exact_.insert("");
430
+ return info;
431
+ }
432
+
433
+ // Constructs Prefilter::Info for a character class.
434
+ typedef CharClass::iterator CCIter;
435
+ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
436
+ if (Trace) {
437
+ VLOG(0) << "CharClassInfo:";
438
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
439
+ VLOG(0) << " " << i->lo << "-" << i->hi;
440
+ }
441
+
442
+ // If the class is too large, it's okay to overestimate.
443
+ if (cc->size() > 10)
444
+ return AnyChar();
445
+
446
+ Prefilter::Info *a = new Prefilter::Info();
447
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
448
+ for (Rune r = i->lo; r <= i->hi; r++)
449
+ a->exact_.insert(RuneToString(ToLowerRune(r)));
450
+
451
+ a->is_exact_ = true;
452
+
453
+ if (Trace) {
454
+ VLOG(0) << " = " << a->ToString();
455
+ }
456
+
457
+ return a;
458
+ }
459
+
460
+ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
461
+ public:
462
+ Walker() {}
463
+
464
+ virtual Info* PostVisit(
465
+ Regexp* re, Info* parent_arg,
466
+ Info* pre_arg,
467
+ Info** child_args, int nchild_args);
468
+
469
+ virtual Info* ShortVisit(
470
+ Regexp* re,
471
+ Info* parent_arg);
472
+
473
+ private:
474
+ DISALLOW_EVIL_CONSTRUCTORS(Walker);
475
+ };
476
+
477
+ Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
478
+ if (Trace) {
479
+ LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
480
+ }
481
+ Prefilter::Info::Walker w;
482
+ Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
483
+
484
+ if (w.stopped_early()) {
485
+ delete info;
486
+ return NULL;
487
+ }
488
+
489
+ return info;
490
+ }
491
+
492
+ Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
493
+ Regexp* re, Prefilter::Info* parent_arg) {
494
+ return AnyMatch();
495
+ }
496
+
497
+ // Constructs the Prefilter::Info for the given regular expression.
498
+ // Assumes re is simplified.
499
+ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
500
+ Regexp* re, Prefilter::Info* parent_arg,
501
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args,
502
+ int nchild_args) {
503
+ Prefilter::Info *info;
504
+ switch (re->op()) {
505
+ default:
506
+ case kRegexpRepeat:
507
+ LOG(DFATAL) << "Bad regexp op " << re->op();
508
+ info = EmptyString();
509
+ break;
510
+
511
+ case kRegexpNoMatch:
512
+ info = NoMatch();
513
+ break;
514
+
515
+ // These ops match the empty string:
516
+ case kRegexpEmptyMatch: // anywhere
517
+ case kRegexpBeginLine: // at beginning of line
518
+ case kRegexpEndLine: // at end of line
519
+ case kRegexpBeginText: // at beginning of text
520
+ case kRegexpEndText: // at end of text
521
+ case kRegexpWordBoundary: // at word boundary
522
+ case kRegexpNoWordBoundary: // not at word boundary
523
+ info = EmptyString();
524
+ break;
525
+
526
+ case kRegexpLiteral:
527
+ info = Literal(re->rune());
528
+ break;
529
+
530
+ case kRegexpLiteralString:
531
+ if (re->nrunes() == 0) {
532
+ info = NoMatch();
533
+ break;
534
+ }
535
+ info = Literal(re->runes()[0]);
536
+ for (int i = 1; i < re->nrunes(); i++)
537
+ info = Concat(info, Literal(re->runes()[i]));
538
+ break;
539
+
540
+ case kRegexpConcat: {
541
+ // Accumulate in info.
542
+ // Exact is concat of recent contiguous exact nodes.
543
+ info = NULL;
544
+ Info* exact = NULL;
545
+ for (int i = 0; i < nchild_args; i++) {
546
+ Info* ci = child_args[i]; // child info
547
+ if (!ci->is_exact() ||
548
+ (exact && ci->exact().size() * exact->exact().size() > 16)) {
549
+ // Exact run is over.
550
+ info = And(info, exact);
551
+ exact = NULL;
552
+ // Add this child's info.
553
+ info = And(info, ci);
554
+ } else {
555
+ // Append to exact run.
556
+ exact = Concat(exact, ci);
557
+ }
558
+ }
559
+ info = And(info, exact);
560
+ }
561
+ break;
562
+
563
+ case kRegexpAlternate:
564
+ info = child_args[0];
565
+ for (int i = 1; i < nchild_args; i++)
566
+ info = Alt(info, child_args[i]);
567
+ VLOG(10) << "Alt: " << info->ToString();
568
+ break;
569
+
570
+ case kRegexpStar:
571
+ info = Star(child_args[0]);
572
+ break;
573
+
574
+ case kRegexpQuest:
575
+ info = Quest(child_args[0]);
576
+ break;
577
+
578
+ case kRegexpPlus:
579
+ info = Plus(child_args[0]);
580
+ break;
581
+
582
+ case kRegexpAnyChar:
583
+ // Claim nothing, except that it's not empty.
584
+ info = AnyChar();
585
+ break;
586
+
587
+ case kRegexpCharClass:
588
+ info = CClass(re->cc());
589
+ break;
590
+
591
+ case kRegexpCapture:
592
+ // These don't affect the set of matching strings.
593
+ info = child_args[0];
594
+ break;
595
+ }
596
+
597
+ if (Trace) {
598
+ VLOG(0) << "BuildInfo " << re->ToString()
599
+ << ": " << info->ToString();
600
+ }
601
+
602
+ return info;
603
+ }
604
+
605
+
606
+ Prefilter* Prefilter::FromRegexp(Regexp* re) {
607
+ if (re == NULL)
608
+ return NULL;
609
+
610
+ Regexp* simple = re->Simplify();
611
+ Prefilter::Info *info = BuildInfo(simple);
612
+
613
+ simple->Decref();
614
+ if (info == NULL)
615
+ return NULL;
616
+
617
+ Prefilter* m = info->TakeMatch();
618
+
619
+ delete info;
620
+ return m;
621
+ }
622
+
623
+ string Prefilter::DebugString() const {
624
+ if (this == NULL)
625
+ return "<nil>";
626
+
627
+ switch (op_) {
628
+ default:
629
+ LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
630
+ return StringPrintf("op%d", op_);
631
+ case NONE:
632
+ return "*no-matches*";
633
+ case ATOM:
634
+ return atom_;
635
+ case ALL:
636
+ return "";
637
+ case AND: {
638
+ string s = "";
639
+ for (int i = 0; i < subs_->size(); i++) {
640
+ if (i > 0)
641
+ s += " ";
642
+ s += (*subs_)[i]->DebugString();
643
+ }
644
+ return s;
645
+ }
646
+ case OR: {
647
+ string s = "(";
648
+ for (int i = 0; i < subs_->size(); i++) {
649
+ if (i > 0)
650
+ s += "|";
651
+ s += (*subs_)[i]->DebugString();
652
+ }
653
+ s += ")";
654
+ return s;
655
+ }
656
+ }
657
+ }
658
+
659
+ Prefilter* Prefilter::FromRE2(const RE2* re2) {
660
+ if (re2 == NULL)
661
+ return NULL;
662
+
663
+ Regexp* regexp = re2->Regexp();
664
+ if (regexp == NULL)
665
+ return NULL;
666
+
667
+ return FromRegexp(regexp);
668
+ }
669
+
670
+
671
+ } // namespace re2