chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134):
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,671 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "util/util.h"
6
+ #include "re2/prefilter.h"
7
+ #include "re2/re2.h"
8
+ #include "re2/unicode_casefold.h"
9
+ #include "re2/walker-inl.h"
10
+
11
+ namespace re2 {
12
+
13
+ static const int Trace = false;
14
+
15
+ typedef set<string>::iterator SSIter;
16
+ typedef set<string>::const_iterator ConstSSIter;
17
+
18
+ static int alloc_id = 100000; // Used for debugging.
19
+ // Initializes a Prefilter, allocating subs_ as necessary.
20
+ Prefilter::Prefilter(Op op) {
21
+ op_ = op;
22
+ subs_ = NULL;
23
+ if (op_ == AND || op_ == OR)
24
+ subs_ = new vector<Prefilter*>;
25
+
26
+ alloc_id_ = alloc_id++;
27
+ VLOG(10) << "alloc_id: " << alloc_id_;
28
+ }
29
+
30
+ // Destroys a Prefilter.
31
+ Prefilter::~Prefilter() {
32
+ VLOG(10) << "Deleted: " << alloc_id_;
33
+ if (subs_) {
34
+ for (int i = 0; i < subs_->size(); i++)
35
+ delete (*subs_)[i];
36
+ delete subs_;
37
+ subs_ = NULL;
38
+ }
39
+ }
40
+
41
+ // Simplify if the node is an empty Or or And.
42
+ Prefilter* Prefilter::Simplify() {
43
+ if (op_ != AND && op_ != OR) {
44
+ return this;
45
+ }
46
+
47
+ // Nothing left in the AND/OR.
48
+ if (subs_->size() == 0) {
49
+ if (op_ == AND)
50
+ op_ = ALL; // AND of nothing is true
51
+ else
52
+ op_ = NONE; // OR of nothing is false
53
+
54
+ return this;
55
+ }
56
+
57
+ // Just one subnode: throw away wrapper.
58
+ if (subs_->size() == 1) {
59
+ Prefilter* a = (*subs_)[0];
60
+ subs_->clear();
61
+ delete this;
62
+ return a->Simplify();
63
+ }
64
+
65
+ return this;
66
+ }
67
+
68
+ // Combines two Prefilters together to create an "op" (AND or OR).
69
+ // The passed Prefilters will be part of the returned Prefilter or deleted.
70
+ // Does lots of work to avoid creating unnecessarily complicated structures.
71
+ Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72
+ // If a, b can be rewritten as op, do so.
73
+ a = a->Simplify();
74
+ b = b->Simplify();
75
+
76
+ // Canonicalize: a->op <= b->op.
77
+ if (a->op() > b->op()) {
78
+ Prefilter* t = a;
79
+ a = b;
80
+ b = t;
81
+ }
82
+
83
+ // Trivial cases.
84
+ // ALL AND b = b
85
+ // NONE OR b = b
86
+ // ALL OR b = ALL
87
+ // NONE AND b = NONE
88
+ // Don't need to look at b, because of canonicalization above.
89
+ // ALL and NONE are smallest opcodes.
90
+ if (a->op() == ALL || a->op() == NONE) {
91
+ if ((a->op() == ALL && op == AND) ||
92
+ (a->op() == NONE && op == OR)) {
93
+ delete a;
94
+ return b;
95
+ } else {
96
+ delete b;
97
+ return a;
98
+ }
99
+ }
100
+
101
+ // If a and b match op, merge their contents.
102
+ if (a->op() == op && b->op() == op) {
103
+ for (int i = 0; i < b->subs()->size(); i++) {
104
+ Prefilter* bb = (*b->subs())[i];
105
+ a->subs()->push_back(bb);
106
+ }
107
+ b->subs()->clear();
108
+ delete b;
109
+ return a;
110
+ }
111
+
112
+ // If a already has the same op as the op that is under construction
113
+ // add in b (similarly if b already has the same op, add in a).
114
+ if (b->op() == op) {
115
+ Prefilter* t = a;
116
+ a = b;
117
+ b = t;
118
+ }
119
+ if (a->op() == op) {
120
+ a->subs()->push_back(b);
121
+ return a;
122
+ }
123
+
124
+ // Otherwise just return the op.
125
+ Prefilter* c = new Prefilter(op);
126
+ c->subs()->push_back(a);
127
+ c->subs()->push_back(b);
128
+ return c;
129
+ }
130
+
131
+ Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132
+ return AndOr(AND, a, b);
133
+ }
134
+
135
+ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136
+ return AndOr(OR, a, b);
137
+ }
138
+
139
// Removes redundant strings from *ss, in place.
//
// If "ab" is a required string, knowing that "abc" is also required adds
// nothing: a string search for "ab" already makes the regexp a candidate,
// so "abc" is dropped.  In general every string that contains another
// member of the set as a substring is erased.
//
// The set is ordered lexicographically, so a string may sort *before* a
// substring of itself ("abc" < "bc").  Each member is therefore compared
// against every other member, not only against the ones sorted after it;
// comparing only the later ones would leave such strings in the set.
//
// Note that find() locates the empty string inside every string, so a set
// that contains "" is reduced to just "".
static void SimplifyStringSet(std::set<std::string>* ss) {
  typedef std::set<std::string>::iterator Iter;

  for (Iter needle = ss->begin(); needle != ss->end(); ++needle) {
    Iter scan = ss->begin();
    while (scan != ss->end()) {
      // Step past the candidate before a possible erase; set::erase only
      // invalidates iterators to the erased element, so "needle" and
      // "scan" stay valid.
      Iter candidate = scan;
      ++scan;
      if (candidate == needle)
        continue;
      if (candidate->find(*needle) != std::string::npos)
        ss->erase(candidate);
    }
  }
}
159
+
160
+ Prefilter* Prefilter::OrStrings(set<string>* ss) {
161
+ SimplifyStringSet(ss);
162
+ Prefilter* or_prefilter = NULL;
163
+ if (!ss->empty()) {
164
+ or_prefilter = new Prefilter(NONE);
165
+ for (SSIter i = ss->begin(); i != ss->end(); ++i)
166
+ or_prefilter = Or(or_prefilter, FromString(*i));
167
+ }
168
+ return or_prefilter;
169
+ }
170
+
171
+ static Rune ToLowerRune(Rune r) {
172
+ if (r < Runeself) {
173
+ if ('A' <= r && r <= 'Z')
174
+ r += 'a' - 'A';
175
+ return r;
176
+ }
177
+
178
+ CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179
+ if (f == NULL || r < f->lo)
180
+ return r;
181
+ return ApplyFold(f, r);
182
+ }
183
+
184
+ Prefilter* Prefilter::FromString(const string& str) {
185
+ Prefilter* m = new Prefilter(Prefilter::ATOM);
186
+ m->atom_ = str;
187
+ return m;
188
+ }
189
+
190
+ // Information about a regexp used during computation of Prefilter.
191
+ // Can be thought of as information about the set of strings matching
192
+ // the given regular expression.
193
+ class Prefilter::Info {
194
+ public:
195
+ Info();
196
+ ~Info();
197
+
198
+ // More constructors. They delete their Info* arguments.
199
+ static Info* Alt(Info* a, Info* b);
200
+ static Info* Concat(Info* a, Info* b);
201
+ static Info* And(Info* a, Info* b);
202
+ static Info* Star(Info* a);
203
+ static Info* Plus(Info* a);
204
+ static Info* Quest(Info* a);
205
+ static Info* EmptyString();
206
+ static Info* NoMatch();
207
+ static Info* AnyChar();
208
+ static Info* CClass(CharClass* cc);
209
+ static Info* Literal(Rune r);
210
+ static Info* AnyMatch();
211
+
212
+ // Format Info as a string.
213
+ string ToString();
214
+
215
+ // Caller takes ownership of the Prefilter.
216
+ Prefilter* TakeMatch();
217
+
218
+ set<string>& exact() { return exact_; }
219
+
220
+ bool is_exact() const { return is_exact_; }
221
+
222
+ class Walker;
223
+
224
+ private:
225
+ set<string> exact_;
226
+
227
+ // When is_exact_ is true, the strings that match
228
+ // are placed in exact_. When it is no longer an exact
229
+ // set of strings that match this RE, then is_exact_
230
+ // is false and the match_ contains the required match
231
+ // criteria.
232
+ bool is_exact_;
233
+
234
+ // Accumulated Prefilter query that any
235
+ // match for this regexp is guaranteed to match.
236
+ Prefilter* match_;
237
+ };
238
+
239
+
240
+ Prefilter::Info::Info()
241
+ : is_exact_(false),
242
+ match_(NULL) {
243
+ }
244
+
245
+ Prefilter::Info::~Info() {
246
+ delete match_;
247
+ }
248
+
249
+ Prefilter* Prefilter::Info::TakeMatch() {
250
+ if (is_exact_) {
251
+ match_ = Prefilter::OrStrings(&exact_);
252
+ is_exact_ = false;
253
+ }
254
+ Prefilter* m = match_;
255
+ match_ = NULL;
256
+ return m;
257
+ }
258
+
259
+ // Format a Info in string form.
260
+ string Prefilter::Info::ToString() {
261
+ if (this == NULL) {
262
+ // Sometimes when iterating on children of a node,
263
+ // some children might have NULL Info. Adding
264
+ // the check here for NULL to take care of cases where
265
+ // the caller is not checking.
266
+ return "";
267
+ }
268
+
269
+ if (is_exact_) {
270
+ int n = 0;
271
+ string s;
272
+ for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
273
+ if (n++ > 0)
274
+ s += ",";
275
+ s += *i;
276
+ }
277
+ return s;
278
+ }
279
+
280
+ if (match_)
281
+ return match_->DebugString();
282
+
283
+ return "";
284
+ }
285
+
286
// Inserts every string of src into *dst; strings already present in *dst
// are left as they are.
static void CopyIn(const std::set<std::string>& src,
                   std::set<std::string>* dst) {
  dst->insert(src.begin(), src.end());
}
291
+
292
// Inserts into *dst the concatenation of every string in a with every
// string in b (all strings of the form i+j, i from a and j from b).
// Existing contents of *dst are kept.
static void CrossProduct(const std::set<std::string>& a,
                         const std::set<std::string>& b,
                         std::set<std::string>* dst) {
  typedef std::set<std::string>::const_iterator Iter;

  for (Iter head = a.begin(); head != a.end(); ++head) {
    for (Iter tail = b.begin(); tail != b.end(); ++tail) {
      std::string joined(*head);
      joined += *tail;
      dst->insert(joined);
    }
  }
}
301
+
302
+ // Concats a and b. Requires that both are exact sets.
303
+ // Forms an exact set that is a crossproduct of a and b.
304
+ Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
305
+ if (a == NULL)
306
+ return b;
307
+ DCHECK(a->is_exact_);
308
+ DCHECK(b && b->is_exact_);
309
+ Info *ab = new Info();
310
+
311
+ CrossProduct(a->exact_, b->exact_, &ab->exact_);
312
+ ab->is_exact_ = true;
313
+
314
+ delete a;
315
+ delete b;
316
+ return ab;
317
+ }
318
+
319
+ // Constructs an inexact Info for ab given a and b.
320
+ // Used only when a or b is not exact or when the
321
+ // exact cross product is likely to be too big.
322
+ Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
323
+ if (a == NULL)
324
+ return b;
325
+ if (b == NULL)
326
+ return a;
327
+
328
+ Info *ab = new Info();
329
+
330
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
331
+ ab->is_exact_ = false;
332
+ delete a;
333
+ delete b;
334
+ return ab;
335
+ }
336
+
337
+ // Constructs Info for a|b given a and b.
338
+ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
339
+ Info *ab = new Info();
340
+
341
+ if (a->is_exact_ && b->is_exact_) {
342
+ CopyIn(a->exact_, &ab->exact_);
343
+ CopyIn(b->exact_, &ab->exact_);
344
+ ab->is_exact_ = true;
345
+ } else {
346
+ // Either a or b has is_exact_ = false. If the other
347
+ // one has is_exact_ = true, we move it to match_ and
348
+ // then create a OR of a,b. The resulting Info has
349
+ // is_exact_ = false.
350
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
351
+ ab->is_exact_ = false;
352
+ }
353
+
354
+ delete a;
355
+ delete b;
356
+ return ab;
357
+ }
358
+
359
+ // Constructs Info for a? given a.
360
+ Prefilter::Info* Prefilter::Info::Quest(Info *a) {
361
+ Info *ab = new Info();
362
+
363
+ ab->is_exact_ = false;
364
+ ab->match_ = new Prefilter(ALL);
365
+ delete a;
366
+ return ab;
367
+ }
368
+
369
+ // Constructs Info for a* given a.
370
+ // Same as a? -- not much to do.
371
+ Prefilter::Info* Prefilter::Info::Star(Info *a) {
372
+ return Quest(a);
373
+ }
374
+
375
+ // Constructs Info for a+ given a. If a was exact set, it isn't
376
+ // anymore.
377
+ Prefilter::Info* Prefilter::Info::Plus(Info *a) {
378
+ Info *ab = new Info();
379
+
380
+ ab->match_ = a->TakeMatch();
381
+ ab->is_exact_ = false;
382
+
383
+ delete a;
384
+ return ab;
385
+ }
386
+
387
+ static string RuneToString(Rune r) {
388
+ char buf[UTFmax];
389
+ int n = runetochar(buf, &r);
390
+ return string(buf, n);
391
+ }
392
+
393
+ // Constructs Info for literal rune.
394
+ Prefilter::Info* Prefilter::Info::Literal(Rune r) {
395
+ Info* info = new Info();
396
+ info->exact_.insert(RuneToString(ToLowerRune(r)));
397
+ info->is_exact_ = true;
398
+ return info;
399
+ }
400
+
401
+ // Constructs Info for dot (any character).
402
+ Prefilter::Info* Prefilter::Info::AnyChar() {
403
+ Prefilter::Info* info = new Prefilter::Info();
404
+ info->match_ = new Prefilter(ALL);
405
+ return info;
406
+ }
407
+
408
+ // Constructs Prefilter::Info for no possible match.
409
+ Prefilter::Info* Prefilter::Info::NoMatch() {
410
+ Prefilter::Info* info = new Prefilter::Info();
411
+ info->match_ = new Prefilter(NONE);
412
+ return info;
413
+ }
414
+
415
+ // Constructs Prefilter::Info for any possible match.
416
+ // This Prefilter::Info is valid for any regular expression,
417
+ // since it makes no assertions whatsoever about the
418
+ // strings being matched.
419
+ Prefilter::Info* Prefilter::Info::AnyMatch() {
420
+ Prefilter::Info *info = new Prefilter::Info();
421
+ info->match_ = new Prefilter(ALL);
422
+ return info;
423
+ }
424
+
425
+ // Constructs Prefilter::Info for just the empty string.
426
+ Prefilter::Info* Prefilter::Info::EmptyString() {
427
+ Prefilter::Info* info = new Prefilter::Info();
428
+ info->is_exact_ = true;
429
+ info->exact_.insert("");
430
+ return info;
431
+ }
432
+
433
+ // Constructs Prefilter::Info for a character class.
434
+ typedef CharClass::iterator CCIter;
435
+ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
436
+ if (Trace) {
437
+ VLOG(0) << "CharClassInfo:";
438
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
439
+ VLOG(0) << " " << i->lo << "-" << i->hi;
440
+ }
441
+
442
+ // If the class is too large, it's okay to overestimate.
443
+ if (cc->size() > 10)
444
+ return AnyChar();
445
+
446
+ Prefilter::Info *a = new Prefilter::Info();
447
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
448
+ for (Rune r = i->lo; r <= i->hi; r++)
449
+ a->exact_.insert(RuneToString(ToLowerRune(r)));
450
+
451
+ a->is_exact_ = true;
452
+
453
+ if (Trace) {
454
+ VLOG(0) << " = " << a->ToString();
455
+ }
456
+
457
+ return a;
458
+ }
459
+
460
+ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
461
+ public:
462
+ Walker() {}
463
+
464
+ virtual Info* PostVisit(
465
+ Regexp* re, Info* parent_arg,
466
+ Info* pre_arg,
467
+ Info** child_args, int nchild_args);
468
+
469
+ virtual Info* ShortVisit(
470
+ Regexp* re,
471
+ Info* parent_arg);
472
+
473
+ private:
474
+ DISALLOW_EVIL_CONSTRUCTORS(Walker);
475
+ };
476
+
477
+ Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
478
+ if (Trace) {
479
+ LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
480
+ }
481
+ Prefilter::Info::Walker w;
482
+ Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
483
+
484
+ if (w.stopped_early()) {
485
+ delete info;
486
+ return NULL;
487
+ }
488
+
489
+ return info;
490
+ }
491
+
492
+ Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
493
+ Regexp* re, Prefilter::Info* parent_arg) {
494
+ return AnyMatch();
495
+ }
496
+
497
+ // Constructs the Prefilter::Info for the given regular expression.
498
+ // Assumes re is simplified.
499
+ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
500
+ Regexp* re, Prefilter::Info* parent_arg,
501
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args,
502
+ int nchild_args) {
503
+ Prefilter::Info *info;
504
+ switch (re->op()) {
505
+ default:
506
+ case kRegexpRepeat:
507
+ LOG(DFATAL) << "Bad regexp op " << re->op();
508
+ info = EmptyString();
509
+ break;
510
+
511
+ case kRegexpNoMatch:
512
+ info = NoMatch();
513
+ break;
514
+
515
+ // These ops match the empty string:
516
+ case kRegexpEmptyMatch: // anywhere
517
+ case kRegexpBeginLine: // at beginning of line
518
+ case kRegexpEndLine: // at end of line
519
+ case kRegexpBeginText: // at beginning of text
520
+ case kRegexpEndText: // at end of text
521
+ case kRegexpWordBoundary: // at word boundary
522
+ case kRegexpNoWordBoundary: // not at word boundary
523
+ info = EmptyString();
524
+ break;
525
+
526
+ case kRegexpLiteral:
527
+ info = Literal(re->rune());
528
+ break;
529
+
530
+ case kRegexpLiteralString:
531
+ if (re->nrunes() == 0) {
532
+ info = NoMatch();
533
+ break;
534
+ }
535
+ info = Literal(re->runes()[0]);
536
+ for (int i = 1; i < re->nrunes(); i++)
537
+ info = Concat(info, Literal(re->runes()[i]));
538
+ break;
539
+
540
+ case kRegexpConcat: {
541
+ // Accumulate in info.
542
+ // Exact is concat of recent contiguous exact nodes.
543
+ info = NULL;
544
+ Info* exact = NULL;
545
+ for (int i = 0; i < nchild_args; i++) {
546
+ Info* ci = child_args[i]; // child info
547
+ if (!ci->is_exact() ||
548
+ (exact && ci->exact().size() * exact->exact().size() > 16)) {
549
+ // Exact run is over.
550
+ info = And(info, exact);
551
+ exact = NULL;
552
+ // Add this child's info.
553
+ info = And(info, ci);
554
+ } else {
555
+ // Append to exact run.
556
+ exact = Concat(exact, ci);
557
+ }
558
+ }
559
+ info = And(info, exact);
560
+ }
561
+ break;
562
+
563
+ case kRegexpAlternate:
564
+ info = child_args[0];
565
+ for (int i = 1; i < nchild_args; i++)
566
+ info = Alt(info, child_args[i]);
567
+ VLOG(10) << "Alt: " << info->ToString();
568
+ break;
569
+
570
+ case kRegexpStar:
571
+ info = Star(child_args[0]);
572
+ break;
573
+
574
+ case kRegexpQuest:
575
+ info = Quest(child_args[0]);
576
+ break;
577
+
578
+ case kRegexpPlus:
579
+ info = Plus(child_args[0]);
580
+ break;
581
+
582
+ case kRegexpAnyChar:
583
+ // Claim nothing, except that it's not empty.
584
+ info = AnyChar();
585
+ break;
586
+
587
+ case kRegexpCharClass:
588
+ info = CClass(re->cc());
589
+ break;
590
+
591
+ case kRegexpCapture:
592
+ // These don't affect the set of matching strings.
593
+ info = child_args[0];
594
+ break;
595
+ }
596
+
597
+ if (Trace) {
598
+ VLOG(0) << "BuildInfo " << re->ToString()
599
+ << ": " << info->ToString();
600
+ }
601
+
602
+ return info;
603
+ }
604
+
605
+
606
+ Prefilter* Prefilter::FromRegexp(Regexp* re) {
607
+ if (re == NULL)
608
+ return NULL;
609
+
610
+ Regexp* simple = re->Simplify();
611
+ Prefilter::Info *info = BuildInfo(simple);
612
+
613
+ simple->Decref();
614
+ if (info == NULL)
615
+ return NULL;
616
+
617
+ Prefilter* m = info->TakeMatch();
618
+
619
+ delete info;
620
+ return m;
621
+ }
622
+
623
+ string Prefilter::DebugString() const {
624
+ if (this == NULL)
625
+ return "<nil>";
626
+
627
+ switch (op_) {
628
+ default:
629
+ LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
630
+ return StringPrintf("op%d", op_);
631
+ case NONE:
632
+ return "*no-matches*";
633
+ case ATOM:
634
+ return atom_;
635
+ case ALL:
636
+ return "";
637
+ case AND: {
638
+ string s = "";
639
+ for (int i = 0; i < subs_->size(); i++) {
640
+ if (i > 0)
641
+ s += " ";
642
+ s += (*subs_)[i]->DebugString();
643
+ }
644
+ return s;
645
+ }
646
+ case OR: {
647
+ string s = "(";
648
+ for (int i = 0; i < subs_->size(); i++) {
649
+ if (i > 0)
650
+ s += "|";
651
+ s += (*subs_)[i]->DebugString();
652
+ }
653
+ s += ")";
654
+ return s;
655
+ }
656
+ }
657
+ }
658
+
659
+ Prefilter* Prefilter::FromRE2(const RE2* re2) {
660
+ if (re2 == NULL)
661
+ return NULL;
662
+
663
+ Regexp* regexp = re2->Regexp();
664
+ if (regexp == NULL)
665
+ return NULL;
666
+
667
+ return FromRegexp(regexp);
668
+ }
669
+
670
+
671
+ } // namespace re2