chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,105 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Prefilter is the class used to extract string guards from regexps.
6
+ // Rather than using Prefilter class directly, use FilteredRE2.
7
+ // See filtered_re2.h
8
+
9
+ #ifndef RE2_PREFILTER_H_
10
+ #define RE2_PREFILTER_H_
11
+
12
+ #include "util/util.h"
13
+
14
+ namespace re2 {
15
+
16
+ class RE2;
17
+
18
+ class Regexp;
19
+
20
+ class Prefilter {
21
+ // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
22
+ public:
23
+ enum Op {
24
+ ALL = 0, // Everything matches
25
+ NONE, // Nothing matches
26
+ ATOM, // The string atom() must match
27
+ AND, // All in subs() must match
28
+ OR, // One of subs() must match
29
+ };
30
+
31
+ explicit Prefilter(Op op);
32
+ ~Prefilter();
33
+
34
+ Op op() { return op_; }
35
+ const string& atom() const { return atom_; }
36
+ void set_unique_id(int id) { unique_id_ = id; }
37
+ int unique_id() const { return unique_id_; }
38
+
39
+ // The children of the Prefilter node.
40
+ vector<Prefilter*>* subs() {
41
+ CHECK(op_ == AND || op_ == OR);
42
+ return subs_;
43
+ }
44
+
45
+ // Set the children vector. Prefilter takes ownership of subs and
46
+ // subs_ will be deleted when Prefilter is deleted.
47
+ void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
48
+
49
+ // Given a RE2, return a Prefilter. The caller takes ownership of
50
+ // the Prefilter and should deallocate it. Returns NULL if Prefilter
51
+ // cannot be formed.
52
+ static Prefilter* FromRE2(const RE2* re2);
53
+
54
+ // Returns a readable debug string of the prefilter.
55
+ string DebugString() const;
56
+
57
+ private:
58
+ class Info;
59
+
60
+ // Combines two prefilters together to create an AND. The passed
61
+ // Prefilters will be part of the returned Prefilter or deleted.
62
+ static Prefilter* And(Prefilter* a, Prefilter* b);
63
+
64
+ // Combines two prefilters together to create an OR. The passed
65
+ // Prefilters will be part of the returned Prefilter or deleted.
66
+ static Prefilter* Or(Prefilter* a, Prefilter* b);
67
+
68
+ // Generalized And/Or
69
+ static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
70
+
71
+ static Prefilter* FromRegexp(Regexp* a);
72
+
73
+ static Prefilter* FromString(const string& str);
74
+
75
+ static Prefilter* OrStrings(set<string>* ss);
76
+
77
+ static Info* BuildInfo(Regexp* re);
78
+
79
+ Prefilter* Simplify();
80
+
81
+ // Kind of Prefilter.
82
+ Op op_;
83
+
84
+ // Sub-matches for AND or OR Prefilter.
85
+ vector<Prefilter*>* subs_;
86
+
87
+ // Actual string to match in leaf node.
88
+ string atom_;
89
+
90
+ // If different prefilters have the same string atom, or if they are
91
+ // structurally the same (e.g., OR of same atom strings) they are
92
+ // considered the same unique nodes. This is the id for each unique
93
+ // node. This field is populated with a unique id for every node;
94
+ // a duplicate node is given the id of its canonical node.
95
+ int unique_id_;
96
+
97
+ // Used for debugging, helps in tracking memory leaks.
98
+ int alloc_id_;
99
+
100
+ DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
101
+ };
102
+
103
+ } // namespace re2
104
+
105
+ #endif // RE2_PREFILTER_H_
@@ -0,0 +1,398 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "util/util.h"
6
+ #include "util/flags.h"
7
+ #include "re2/prefilter.h"
8
+ #include "re2/prefilter_tree.h"
9
+ #include "re2/re2.h"
10
+
11
+ DEFINE_int32(filtered_re2_min_atom_len,
12
+ 3,
13
+ "Strings less than this length are not stored as atoms");
14
+
15
+ namespace re2 {
16
+
17
+ PrefilterTree::PrefilterTree()
18
+ : compiled_(false) {
19
+ }
20
+
21
+ PrefilterTree::~PrefilterTree() {
22
+ for (int i = 0; i < prefilter_vec_.size(); i++)
23
+ delete prefilter_vec_[i];
24
+
25
+ for (int i = 0; i < entries_.size(); i++)
26
+ delete entries_[i].parents;
27
+ }
28
+
29
+ // Functions used for adding and Compiling prefilters to the
30
+ // PrefilterTree.
31
+ static bool KeepPart(Prefilter* prefilter, int level) {
32
+ if (prefilter == NULL)
33
+ return false;
34
+
35
+ switch (prefilter->op()) {
36
+ default:
37
+ LOG(DFATAL) << "Unexpected op in KeepPart: "
38
+ << prefilter->op();
39
+ return false;
40
+
41
+ case Prefilter::ALL:
42
+ return false;
43
+
44
+ case Prefilter::ATOM:
45
+ return prefilter->atom().size() >=
46
+ FLAGS_filtered_re2_min_atom_len;
47
+
48
+ case Prefilter::AND: {
49
+ int j = 0;
50
+ vector<Prefilter*>* subs = prefilter->subs();
51
+ for (int i = 0; i < subs->size(); i++)
52
+ if (KeepPart((*subs)[i], level + 1))
53
+ (*subs)[j++] = (*subs)[i];
54
+ else
55
+ delete (*subs)[i];
56
+
57
+ subs->resize(j);
58
+ return j > 0;
59
+ }
60
+
61
+ case Prefilter::OR:
62
+ for (int i = 0; i < prefilter->subs()->size(); i++)
63
+ if (!KeepPart((*prefilter->subs())[i], level + 1))
64
+ return false;
65
+ return true;
66
+ }
67
+ }
68
+
69
+ void PrefilterTree::Add(Prefilter *f) {
70
+ if (compiled_) {
71
+ LOG(DFATAL) << "Add after Compile.";
72
+ return;
73
+ }
74
+ if (f != NULL && !KeepPart(f, 0)) {
75
+ delete f;
76
+ f = NULL;
77
+ }
78
+
79
+ prefilter_vec_.push_back(f);
80
+ }
81
+
82
+ void PrefilterTree::Compile(vector<string>* atom_vec) {
83
+ if (compiled_) {
84
+ LOG(DFATAL) << "Compile after Compile.";
85
+ return;
86
+ }
87
+
88
+ // We do this check to support some legacy uses of
89
+ // PrefilterTree that call Compile before adding any regexps,
90
+ // and expect Compile not to have effect.
91
+ if (prefilter_vec_.empty())
92
+ return;
93
+
94
+ compiled_ = true;
95
+
96
+ AssignUniqueIds(atom_vec);
97
+
98
+ // Identify nodes that are too common among prefilters and are
99
+ // triggering too many parents. Then get rid of them if possible.
100
+ // Note that getting rid of a prefilter node simply means they are
101
+ // no longer necessary for their parent to trigger; that is, we do
102
+ // not miss out on any regexps triggering by getting rid of a
103
+ // prefilter node.
104
+ for (int i = 0; i < entries_.size(); i++) {
105
+ IntMap* parents = entries_[i].parents;
106
+ if (parents->size() > 8) {
107
+ // This one triggers too many things. If all the parents are AND
108
+ // nodes and have other things guarding them, then get rid of
109
+ // this trigger. TODO(vsri): Adjust the threshold appropriately,
110
+ // make it a function of total number of nodes?
111
+ bool have_other_guard = true;
112
+ for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
113
+ have_other_guard = have_other_guard &&
114
+ (entries_[it->index()].propagate_up_at_count > 1);
115
+
116
+ if (have_other_guard) {
117
+ for (IntMap::iterator it = parents->begin();
118
+ it != parents->end(); ++it)
119
+ entries_[it->index()].propagate_up_at_count -= 1;
120
+
121
+ parents->clear(); // Forget the parents
122
+ }
123
+ }
124
+ }
125
+
126
+ PrintDebugInfo();
127
+ }
128
+
129
+ Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
130
+ string node_string = NodeString(node);
131
+ map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
132
+ if (iter == node_map_.end())
133
+ return NULL;
134
+ return (*iter).second;
135
+ }
136
+
137
+ static string Itoa(int n) {
138
+ char buf[100];
139
+ snprintf(buf, sizeof buf, "%d", n);
140
+ return string(buf);
141
+ }
142
+
143
+ string PrefilterTree::NodeString(Prefilter* node) const {
144
+ // Adding the operation disambiguates AND/OR/atom nodes.
145
+ string s = Itoa(node->op()) + ":";
146
+ if (node->op() == Prefilter::ATOM) {
147
+ s += node->atom();
148
+ } else {
149
+ for (int i = 0; i < node->subs()->size() ; i++) {
150
+ if (i > 0)
151
+ s += ',';
152
+ s += Itoa((*node->subs())[i]->unique_id());
153
+ }
154
+ }
155
+ return s;
156
+ }
157
+
158
+ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
159
+ atom_vec->clear();
160
+
161
+ // Build vector of all filter nodes, sorted topologically
162
+ // from top to bottom in v.
163
+ vector<Prefilter*> v;
164
+
165
+ // Add the top level nodes of each regexp prefilter.
166
+ for (int i = 0; i < prefilter_vec_.size(); i++) {
167
+ Prefilter* f = prefilter_vec_[i];
168
+ if (f == NULL)
169
+ unfiltered_.push_back(i);
170
+
171
+ // We push NULL also on to v, so that we maintain the
172
+ // mapping of index==regexpid for level=0 prefilter nodes.
173
+ v.push_back(f);
174
+ }
175
+
176
+ // Now add all the descendant nodes.
177
+ for (int i = 0; i < v.size(); i++) {
178
+ Prefilter* f = v[i];
179
+ if (f == NULL)
180
+ continue;
181
+ if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
182
+ const vector<Prefilter*>& subs = *f->subs();
183
+ for (int j = 0; j < subs.size(); j++)
184
+ v.push_back(subs[j]);
185
+ }
186
+ }
187
+
188
+ // Identify unique nodes.
189
+ int unique_id = 0;
190
+ for (int i = v.size() - 1; i >= 0; i--) {
191
+ Prefilter *node = v[i];
192
+ if (node == NULL)
193
+ continue;
194
+ node->set_unique_id(-1);
195
+ Prefilter* canonical = CanonicalNode(node);
196
+ if (canonical == NULL) {
197
+ // Any further nodes that have the same node string
198
+ // will find this node as the canonical node.
199
+ node_map_[NodeString(node)] = node;
200
+ if (node->op() == Prefilter::ATOM) {
201
+ atom_vec->push_back(node->atom());
202
+ atom_index_to_id_.push_back(unique_id);
203
+ }
204
+ node->set_unique_id(unique_id++);
205
+ } else {
206
+ node->set_unique_id(canonical->unique_id());
207
+ }
208
+ }
209
+ entries_.resize(node_map_.size());
210
+
211
+ // Create parent IntMap for the entries.
212
+ for (int i = v.size() - 1; i >= 0; i--) {
213
+ Prefilter* prefilter = v[i];
214
+ if (prefilter == NULL)
215
+ continue;
216
+
217
+ if (CanonicalNode(prefilter) != prefilter)
218
+ continue;
219
+
220
+ Entry* entry = &entries_[prefilter->unique_id()];
221
+ entry->parents = new IntMap(node_map_.size());
222
+ }
223
+
224
+ // Fill the entries.
225
+ for (int i = v.size() - 1; i >= 0; i--) {
226
+ Prefilter* prefilter = v[i];
227
+ if (prefilter == NULL)
228
+ continue;
229
+
230
+ if (CanonicalNode(prefilter) != prefilter)
231
+ continue;
232
+
233
+ Entry* entry = &entries_[prefilter->unique_id()];
234
+
235
+ switch (prefilter->op()) {
236
+ default:
237
+ case Prefilter::ALL:
238
+ LOG(DFATAL) << "Unexpected op: " << prefilter->op();
239
+ return;
240
+
241
+ case Prefilter::ATOM:
242
+ entry->propagate_up_at_count = 1;
243
+ break;
244
+
245
+ case Prefilter::OR:
246
+ case Prefilter::AND: {
247
+ IntMap uniq_child(node_map_.size());
248
+ for (int j = 0; j < prefilter->subs()->size() ; j++) {
249
+ Prefilter* child = (*prefilter->subs())[j];
250
+ Prefilter* canonical = CanonicalNode(child);
251
+ if (canonical == NULL) {
252
+ LOG(DFATAL) << "Null canonical node";
253
+ return;
254
+ }
255
+ int child_id = canonical->unique_id();
256
+ if (!uniq_child.has_index(child_id))
257
+ uniq_child.set_new(child_id, 1);
258
+ // To the child, we want to add to parent indices.
259
+ Entry* child_entry = &entries_[child_id];
260
+ if (!child_entry->parents->has_index(prefilter->unique_id()))
261
+ child_entry->parents->set_new(prefilter->unique_id(), 1);
262
+ }
263
+ entry->propagate_up_at_count =
264
+ prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
265
+
266
+ break;
267
+ }
268
+ }
269
+ }
270
+
271
+ // For top level nodes, populate regexp id.
272
+ for (int i = 0; i < prefilter_vec_.size(); i++) {
273
+ if (prefilter_vec_[i] == NULL)
274
+ continue;
275
+ int id = CanonicalNode(prefilter_vec_[i])->unique_id();
276
+ DCHECK_LE(0, id);
277
+ Entry* entry = &entries_[id];
278
+ entry->regexps.push_back(i);
279
+ }
280
+ }
281
+
282
+ // Functions for triggering during search.
283
+ void PrefilterTree::RegexpsGivenStrings(
284
+ const vector<int>& matched_atoms,
285
+ vector<int>* regexps) const {
286
+ regexps->clear();
287
+ if (!compiled_) {
288
+ LOG(WARNING) << "Compile() not called";
289
+ for (int i = 0; i < prefilter_vec_.size(); ++i)
290
+ regexps->push_back(i);
291
+ } else {
292
+ if (!prefilter_vec_.empty()) {
293
+ IntMap regexps_map(prefilter_vec_.size());
294
+ vector<int> matched_atom_ids;
295
+ for (int j = 0; j < matched_atoms.size(); j++) {
296
+ matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
297
+ VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
298
+ }
299
+ PropagateMatch(matched_atom_ids, &regexps_map);
300
+ for (IntMap::iterator it = regexps_map.begin();
301
+ it != regexps_map.end();
302
+ ++it)
303
+ regexps->push_back(it->index());
304
+
305
+ regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
306
+ }
307
+ }
308
+ sort(regexps->begin(), regexps->end());
309
+ }
310
+
311
+ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
312
+ IntMap* regexps) const {
313
+ IntMap count(entries_.size());
314
+ IntMap work(entries_.size());
315
+ for (int i = 0; i < atom_ids.size(); i++)
316
+ work.set(atom_ids[i], 1);
317
+ for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
318
+ const Entry& entry = entries_[it->index()];
319
+ VLOG(10) << "Processing: " << it->index();
320
+ // Record regexps triggered.
321
+ for (int i = 0; i < entry.regexps.size(); i++) {
322
+ VLOG(10) << "Regexp triggered: " << entry.regexps[i];
323
+ regexps->set(entry.regexps[i], 1);
324
+ }
325
+ int c;
326
+ // Pass trigger up to parents.
327
+ for (IntMap::iterator it = entry.parents->begin();
328
+ it != entry.parents->end();
329
+ ++it) {
330
+ int j = it->index();
331
+ const Entry& parent = entries_[j];
332
+ VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
333
+ // Delay until all the children have succeeded.
334
+ if (parent.propagate_up_at_count > 1) {
335
+ if (count.has_index(j)) {
336
+ c = count.get_existing(j) + 1;
337
+ count.set_existing(j, c);
338
+ } else {
339
+ c = 1;
340
+ count.set_new(j, c);
341
+ }
342
+ if (c < parent.propagate_up_at_count)
343
+ continue;
344
+ }
345
+ VLOG(10) << "Triggering: " << j;
346
+ // Trigger the parent.
347
+ work.set(j, 1);
348
+ }
349
+ }
350
+ }
351
+
352
+ // Debugging help.
353
+ void PrefilterTree::PrintPrefilter(int regexpid) {
354
+ LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
355
+ }
356
+
357
+ void PrefilterTree::PrintDebugInfo() {
358
+ VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
359
+ VLOG(10) << "#Unique Nodes: " << entries_.size();
360
+
361
+ for (int i = 0; i < entries_.size(); ++i) {
362
+ IntMap* parents = entries_[i].parents;
363
+ const vector<int>& regexps = entries_[i].regexps;
364
+ VLOG(10) << "EntryId: " << i
365
+ << " N: " << parents->size() << " R: " << regexps.size();
366
+ for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
367
+ VLOG(10) << it->index();
368
+ }
369
+ VLOG(10) << "Map:";
370
+ for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
371
+ iter != node_map_.end(); ++iter)
372
+ VLOG(10) << "NodeId: " << (*iter).second->unique_id()
373
+ << " Str: " << (*iter).first;
374
+ }
375
+
376
+ string PrefilterTree::DebugNodeString(Prefilter* node) const {
377
+ string node_string = "";
378
+
379
+ if (node->op() == Prefilter::ATOM) {
380
+ DCHECK(!node->atom().empty());
381
+ node_string += node->atom();
382
+ } else {
383
+ // Adding the operation disambiguates AND and OR nodes.
384
+ node_string += node->op() == Prefilter::AND ? "AND" : "OR";
385
+ node_string += "(";
386
+ for (int i = 0; i < node->subs()->size() ; i++) {
387
+ if (i > 0)
388
+ node_string += ',';
389
+ node_string += Itoa((*node->subs())[i]->unique_id());
390
+ node_string += ":";
391
+ node_string += DebugNodeString((*node->subs())[i]);
392
+ }
393
+ node_string += ")";
394
+ }
395
+ return node_string;
396
+ }
397
+
398
+ } // namespace re2
@@ -0,0 +1,130 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // The PrefilterTree class is used to form an AND-OR tree of strings
6
+ // that would trigger each regexp. The 'prefilter' of each regexp is
7
+ // added to PrefilterTree, and then PrefilterTree is used to find all
8
+ // the unique strings across the prefilters. During search, by using
9
+ // matches from a string matching engine, PrefilterTree deduces the
10
+ // set of regexps that are to be triggered. The 'string matching
11
+ // engine' itself is outside of this class, and the caller can use any
12
+ // favorite engine. PrefilterTree provides a set of strings (called
13
+ // atoms) that the user of this class should use to do the string
14
+ // matching.
15
+ //
16
+ #ifndef RE2_PREFILTER_TREE_H_
17
+ #define RE2_PREFILTER_TREE_H_
18
+
19
+ #include "util/util.h"
20
+ #include "util/sparse_array.h"
21
+
22
+ namespace re2 {
23
+
24
+ typedef SparseArray<int> IntMap;
25
+
26
+ class Prefilter;
27
+
28
+ class PrefilterTree {
29
+ public:
30
+ PrefilterTree();
31
+ ~PrefilterTree();
32
+
33
+ // Adds the prefilter for the next regexp. Note that we assume that
34
+ // Add is called sequentially for all regexps. All Add calls
35
+ // must precede Compile.
36
+ void Add(Prefilter* prefilter);
37
+
38
+ // Compile returns a vector of strings in atom_vec.
39
+ // Call this after all the prefilters are added through Add.
40
+ // No calls to Add after Compile are allowed.
41
+ // The caller should use the returned set of strings to do string matching.
42
+ // Each time a string matches, the corresponding index then has to be
43
+ // recorded and passed to RegexpsGivenStrings below.
44
+ void Compile(vector<string>* atom_vec);
45
+
46
+ // Given the indices of the atoms that matched, returns the indexes
47
+ // of regexps that should be searched. The matched_atoms should
48
+ // contain all the ids of string atoms that were found to match the
49
+ // content. The caller can use any string match engine to perform
50
+ // this function. This function is thread safe.
51
+ void RegexpsGivenStrings(const vector<int>& matched_atoms,
52
+ vector<int>* regexps) const;
53
+
54
+ // Print debug prefilter. Also prints unique ids associated with
55
+ // nodes of the prefilter of the regexp.
56
+ void PrintPrefilter(int regexpid);
57
+
58
+
59
+ // Each unique node has a corresponding Entry that helps in
60
+ // passing the matching trigger information along the tree.
61
+ struct Entry {
62
+ public:
63
+ // How many children should match before this node triggers the
64
+ // parent. For an atom and an OR node, this is 1 and for an AND
65
+ // node, it is the number of unique children.
66
+ int propagate_up_at_count;
67
+
68
+ // When this node is ready to trigger the parent, what are the indices
69
+ // of the parent nodes to trigger. The reason there may be more than
70
+ // one is because of sharing. For example (abc | def) and (xyz | def)
71
+ // are two different nodes, but they share the atom 'def'. So when
72
+ // 'def' matches, it triggers two parents, corresponding to the two
73
+ // different OR nodes.
74
+ IntMap* parents;
75
+
76
+ // When this node is ready to trigger the parent, what are the
77
+ // regexps that are triggered.
78
+ vector<int> regexps;
79
+ };
80
+
81
+ private:
82
+ // This function assigns unique ids to various parts of the
83
+ // prefilter, by looking at if these nodes are already in the
84
+ // PrefilterTree.
85
+ void AssignUniqueIds(vector<string>* atom_vec);
86
+
87
+ // Given the matching atoms, find the regexps to be triggered.
88
+ void PropagateMatch(const vector<int>& atom_ids,
89
+ IntMap* regexps) const;
90
+
91
+ // Returns the prefilter node that has the same NodeString as this
92
+ // node. For the canonical node, returns node.
93
+ Prefilter* CanonicalNode(Prefilter* node);
94
+
95
+ // A string that uniquely identifies the node. Assumes that the
96
+ // children of node have already been assigned unique ids.
97
+ string NodeString(Prefilter* node) const;
98
+
99
+ // Recursively constructs a readable prefilter string.
100
+ string DebugNodeString(Prefilter* node) const;
101
+
102
+ // Used for debugging.
103
+ void PrintDebugInfo();
104
+
105
+ // These are all the nodes formed by Compile. Essentially, there is
106
+ // one node for each unique atom and each unique AND/OR node.
107
+ vector<Entry> entries_;
108
+
109
+ // Map node string to canonical Prefilter node.
110
+ map<string, Prefilter*> node_map_;
111
+
112
+ // indices of regexps that always pass through the filter (since we
113
+ // found no required literals in these regexps).
114
+ vector<int> unfiltered_;
115
+
116
+ // vector of Prefilter for all regexps.
117
+ vector<Prefilter*> prefilter_vec_;
118
+
119
+ // Atom index in returned strings to entry id mapping.
120
+ vector<int> atom_index_to_id_;
121
+
122
+ // Has the prefilter tree been compiled.
123
+ bool compiled_;
124
+
125
+ DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
126
+ };
127
+
128
+ } // namespace
129
+
130
+ #endif // RE2_PREFILTER_TREE_H_