chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,105 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Prefilter is the class used to extract string guards from regexps.
6
+ // Rather than using Prefilter class directly, use FilteredRE2.
7
+ // See filtered_re2.h
8
+
9
+ #ifndef RE2_PREFILTER_H_
10
+ #define RE2_PREFILTER_H_
11
+
12
+ #include "util/util.h"
13
+
14
+ namespace re2 {
15
+
16
+ class RE2;
17
+
18
+ class Regexp;
19
+
20
+ class Prefilter {
21
+ // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
22
+ public:
23
+ enum Op {
24
+ ALL = 0, // Everything matches
25
+ NONE, // Nothing matches
26
+ ATOM, // The string atom() must match
27
+ AND, // All in subs() must match
28
+ OR, // One of subs() must match
29
+ };
30
+
31
+ explicit Prefilter(Op op);
32
+ ~Prefilter();
33
+
34
+ Op op() { return op_; }
35
+ const string& atom() const { return atom_; }
36
+ void set_unique_id(int id) { unique_id_ = id; }
37
+ int unique_id() const { return unique_id_; }
38
+
39
+ // The children of the Prefilter node.
40
+ vector<Prefilter*>* subs() {
41
+ CHECK(op_ == AND || op_ == OR);
42
+ return subs_;
43
+ }
44
+
45
+ // Set the children vector. Prefilter takes ownership of subs and
46
+ // subs_ will be deleted when Prefilter is deleted.
47
+ void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
48
+
49
+ // Given a RE2, return a Prefilter. The caller takes ownership of
50
+ // the Prefilter and should deallocate it. Returns NULL if Prefilter
51
+ // cannot be formed.
52
+ static Prefilter* FromRE2(const RE2* re2);
53
+
54
+ // Returns a readable debug string of the prefilter.
55
+ string DebugString() const;
56
+
57
+ private:
58
+ class Info;
59
+
60
+ // Combines two prefilters together to create an AND. The passed
61
+ // Prefilters will be part of the returned Prefilter or deleted.
62
+ static Prefilter* And(Prefilter* a, Prefilter* b);
63
+
64
+ // Combines two prefilters together to create an OR. The passed
65
+ // Prefilters will be part of the returned Prefilter or deleted.
66
+ static Prefilter* Or(Prefilter* a, Prefilter* b);
67
+
68
+ // Generalized And/Or
69
+ static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
70
+
71
+ static Prefilter* FromRegexp(Regexp* a);
72
+
73
+ static Prefilter* FromString(const string& str);
74
+
75
+ static Prefilter* OrStrings(set<string>* ss);
76
+
77
+ static Info* BuildInfo(Regexp* re);
78
+
79
+ Prefilter* Simplify();
80
+
81
+ // Kind of Prefilter.
82
+ Op op_;
83
+
84
+ // Sub-matches for AND or OR Prefilter.
85
+ vector<Prefilter*>* subs_;
86
+
87
+ // Actual string to match in leaf node.
88
+ string atom_;
89
+
90
+ // If different prefilters have the same string atom, or if they are
91
+ // structurally the same (e.g., OR of same atom strings) they are
92
+ // considered the same unique nodes. This is the id for each unique
93
+ // node. This field is populated with a unique id for every node,
94
+ // and -1 for duplicate nodes.
95
+ int unique_id_;
96
+
97
+ // Used for debugging, helps in tracking memory leaks.
98
+ int alloc_id_;
99
+
100
+ DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
101
+ };
102
+
103
+ } // namespace re2
104
+
105
+ #endif // RE2_PREFILTER_H_
@@ -0,0 +1,398 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "util/util.h"
6
+ #include "util/flags.h"
7
+ #include "re2/prefilter.h"
8
+ #include "re2/prefilter_tree.h"
9
+ #include "re2/re2.h"
10
+
11
+ DEFINE_int32(filtered_re2_min_atom_len,
12
+ 3,
13
+ "Strings less than this length are not stored as atoms");
14
+
15
+ namespace re2 {
16
+
17
+ PrefilterTree::PrefilterTree()
18
+ : compiled_(false) {
19
+ }
20
+
21
+ PrefilterTree::~PrefilterTree() {
22
+ for (int i = 0; i < prefilter_vec_.size(); i++)
23
+ delete prefilter_vec_[i];
24
+
25
+ for (int i = 0; i < entries_.size(); i++)
26
+ delete entries_[i].parents;
27
+ }
28
+
29
+ // Functions used for adding and Compiling prefilters to the
30
+ // PrefilterTree.
31
+ static bool KeepPart(Prefilter* prefilter, int level) {
32
+ if (prefilter == NULL)
33
+ return false;
34
+
35
+ switch (prefilter->op()) {
36
+ default:
37
+ LOG(DFATAL) << "Unexpected op in KeepPart: "
38
+ << prefilter->op();
39
+ return false;
40
+
41
+ case Prefilter::ALL:
42
+ return false;
43
+
44
+ case Prefilter::ATOM:
45
+ return prefilter->atom().size() >=
46
+ FLAGS_filtered_re2_min_atom_len;
47
+
48
+ case Prefilter::AND: {
49
+ int j = 0;
50
+ vector<Prefilter*>* subs = prefilter->subs();
51
+ for (int i = 0; i < subs->size(); i++)
52
+ if (KeepPart((*subs)[i], level + 1))
53
+ (*subs)[j++] = (*subs)[i];
54
+ else
55
+ delete (*subs)[i];
56
+
57
+ subs->resize(j);
58
+ return j > 0;
59
+ }
60
+
61
+ case Prefilter::OR:
62
+ for (int i = 0; i < prefilter->subs()->size(); i++)
63
+ if (!KeepPart((*prefilter->subs())[i], level + 1))
64
+ return false;
65
+ return true;
66
+ }
67
+ }
68
+
69
+ void PrefilterTree::Add(Prefilter *f) {
70
+ if (compiled_) {
71
+ LOG(DFATAL) << "Add after Compile.";
72
+ return;
73
+ }
74
+ if (f != NULL && !KeepPart(f, 0)) {
75
+ delete f;
76
+ f = NULL;
77
+ }
78
+
79
+ prefilter_vec_.push_back(f);
80
+ }
81
+
82
+ void PrefilterTree::Compile(vector<string>* atom_vec) {
83
+ if (compiled_) {
84
+ LOG(DFATAL) << "Compile after Compile.";
85
+ return;
86
+ }
87
+
88
+ // We do this check to support some legacy uses of
89
+ // PrefilterTree that call Compile before adding any regexps,
90
+ // and expect Compile not to have effect.
91
+ if (prefilter_vec_.empty())
92
+ return;
93
+
94
+ compiled_ = true;
95
+
96
+ AssignUniqueIds(atom_vec);
97
+
98
+ // Identify nodes that are too common among prefilters and are
99
+ // triggering too many parents. Then get rid of them if possible.
100
+ // Note that getting rid of a prefilter node simply means they are
101
+ // no longer necessary for their parent to trigger; that is, we do
102
+ // not miss out on any regexps triggering by getting rid of a
103
+ // prefilter node.
104
+ for (int i = 0; i < entries_.size(); i++) {
105
+ IntMap* parents = entries_[i].parents;
106
+ if (parents->size() > 8) {
107
+ // This one triggers too many things. If all the parents are AND
108
+ // nodes and have other things guarding them, then get rid of
109
+ // this trigger. TODO(vsri): Adjust the threshold appropriately,
110
+ // make it a function of total number of nodes?
111
+ bool have_other_guard = true;
112
+ for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
113
+ have_other_guard = have_other_guard &&
114
+ (entries_[it->index()].propagate_up_at_count > 1);
115
+
116
+ if (have_other_guard) {
117
+ for (IntMap::iterator it = parents->begin();
118
+ it != parents->end(); ++it)
119
+ entries_[it->index()].propagate_up_at_count -= 1;
120
+
121
+ parents->clear(); // Forget the parents
122
+ }
123
+ }
124
+ }
125
+
126
+ PrintDebugInfo();
127
+ }
128
+
129
+ Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
130
+ string node_string = NodeString(node);
131
+ map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
132
+ if (iter == node_map_.end())
133
+ return NULL;
134
+ return (*iter).second;
135
+ }
136
+
137
// Formats |n| as a decimal string using the "%d" conversion.
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof(digits), "%d", n);
  string text(digits);
  return text;
}
142
+
143
+ string PrefilterTree::NodeString(Prefilter* node) const {
144
+ // Adding the operation disambiguates AND/OR/atom nodes.
145
+ string s = Itoa(node->op()) + ":";
146
+ if (node->op() == Prefilter::ATOM) {
147
+ s += node->atom();
148
+ } else {
149
+ for (int i = 0; i < node->subs()->size() ; i++) {
150
+ if (i > 0)
151
+ s += ',';
152
+ s += Itoa((*node->subs())[i]->unique_id());
153
+ }
154
+ }
155
+ return s;
156
+ }
157
+
158
+ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
159
+ atom_vec->clear();
160
+
161
+ // Build vector of all filter nodes, sorted topologically
162
+ // from top to bottom in v.
163
+ vector<Prefilter*> v;
164
+
165
+ // Add the top level nodes of each regexp prefilter.
166
+ for (int i = 0; i < prefilter_vec_.size(); i++) {
167
+ Prefilter* f = prefilter_vec_[i];
168
+ if (f == NULL)
169
+ unfiltered_.push_back(i);
170
+
171
+ // We push NULL also on to v, so that we maintain the
172
+ // mapping of index==regexpid for level=0 prefilter nodes.
173
+ v.push_back(f);
174
+ }
175
+
176
+ // Now add all the descendant nodes.
177
+ for (int i = 0; i < v.size(); i++) {
178
+ Prefilter* f = v[i];
179
+ if (f == NULL)
180
+ continue;
181
+ if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
182
+ const vector<Prefilter*>& subs = *f->subs();
183
+ for (int j = 0; j < subs.size(); j++)
184
+ v.push_back(subs[j]);
185
+ }
186
+ }
187
+
188
+ // Identify unique nodes.
189
+ int unique_id = 0;
190
+ for (int i = v.size() - 1; i >= 0; i--) {
191
+ Prefilter *node = v[i];
192
+ if (node == NULL)
193
+ continue;
194
+ node->set_unique_id(-1);
195
+ Prefilter* canonical = CanonicalNode(node);
196
+ if (canonical == NULL) {
197
+ // Any further nodes that have the same node string
198
+ // will find this node as the canonical node.
199
+ node_map_[NodeString(node)] = node;
200
+ if (node->op() == Prefilter::ATOM) {
201
+ atom_vec->push_back(node->atom());
202
+ atom_index_to_id_.push_back(unique_id);
203
+ }
204
+ node->set_unique_id(unique_id++);
205
+ } else {
206
+ node->set_unique_id(canonical->unique_id());
207
+ }
208
+ }
209
+ entries_.resize(node_map_.size());
210
+
211
+ // Create parent IntMap for the entries.
212
+ for (int i = v.size() - 1; i >= 0; i--) {
213
+ Prefilter* prefilter = v[i];
214
+ if (prefilter == NULL)
215
+ continue;
216
+
217
+ if (CanonicalNode(prefilter) != prefilter)
218
+ continue;
219
+
220
+ Entry* entry = &entries_[prefilter->unique_id()];
221
+ entry->parents = new IntMap(node_map_.size());
222
+ }
223
+
224
+ // Fill the entries.
225
+ for (int i = v.size() - 1; i >= 0; i--) {
226
+ Prefilter* prefilter = v[i];
227
+ if (prefilter == NULL)
228
+ continue;
229
+
230
+ if (CanonicalNode(prefilter) != prefilter)
231
+ continue;
232
+
233
+ Entry* entry = &entries_[prefilter->unique_id()];
234
+
235
+ switch (prefilter->op()) {
236
+ default:
237
+ case Prefilter::ALL:
238
+ LOG(DFATAL) << "Unexpected op: " << prefilter->op();
239
+ return;
240
+
241
+ case Prefilter::ATOM:
242
+ entry->propagate_up_at_count = 1;
243
+ break;
244
+
245
+ case Prefilter::OR:
246
+ case Prefilter::AND: {
247
+ IntMap uniq_child(node_map_.size());
248
+ for (int j = 0; j < prefilter->subs()->size() ; j++) {
249
+ Prefilter* child = (*prefilter->subs())[j];
250
+ Prefilter* canonical = CanonicalNode(child);
251
+ if (canonical == NULL) {
252
+ LOG(DFATAL) << "Null canonical node";
253
+ return;
254
+ }
255
+ int child_id = canonical->unique_id();
256
+ if (!uniq_child.has_index(child_id))
257
+ uniq_child.set_new(child_id, 1);
258
+ // To the child, we want to add to parent indices.
259
+ Entry* child_entry = &entries_[child_id];
260
+ if (!child_entry->parents->has_index(prefilter->unique_id()))
261
+ child_entry->parents->set_new(prefilter->unique_id(), 1);
262
+ }
263
+ entry->propagate_up_at_count =
264
+ prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
265
+
266
+ break;
267
+ }
268
+ }
269
+ }
270
+
271
+ // For top level nodes, populate regexp id.
272
+ for (int i = 0; i < prefilter_vec_.size(); i++) {
273
+ if (prefilter_vec_[i] == NULL)
274
+ continue;
275
+ int id = CanonicalNode(prefilter_vec_[i])->unique_id();
276
+ DCHECK_LE(0, id);
277
+ Entry* entry = &entries_[id];
278
+ entry->regexps.push_back(i);
279
+ }
280
+ }
281
+
282
+ // Functions for triggering during search.
283
+ void PrefilterTree::RegexpsGivenStrings(
284
+ const vector<int>& matched_atoms,
285
+ vector<int>* regexps) const {
286
+ regexps->clear();
287
+ if (!compiled_) {
288
+ LOG(WARNING) << "Compile() not called";
289
+ for (int i = 0; i < prefilter_vec_.size(); ++i)
290
+ regexps->push_back(i);
291
+ } else {
292
+ if (!prefilter_vec_.empty()) {
293
+ IntMap regexps_map(prefilter_vec_.size());
294
+ vector<int> matched_atom_ids;
295
+ for (int j = 0; j < matched_atoms.size(); j++) {
296
+ matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
297
+ VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
298
+ }
299
+ PropagateMatch(matched_atom_ids, &regexps_map);
300
+ for (IntMap::iterator it = regexps_map.begin();
301
+ it != regexps_map.end();
302
+ ++it)
303
+ regexps->push_back(it->index());
304
+
305
+ regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
306
+ }
307
+ }
308
+ sort(regexps->begin(), regexps->end());
309
+ }
310
+
311
+ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
312
+ IntMap* regexps) const {
313
+ IntMap count(entries_.size());
314
+ IntMap work(entries_.size());
315
+ for (int i = 0; i < atom_ids.size(); i++)
316
+ work.set(atom_ids[i], 1);
317
+ for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
318
+ const Entry& entry = entries_[it->index()];
319
+ VLOG(10) << "Processing: " << it->index();
320
+ // Record regexps triggered.
321
+ for (int i = 0; i < entry.regexps.size(); i++) {
322
+ VLOG(10) << "Regexp triggered: " << entry.regexps[i];
323
+ regexps->set(entry.regexps[i], 1);
324
+ }
325
+ int c;
326
+ // Pass trigger up to parents.
327
+ for (IntMap::iterator it = entry.parents->begin();
328
+ it != entry.parents->end();
329
+ ++it) {
330
+ int j = it->index();
331
+ const Entry& parent = entries_[j];
332
+ VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
333
+ // Delay until all the children have succeeded.
334
+ if (parent.propagate_up_at_count > 1) {
335
+ if (count.has_index(j)) {
336
+ c = count.get_existing(j) + 1;
337
+ count.set_existing(j, c);
338
+ } else {
339
+ c = 1;
340
+ count.set_new(j, c);
341
+ }
342
+ if (c < parent.propagate_up_at_count)
343
+ continue;
344
+ }
345
+ VLOG(10) << "Triggering: " << j;
346
+ // Trigger the parent.
347
+ work.set(j, 1);
348
+ }
349
+ }
350
+ }
351
+
352
+ // Debugging help.
353
+ void PrefilterTree::PrintPrefilter(int regexpid) {
354
+ LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
355
+ }
356
+
357
+ void PrefilterTree::PrintDebugInfo() {
358
+ VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
359
+ VLOG(10) << "#Unique Nodes: " << entries_.size();
360
+
361
+ for (int i = 0; i < entries_.size(); ++i) {
362
+ IntMap* parents = entries_[i].parents;
363
+ const vector<int>& regexps = entries_[i].regexps;
364
+ VLOG(10) << "EntryId: " << i
365
+ << " N: " << parents->size() << " R: " << regexps.size();
366
+ for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
367
+ VLOG(10) << it->index();
368
+ }
369
+ VLOG(10) << "Map:";
370
+ for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
371
+ iter != node_map_.end(); ++iter)
372
+ VLOG(10) << "NodeId: " << (*iter).second->unique_id()
373
+ << " Str: " << (*iter).first;
374
+ }
375
+
376
+ string PrefilterTree::DebugNodeString(Prefilter* node) const {
377
+ string node_string = "";
378
+
379
+ if (node->op() == Prefilter::ATOM) {
380
+ DCHECK(!node->atom().empty());
381
+ node_string += node->atom();
382
+ } else {
383
+ // Adding the operation disambiguates AND and OR nodes.
384
+ node_string += node->op() == Prefilter::AND ? "AND" : "OR";
385
+ node_string += "(";
386
+ for (int i = 0; i < node->subs()->size() ; i++) {
387
+ if (i > 0)
388
+ node_string += ',';
389
+ node_string += Itoa((*node->subs())[i]->unique_id());
390
+ node_string += ":";
391
+ node_string += DebugNodeString((*node->subs())[i]);
392
+ }
393
+ node_string += ")";
394
+ }
395
+ return node_string;
396
+ }
397
+
398
+ } // namespace re2
@@ -0,0 +1,130 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // The PrefilterTree class is used to form an AND-OR tree of strings
6
+ // that would trigger each regexp. The 'prefilter' of each regexp is
7
+ // added to PrefilterTree, and then PrefilterTree is used to find all
8
+ // the unique strings across the prefilters. During search, by using
9
+ // matches from a string matching engine, PrefilterTree deduces the
10
+ // set of regexps that are to be triggered. The 'string matching
11
+ // engine' itself is outside of this class, and the caller can use any
12
+ // favorite engine. PrefilterTree provides a set of strings (called
13
+ // atoms) that the user of this class should use to do the string
14
+ // matching.
15
+ //
16
+ #ifndef RE2_PREFILTER_TREE_H_
17
+ #define RE2_PREFILTER_TREE_H_
18
+
19
+ #include "util/util.h"
20
+ #include "util/sparse_array.h"
21
+
22
+ namespace re2 {
23
+
24
+ typedef SparseArray<int> IntMap;
25
+
26
+ class Prefilter;
27
+
28
+ class PrefilterTree {
29
+ public:
30
+ PrefilterTree();
31
+ ~PrefilterTree();
32
+
33
+ // Adds the prefilter for the next regexp. Note that we assume that
34
+ // Add called sequentially for all regexps. All Add calls
35
+ // must precede Compile.
36
+ void Add(Prefilter* prefilter);
37
+
38
+ // The Compile returns a vector of string in atom_vec.
39
+ // Call this after all the prefilters are added through Add.
40
+ // No calls to Add after Compile are allowed.
41
+ // The caller should use the returned set of strings to do string matching.
42
+ // Each time a string matches, the corresponding index then has to be
43
+ // and passed to RegexpsGivenStrings below.
44
+ void Compile(vector<string>* atom_vec);
45
+
46
+ // Given the indices of the atoms that matched, returns the indexes
47
+ // of regexps that should be searched. The matched_atoms should
48
+ // contain all the ids of string atoms that were found to match the
49
+ // content. The caller can use any string match engine to perform
50
+ // this function. This function is thread safe.
51
+ void RegexpsGivenStrings(const vector<int>& matched_atoms,
52
+ vector<int>* regexps) const;
53
+
54
+ // Print debug prefilter. Also prints unique ids associated with
55
+ // nodes of the prefilter of the regexp.
56
+ void PrintPrefilter(int regexpid);
57
+
58
+
59
+ // Each unique node has a corresponding Entry that helps in
60
+ // passing the matching trigger information along the tree.
61
+ struct Entry {
62
+ public:
63
+ // How many children should match before this node triggers the
64
+ // parent. For an atom and an OR node, this is 1 and for an AND
65
+ // node, it is the number of unique children.
66
+ int propagate_up_at_count;
67
+
68
+ // When this node is ready to trigger the parent, what are the indices
69
+ // of the parent nodes to trigger. The reason there may be more than
70
+ // one is because of sharing. For example (abc | def) and (xyz | def)
71
+ // are two different nodes, but they share the atom 'def'. So when
72
+ // 'def' matches, it triggers two parents, corresponding to the two
73
+ // different OR nodes.
74
+ IntMap* parents;
75
+
76
+ // When this node is ready to trigger the parent, what are the
77
+ // regexps that are triggered.
78
+ vector<int> regexps;
79
+ };
80
+
81
+ private:
82
+ // This function assigns unique ids to various parts of the
83
+ // prefilter, by looking at if these nodes are already in the
84
+ // PrefilterTree.
85
+ void AssignUniqueIds(vector<string>* atom_vec);
86
+
87
+ // Given the matching atoms, find the regexps to be triggered.
88
+ void PropagateMatch(const vector<int>& atom_ids,
89
+ IntMap* regexps) const;
90
+
91
+ // Returns the prefilter node that has the same NodeString as this
92
+ // node. For the canonical node, returns node.
93
+ Prefilter* CanonicalNode(Prefilter* node);
94
+
95
+ // A string that uniquely identifies the node. Assumes that the
96
+ // children of node has already been assigned unique ids.
97
+ string NodeString(Prefilter* node) const;
98
+
99
+ // Recursively constructs a readable prefilter string.
100
+ string DebugNodeString(Prefilter* node) const;
101
+
102
+ // Used for debugging.
103
+ void PrintDebugInfo();
104
+
105
+ // These are all the nodes formed by Compile. Essentially, there is
106
+ // one node for each unique atom and each unique AND/OR node.
107
+ vector<Entry> entries_;
108
+
109
+ // Map node string to canonical Prefilter node.
110
+ map<string, Prefilter*> node_map_;
111
+
112
+ // indices of regexps that always pass through the filter (since we
113
+ // found no required literals in these regexps).
114
+ vector<int> unfiltered_;
115
+
116
+ // vector of Prefilter for all regexps.
117
+ vector<Prefilter*> prefilter_vec_;
118
+
119
+ // Atom index in returned strings to entry id mapping.
120
+ vector<int> atom_index_to_id_;
121
+
122
+ // Has the prefilter tree been compiled.
123
+ bool compiled_;
124
+
125
+ DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
126
+ };
127
+
128
+ } // namespace
129
+
130
+ #endif // RE2_PREFILTER_TREE_H_