chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/prefilter.h
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Prefilter is the class used to extract string guards from regexps.
|
6
|
+
// Rather than using Prefilter class directly, use FilteredRE2.
|
7
|
+
// See filtered_re2.h
|
8
|
+
|
9
|
+
#ifndef RE2_PREFILTER_H_
|
10
|
+
#define RE2_PREFILTER_H_
|
11
|
+
|
12
|
+
#include "util/util.h"
|
13
|
+
|
14
|
+
namespace re2 {
|
15
|
+
|
16
|
+
class RE2;
|
17
|
+
|
18
|
+
class Regexp;
|
19
|
+
|
20
|
+
class Prefilter {
|
21
|
+
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
22
|
+
public:
|
23
|
+
enum Op {
|
24
|
+
ALL = 0, // Everything matches
|
25
|
+
NONE, // Nothing matches
|
26
|
+
ATOM, // The string atom() must match
|
27
|
+
AND, // All in subs() must match
|
28
|
+
OR, // One of subs() must match
|
29
|
+
};
|
30
|
+
|
31
|
+
explicit Prefilter(Op op);
|
32
|
+
~Prefilter();
|
33
|
+
|
34
|
+
Op op() { return op_; }
|
35
|
+
const string& atom() const { return atom_; }
|
36
|
+
void set_unique_id(int id) { unique_id_ = id; }
|
37
|
+
int unique_id() const { return unique_id_; }
|
38
|
+
|
39
|
+
// The children of the Prefilter node.
|
40
|
+
vector<Prefilter*>* subs() {
|
41
|
+
CHECK(op_ == AND || op_ == OR);
|
42
|
+
return subs_;
|
43
|
+
}
|
44
|
+
|
45
|
+
// Set the children vector. Prefilter takes ownership of subs and
|
46
|
+
// subs_ will be deleted when Prefilter is deleted.
|
47
|
+
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
48
|
+
|
49
|
+
// Given a RE2, return a Prefilter. The caller takes ownership of
|
50
|
+
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
51
|
+
// cannot be formed.
|
52
|
+
static Prefilter* FromRE2(const RE2* re2);
|
53
|
+
|
54
|
+
// Returns a readable debug string of the prefilter.
|
55
|
+
string DebugString() const;
|
56
|
+
|
57
|
+
private:
|
58
|
+
class Info;
|
59
|
+
|
60
|
+
// Combines two prefilters together to create an AND. The passed
|
61
|
+
// Prefilters will be part of the returned Prefilter or deleted.
|
62
|
+
static Prefilter* And(Prefilter* a, Prefilter* b);
|
63
|
+
|
64
|
+
// Combines two prefilters together to create an OR. The passed
|
65
|
+
// Prefilters will be part of the returned Prefilter or deleted.
|
66
|
+
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
67
|
+
|
68
|
+
// Generalized And/Or
|
69
|
+
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
70
|
+
|
71
|
+
static Prefilter* FromRegexp(Regexp* a);
|
72
|
+
|
73
|
+
static Prefilter* FromString(const string& str);
|
74
|
+
|
75
|
+
static Prefilter* OrStrings(set<string>* ss);
|
76
|
+
|
77
|
+
static Info* BuildInfo(Regexp* re);
|
78
|
+
|
79
|
+
Prefilter* Simplify();
|
80
|
+
|
81
|
+
// Kind of Prefilter.
|
82
|
+
Op op_;
|
83
|
+
|
84
|
+
// Sub-matches for AND or OR Prefilter.
|
85
|
+
vector<Prefilter*>* subs_;
|
86
|
+
|
87
|
+
// Actual string to match in leaf node.
|
88
|
+
string atom_;
|
89
|
+
|
90
|
+
// If different prefilters have the same string atom, or if they are
|
91
|
+
// structurally the same (e.g., OR of same atom strings) they are
|
92
|
+
// considered the same unique nodes. This is the id for each unique
|
93
|
+
// node. This field is populated with a unique id for every node,
|
94
|
+
// and -1 for duplicate nodes.
|
95
|
+
int unique_id_;
|
96
|
+
|
97
|
+
// Used for debugging, helps in tracking memory leaks.
|
98
|
+
int alloc_id_;
|
99
|
+
|
100
|
+
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
|
101
|
+
};
|
102
|
+
|
103
|
+
} // namespace re2
|
104
|
+
|
105
|
+
#endif // RE2_PREFILTER_H_
|
@@ -0,0 +1,398 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "util/util.h"
|
6
|
+
#include "util/flags.h"
|
7
|
+
#include "re2/prefilter.h"
|
8
|
+
#include "re2/prefilter_tree.h"
|
9
|
+
#include "re2/re2.h"
|
10
|
+
|
11
|
+
// Minimum atom length flag (default 3); KeepPart compares each atom's
// size against it.
DEFINE_int32(filtered_re2_min_atom_len, 3,
             "Strings less than this length are not stored as atoms");
|
14
|
+
|
15
|
+
namespace re2 {
|
16
|
+
|
17
|
+
// Creates an empty tree in the "not yet compiled" state.
PrefilterTree::PrefilterTree() : compiled_(false) {}
|
20
|
+
|
21
|
+
// Deletes every Prefilter stored in prefilter_vec_ and the parent map of
// every entry.
PrefilterTree::~PrefilterTree() {
  for (vector<Prefilter*>::iterator pf = prefilter_vec_.begin();
       pf != prefilter_vec_.end(); ++pf) {
    delete *pf;
  }

  for (vector<Entry>::iterator e = entries_.begin();
       e != entries_.end(); ++e) {
    delete e->parents;
  }
}
|
28
|
+
|
29
|
+
// Functions used for adding and Compiling prefilters to the
|
30
|
+
// PrefilterTree.
|
31
|
+
// Decides whether `prefilter` is usable as a guard, pruning AND nodes in
// place while doing so.
//
//   NULL, ALL, NONE -> false.
//   ATOM            -> true iff the atom is at least
//                      FLAGS_filtered_re2_min_atom_len characters long.
//   AND             -> children that are not kept are deleted and removed
//                      from subs(); true iff at least one child remains.
//   OR              -> true iff every child is kept (stops at the first
//                      child that is not).
//
// `level` is the recursion depth; it is only passed along.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;

    // NONE is a declared Op value, so it must not reach the DFATAL branch
    // above; like ALL it provides nothing to guard on.
    case Prefilter::ALL:
    case Prefilter::NONE:
      return false;

    case Prefilter::ATOM:
      // Compare as signed ints: comparing size() (unsigned) directly with
      // the int flag would turn a negative flag value into a huge
      // unsigned threshold and reject every atom.
      return static_cast<int>(prefilter->atom().size()) >=
             FLAGS_filtered_re2_min_atom_len;

    case Prefilter::AND: {
      vector<Prefilter*>* subs = prefilter->subs();
      size_t kept = 0;
      for (size_t i = 0; i < subs->size(); i++) {
        if (KeepPart((*subs)[i], level + 1))
          (*subs)[kept++] = (*subs)[i];  // compact survivors to the front
        else
          delete (*subs)[i];
      }

      subs->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      vector<Prefilter*>* subs = prefilter->subs();
      for (size_t i = 0; i < subs->size(); i++) {
        if (!KeepPart((*subs)[i], level + 1))
          return false;
      }
      return true;
    }
  }
}
|
68
|
+
|
69
|
+
// Records the prefilter of the next regexp.  A prefilter that KeepPart
// rejects is deleted and a NULL is recorded in its place, so that
// prefilter_vec_ still gets exactly one slot per call.  Calling Add after
// Compile logs a DFATAL and does nothing else.
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL && !KeepPart(kept, 0)) {
    delete kept;
    kept = NULL;
  }
  prefilter_vec_.push_back(kept);
}
|
81
|
+
|
82
|
+
// Finalizes the tree: assigns unique ids (filling atom_vec), then drops
// triggers that fan out to too many parents.  A second call logs a DFATAL
// and returns.
void PrefilterTree::Compile(vector<string>* atom_vec) {
  if (compiled_) {
    LOG(DFATAL) << "Compile after Compile.";
    return;
  }

  // Some legacy users call Compile before adding any regexps and expect
  // that call to have no effect, so bail out without marking the tree as
  // compiled.
  if (prefilter_vec_.empty())
    return;

  compiled_ = true;

  AssignUniqueIds(atom_vec);

  // Look for nodes that are so common among prefilters that they trigger
  // too many parents, and get rid of them where possible.  Getting rid of
  // a node only means its parents no longer need it in order to trigger;
  // no regexp is missed because of it.
  for (int i = 0; i < entries_.size(); i++) {
    IntMap* parents = entries_[i].parents;
    if (parents->size() <= 8)
      continue;

    // This node triggers too many things.  It can be dropped only if
    // every parent still has something else guarding it.
    // TODO(vsri): Adjust the threshold appropriately,
    // make it a function of total number of nodes?
    bool have_other_guard = true;
    for (IntMap::iterator p = parents->begin(); p != parents->end(); ++p) {
      if (entries_[p->index()].propagate_up_at_count <= 1) {
        have_other_guard = false;
        break;
      }
    }
    if (!have_other_guard)
      continue;

    // Each parent now waits for one child fewer.
    for (IntMap::iterator p = parents->begin(); p != parents->end(); ++p)
      entries_[p->index()].propagate_up_at_count -= 1;

    parents->clear();  // Forget the parents
  }

  PrintDebugInfo();
}
|
128
|
+
|
129
|
+
// Looks up the node registered in node_map_ under node's NodeString.
// Returns that node, or NULL when nothing is registered under the string.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  const string key = NodeString(node);
  map<string, Prefilter*>::iterator pos = node_map_.find(key);
  if (pos != node_map_.end())
    return pos->second;
  return NULL;
}
|
136
|
+
|
137
|
+
// Returns the decimal text of n.
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof(digits), "%d", n);
  string text(digits);
  return text;
}
|
142
|
+
|
143
|
+
// Builds the key used to detect identical nodes: "<op>:" followed by the
// atom string for an ATOM node, or by the comma-separated unique ids of
// the children otherwise.  The children must already carry their ids.
string PrefilterTree::NodeString(Prefilter* node) const {
  // The leading operation number keeps AND/OR/atom nodes apart.
  string key = Itoa(node->op());
  key += ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  vector<Prefilter*>* kids = node->subs();
  for (int i = 0; i < kids->size(); i++) {
    if (i > 0)
      key += ',';
    key += Itoa((*kids)[i]->unique_id());
  }
  return key;
}
|
157
|
+
|
158
|
+
// Gives every structurally distinct prefilter node a unique id and builds
// entries_ from those nodes.
//
// atom_vec is cleared and then receives the string of each distinct ATOM
// node; atom_index_to_id_ receives, at the same position, the id assigned
// to that atom.  The index of every regexp whose prefilter is NULL is
// appended to unfiltered_.
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
  atom_vec->clear();

  // All filter nodes, ordered from the top of the trees to the bottom.
  vector<Prefilter*> nodes;

  // Start with the top-level node of each regexp.  NULLs are stored too,
  // which keeps index == regexp id for these level-0 slots.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* top = prefilter_vec_[i];
    if (top == NULL)
      unfiltered_.push_back(i);
    nodes.push_back(top);
  }

  // Append the descendants.  `nodes` grows while it is scanned, so the
  // scan has to go by index.
  for (int i = 0; i < nodes.size(); i++) {
    Prefilter* cur = nodes[i];
    if (cur == NULL)
      continue;
    if (cur->op() != Prefilter::AND && cur->op() != Prefilter::OR)
      continue;
    const vector<Prefilter*>& kids = *cur->subs();
    nodes.insert(nodes.end(), kids.begin(), kids.end());
  }

  // Identify the unique nodes, walking from the back so that children are
  // numbered before the parents whose node strings mention their ids.
  int next_id = 0;
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL)
      continue;

    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(node);
    if (canonical != NULL) {
      // Duplicate of an already registered node: share its id.
      node->set_unique_id(canonical->unique_id());
      continue;
    }

    // First node with this node string; later nodes with the same string
    // will find it as their canonical node.
    node_map_[NodeString(node)] = node;
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id);
    next_id++;
  }
  entries_.resize(node_map_.size());

  // Allocate the parent map of each canonical node's entry.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;

    entries_[node->unique_id()].parents = new IntMap(node_map_.size());
  }

  // Fill in the entries of the canonical nodes.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;

    const int self_id = node->unique_id();
    Entry* entry = &entries_[self_id];

    switch (node->op()) {
      default:
      case Prefilter::ALL:
        LOG(DFATAL) << "Unexpected op: " << node->op();
        return;

      case Prefilter::ATOM:
        entry->propagate_up_at_count = 1;
        break;

      case Prefilter::OR:
      case Prefilter::AND: {
        IntMap distinct_kids(node_map_.size());
        vector<Prefilter*>* kids = node->subs();
        for (int j = 0; j < kids->size(); j++) {
          Prefilter* canonical = CanonicalNode((*kids)[j]);
          if (canonical == NULL) {
            LOG(DFATAL) << "Null canonical node";
            return;
          }
          const int child_id = canonical->unique_id();
          if (!distinct_kids.has_index(child_id))
            distinct_kids.set_new(child_id, 1);

          // Register this node as a parent of the child.
          IntMap* child_parents = entries_[child_id].parents;
          if (!child_parents->has_index(self_id))
            child_parents->set_new(self_id, 1);
        }

        // An AND waits for all of its distinct children, an OR for one.
        if (node->op() == Prefilter::AND)
          entry->propagate_up_at_count = distinct_kids.size();
        else
          entry->propagate_up_at_count = 1;
        break;
      }
    }
  }

  // Attach each regexp id to the entry of its top-level node.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* top = prefilter_vec_[i];
    if (top == NULL)
      continue;
    const int id = CanonicalNode(top)->unique_id();
    DCHECK_LE(0, id);
    entries_[id].regexps.push_back(i);
  }
}
|
281
|
+
|
282
|
+
// Functions for triggering during search.
|
283
|
+
void PrefilterTree::RegexpsGivenStrings(
|
284
|
+
const vector<int>& matched_atoms,
|
285
|
+
vector<int>* regexps) const {
|
286
|
+
regexps->clear();
|
287
|
+
if (!compiled_) {
|
288
|
+
LOG(WARNING) << "Compile() not called";
|
289
|
+
for (int i = 0; i < prefilter_vec_.size(); ++i)
|
290
|
+
regexps->push_back(i);
|
291
|
+
} else {
|
292
|
+
if (!prefilter_vec_.empty()) {
|
293
|
+
IntMap regexps_map(prefilter_vec_.size());
|
294
|
+
vector<int> matched_atom_ids;
|
295
|
+
for (int j = 0; j < matched_atoms.size(); j++) {
|
296
|
+
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
297
|
+
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
298
|
+
}
|
299
|
+
PropagateMatch(matched_atom_ids, ®exps_map);
|
300
|
+
for (IntMap::iterator it = regexps_map.begin();
|
301
|
+
it != regexps_map.end();
|
302
|
+
++it)
|
303
|
+
regexps->push_back(it->index());
|
304
|
+
|
305
|
+
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
306
|
+
}
|
307
|
+
}
|
308
|
+
sort(regexps->begin(), regexps->end());
|
309
|
+
}
|
310
|
+
|
311
|
+
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
312
|
+
IntMap* regexps) const {
|
313
|
+
IntMap count(entries_.size());
|
314
|
+
IntMap work(entries_.size());
|
315
|
+
for (int i = 0; i < atom_ids.size(); i++)
|
316
|
+
work.set(atom_ids[i], 1);
|
317
|
+
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
318
|
+
const Entry& entry = entries_[it->index()];
|
319
|
+
VLOG(10) << "Processing: " << it->index();
|
320
|
+
// Record regexps triggered.
|
321
|
+
for (int i = 0; i < entry.regexps.size(); i++) {
|
322
|
+
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
323
|
+
regexps->set(entry.regexps[i], 1);
|
324
|
+
}
|
325
|
+
int c;
|
326
|
+
// Pass trigger up to parents.
|
327
|
+
for (IntMap::iterator it = entry.parents->begin();
|
328
|
+
it != entry.parents->end();
|
329
|
+
++it) {
|
330
|
+
int j = it->index();
|
331
|
+
const Entry& parent = entries_[j];
|
332
|
+
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
333
|
+
// Delay until all the children have succeeded.
|
334
|
+
if (parent.propagate_up_at_count > 1) {
|
335
|
+
if (count.has_index(j)) {
|
336
|
+
c = count.get_existing(j) + 1;
|
337
|
+
count.set_existing(j, c);
|
338
|
+
} else {
|
339
|
+
c = 1;
|
340
|
+
count.set_new(j, c);
|
341
|
+
}
|
342
|
+
if (c < parent.propagate_up_at_count)
|
343
|
+
continue;
|
344
|
+
}
|
345
|
+
VLOG(10) << "Triggering: " << j;
|
346
|
+
// Trigger the parent.
|
347
|
+
work.set(j, 1);
|
348
|
+
}
|
349
|
+
}
|
350
|
+
}
|
351
|
+
|
352
|
+
// Debugging help.
|
353
|
+
void PrefilterTree::PrintPrefilter(int regexpid) {
|
354
|
+
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
355
|
+
}
|
356
|
+
|
357
|
+
void PrefilterTree::PrintDebugInfo() {
|
358
|
+
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
359
|
+
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
360
|
+
|
361
|
+
for (int i = 0; i < entries_.size(); ++i) {
|
362
|
+
IntMap* parents = entries_[i].parents;
|
363
|
+
const vector<int>& regexps = entries_[i].regexps;
|
364
|
+
VLOG(10) << "EntryId: " << i
|
365
|
+
<< " N: " << parents->size() << " R: " << regexps.size();
|
366
|
+
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
367
|
+
VLOG(10) << it->index();
|
368
|
+
}
|
369
|
+
VLOG(10) << "Map:";
|
370
|
+
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
371
|
+
iter != node_map_.end(); ++iter)
|
372
|
+
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
373
|
+
<< " Str: " << (*iter).first;
|
374
|
+
}
|
375
|
+
|
376
|
+
// Recursively renders node in readable form: the atom string for an ATOM
// node, otherwise "AND(...)" or "OR(...)" with one "<unique id>:<child>"
// item per child, separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // The operation name keeps AND and OR nodes apart.
  string text = node->op() == Prefilter::AND ? "AND" : "OR";
  text += "(";

  vector<Prefilter*>* kids = node->subs();
  for (int i = 0; i < kids->size(); i++) {
    Prefilter* kid = (*kids)[i];
    if (i > 0)
      text += ',';
    text += Itoa(kid->unique_id());
    text += ":";
    text += DebugNodeString(kid);
  }

  text += ")";
  return text;
}
|
397
|
+
|
398
|
+
} // namespace re2
|
@@ -0,0 +1,130 @@
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// The PrefilterTree class is used to form an AND-OR tree of strings
|
6
|
+
// that would trigger each regexp. The 'prefilter' of each regexp is
|
7
|
+
// added to PrefilterTree, and then PrefilterTree is used to find all
|
8
|
+
// the unique strings across the prefilters. During search, by using
|
9
|
+
// matches from a string matching engine, PrefilterTree deduces the
|
10
|
+
// set of regexps that are to be triggered. The 'string matching
|
11
|
+
// engine' itself is outside of this class, and the caller can use any
|
12
|
+
// favorite engine. PrefilterTree provides a set of strings (called
|
13
|
+
// atoms) that the user of this class should use to do the string
|
14
|
+
// matching.
|
15
|
+
//
|
16
|
+
#ifndef RE2_PREFILTER_TREE_H_
|
17
|
+
#define RE2_PREFILTER_TREE_H_
|
18
|
+
|
19
|
+
#include "util/util.h"
|
20
|
+
#include "util/sparse_array.h"
|
21
|
+
|
22
|
+
namespace re2 {
|
23
|
+
|
24
|
+
typedef SparseArray<int> IntMap;
|
25
|
+
|
26
|
+
class Prefilter;
|
27
|
+
|
28
|
+
class PrefilterTree {
|
29
|
+
public:
|
30
|
+
PrefilterTree();
|
31
|
+
~PrefilterTree();
|
32
|
+
|
33
|
+
// Adds the prefilter for the next regexp. Note that we assume that
|
34
|
+
// Add called sequentially for all regexps. All Add calls
|
35
|
+
// must precede Compile.
|
36
|
+
void Add(Prefilter* prefilter);
|
37
|
+
|
38
|
+
// The Compile returns a vector of string in atom_vec.
|
39
|
+
// Call this after all the prefilters are added through Add.
|
40
|
+
// No calls to Add after Compile are allowed.
|
41
|
+
// The caller should use the returned set of strings to do string matching.
|
42
|
+
// Each time a string matches, the corresponding index then has to be
|
43
|
+
// and passed to RegexpsGivenStrings below.
|
44
|
+
void Compile(vector<string>* atom_vec);
|
45
|
+
|
46
|
+
// Given the indices of the atoms that matched, returns the indexes
|
47
|
+
// of regexps that should be searched. The matched_atoms should
|
48
|
+
// contain all the ids of string atoms that were found to match the
|
49
|
+
// content. The caller can use any string match engine to perform
|
50
|
+
// this function. This function is thread safe.
|
51
|
+
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
52
|
+
vector<int>* regexps) const;
|
53
|
+
|
54
|
+
// Print debug prefilter. Also prints unique ids associated with
|
55
|
+
// nodes of the prefilter of the regexp.
|
56
|
+
void PrintPrefilter(int regexpid);
|
57
|
+
|
58
|
+
|
59
|
+
// Each unique node has a corresponding Entry that helps in
|
60
|
+
// passing the matching trigger information along the tree.
|
61
|
+
struct Entry {
|
62
|
+
public:
|
63
|
+
// How many children should match before this node triggers the
|
64
|
+
// parent. For an atom and an OR node, this is 1 and for an AND
|
65
|
+
// node, it is the number of unique children.
|
66
|
+
int propagate_up_at_count;
|
67
|
+
|
68
|
+
// When this node is ready to trigger the parent, what are the indices
|
69
|
+
// of the parent nodes to trigger. The reason there may be more than
|
70
|
+
// one is because of sharing. For example (abc | def) and (xyz | def)
|
71
|
+
// are two different nodes, but they share the atom 'def'. So when
|
72
|
+
// 'def' matches, it triggers two parents, corresponding to the two
|
73
|
+
// different OR nodes.
|
74
|
+
IntMap* parents;
|
75
|
+
|
76
|
+
// When this node is ready to trigger the parent, what are the
|
77
|
+
// regexps that are triggered.
|
78
|
+
vector<int> regexps;
|
79
|
+
};
|
80
|
+
|
81
|
+
private:
|
82
|
+
// This function assigns unique ids to various parts of the
|
83
|
+
// prefilter, by looking at if these nodes are already in the
|
84
|
+
// PrefilterTree.
|
85
|
+
void AssignUniqueIds(vector<string>* atom_vec);
|
86
|
+
|
87
|
+
// Given the matching atoms, find the regexps to be triggered.
|
88
|
+
void PropagateMatch(const vector<int>& atom_ids,
|
89
|
+
IntMap* regexps) const;
|
90
|
+
|
91
|
+
// Returns the prefilter node that has the same NodeString as this
|
92
|
+
// node. For the canonical node, returns node.
|
93
|
+
Prefilter* CanonicalNode(Prefilter* node);
|
94
|
+
|
95
|
+
// A string that uniquely identifies the node. Assumes that the
|
96
|
+
// children of node has already been assigned unique ids.
|
97
|
+
string NodeString(Prefilter* node) const;
|
98
|
+
|
99
|
+
// Recursively constructs a readable prefilter string.
|
100
|
+
string DebugNodeString(Prefilter* node) const;
|
101
|
+
|
102
|
+
// Used for debugging.
|
103
|
+
void PrintDebugInfo();
|
104
|
+
|
105
|
+
// These are all the nodes formed by Compile. Essentially, there is
|
106
|
+
// one node for each unique atom and each unique AND/OR node.
|
107
|
+
vector<Entry> entries_;
|
108
|
+
|
109
|
+
// Map node string to canonical Prefilter node.
|
110
|
+
map<string, Prefilter*> node_map_;
|
111
|
+
|
112
|
+
// indices of regexps that always pass through the filter (since we
|
113
|
+
// found no required literals in these regexps).
|
114
|
+
vector<int> unfiltered_;
|
115
|
+
|
116
|
+
// vector of Prefilter for all regexps.
|
117
|
+
vector<Prefilter*> prefilter_vec_;
|
118
|
+
|
119
|
+
// Atom index in returned strings to entry id mapping.
|
120
|
+
vector<int> atom_index_to_id_;
|
121
|
+
|
122
|
+
// Has the prefilter tree been compiled.
|
123
|
+
bool compiled_;
|
124
|
+
|
125
|
+
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
126
|
+
};
|
127
|
+
|
128
|
+
} // namespace
|
129
|
+
|
130
|
+
#endif // RE2_PREFILTER_TREE_H_
|