chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/prefilter.h
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Prefilter is the class used to extract string guards from regexps.
|
|
6
|
+
// Rather than using Prefilter class directly, use FilteredRE2.
|
|
7
|
+
// See filtered_re2.h
|
|
8
|
+
|
|
9
|
+
#ifndef RE2_PREFILTER_H_
|
|
10
|
+
#define RE2_PREFILTER_H_
|
|
11
|
+
|
|
12
|
+
#include "util/util.h"
|
|
13
|
+
|
|
14
|
+
namespace re2 {
|
|
15
|
+
|
|
16
|
+
class RE2;
|
|
17
|
+
|
|
18
|
+
class Regexp;
|
|
19
|
+
|
|
20
|
+
class Prefilter {
|
|
21
|
+
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
|
22
|
+
public:
|
|
23
|
+
enum Op {
|
|
24
|
+
ALL = 0, // Everything matches
|
|
25
|
+
NONE, // Nothing matches
|
|
26
|
+
ATOM, // The string atom() must match
|
|
27
|
+
AND, // All in subs() must match
|
|
28
|
+
OR, // One of subs() must match
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
explicit Prefilter(Op op);
|
|
32
|
+
~Prefilter();
|
|
33
|
+
|
|
34
|
+
Op op() { return op_; }
|
|
35
|
+
const string& atom() const { return atom_; }
|
|
36
|
+
void set_unique_id(int id) { unique_id_ = id; }
|
|
37
|
+
int unique_id() const { return unique_id_; }
|
|
38
|
+
|
|
39
|
+
// The children of the Prefilter node.
|
|
40
|
+
vector<Prefilter*>* subs() {
|
|
41
|
+
CHECK(op_ == AND || op_ == OR);
|
|
42
|
+
return subs_;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Set the children vector. Prefilter takes ownership of subs and
|
|
46
|
+
// subs_ will be deleted when Prefilter is deleted.
|
|
47
|
+
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
|
48
|
+
|
|
49
|
+
// Given a RE2, return a Prefilter. The caller takes ownership of
|
|
50
|
+
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
|
51
|
+
// cannot be formed.
|
|
52
|
+
static Prefilter* FromRE2(const RE2* re2);
|
|
53
|
+
|
|
54
|
+
// Returns a readable debug string of the prefilter.
|
|
55
|
+
string DebugString() const;
|
|
56
|
+
|
|
57
|
+
private:
|
|
58
|
+
class Info;
|
|
59
|
+
|
|
60
|
+
// Combines two prefilters together to create an AND. The passed
|
|
61
|
+
// Prefilters will be part of the returned Prefilter or deleted.
|
|
62
|
+
static Prefilter* And(Prefilter* a, Prefilter* b);
|
|
63
|
+
|
|
64
|
+
// Combines two prefilters together to create an OR. The passed
|
|
65
|
+
// Prefilters will be part of the returned Prefilter or deleted.
|
|
66
|
+
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
|
67
|
+
|
|
68
|
+
// Generalized And/Or
|
|
69
|
+
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
|
70
|
+
|
|
71
|
+
static Prefilter* FromRegexp(Regexp* a);
|
|
72
|
+
|
|
73
|
+
static Prefilter* FromString(const string& str);
|
|
74
|
+
|
|
75
|
+
static Prefilter* OrStrings(set<string>* ss);
|
|
76
|
+
|
|
77
|
+
static Info* BuildInfo(Regexp* re);
|
|
78
|
+
|
|
79
|
+
Prefilter* Simplify();
|
|
80
|
+
|
|
81
|
+
// Kind of Prefilter.
|
|
82
|
+
Op op_;
|
|
83
|
+
|
|
84
|
+
// Sub-matches for AND or OR Prefilter.
|
|
85
|
+
vector<Prefilter*>* subs_;
|
|
86
|
+
|
|
87
|
+
// Actual string to match in leaf node.
|
|
88
|
+
string atom_;
|
|
89
|
+
|
|
90
|
+
// If different prefilters have the same string atom, or if they are
|
|
91
|
+
// structurally the same (e.g., OR of same atom strings) they are
|
|
92
|
+
// considered the same unique nodes. This is the id for each unique
|
|
93
|
+
// node. This field is populated with a unique id for every node,
|
|
94
|
+
// and -1 for duplicate nodes.
|
|
95
|
+
int unique_id_;
|
|
96
|
+
|
|
97
|
+
// Used for debugging, helps in tracking memory leaks.
|
|
98
|
+
int alloc_id_;
|
|
99
|
+
|
|
100
|
+
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
} // namespace re2
|
|
104
|
+
|
|
105
|
+
#endif // RE2_PREFILTER_H_
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "util/util.h"
|
|
6
|
+
#include "util/flags.h"
|
|
7
|
+
#include "re2/prefilter.h"
|
|
8
|
+
#include "re2/prefilter_tree.h"
|
|
9
|
+
#include "re2/re2.h"
|
|
10
|
+
|
|
11
|
+
// Shortest literal that KeepPart() will accept as an atom.
DEFINE_int32(filtered_re2_min_atom_len, 3,
             "Strings less than this length are not stored as atoms");
|
|
14
|
+
|
|
15
|
+
namespace re2 {
|
|
16
|
+
|
|
17
|
+
// Starts out empty and uncompiled; prefilters are supplied through Add().
PrefilterTree::PrefilterTree() : compiled_(false) {}
|
|
20
|
+
|
|
21
|
+
// Frees every Prefilter stored by Add() and the parent map held by each
// entry.
PrefilterTree::~PrefilterTree() {
  for (vector<Prefilter*>::iterator root = prefilter_vec_.begin();
       root != prefilter_vec_.end(); ++root) {
    delete *root;
  }

  for (vector<Entry>::iterator entry = entries_.begin();
       entry != entries_.end(); ++entry) {
    delete entry->parents;
  }
}
|
|
28
|
+
|
|
29
|
+
// Functions used for adding and Compiling prefilters to the
|
|
30
|
+
// PrefilterTree.
|
|
31
|
+
// Reports whether `prefilter` is worth keeping as a guard, pruning AND
// nodes in place along the way:
//   - NULL and ALL are never kept.
//   - An ATOM is kept when its string is at least
//     FLAGS_filtered_re2_min_atom_len long.
//   - An AND deletes the children that are not kept, compacts the rest,
//     and is kept when at least one child remains.
//   - An OR is kept only when every child is kept; the scan stops at the
//     first child that is not.
// Any other op is logged as unexpected and not kept.  `level` is the
// recursion depth; it is only passed along.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    case Prefilter::ATOM:
      return prefilter->atom().size() >=
             FLAGS_filtered_re2_min_atom_len;

    case Prefilter::ALL:
      return false;

    case Prefilter::OR: {
      vector<Prefilter*>* alternatives = prefilter->subs();
      for (int i = 0; i < alternatives->size(); i++) {
        if (!KeepPart((*alternatives)[i], level + 1))
          return false;
      }
      return true;
    }

    case Prefilter::AND: {
      vector<Prefilter*>& children = *prefilter->subs();
      int kept = 0;
      for (int i = 0; i < children.size(); i++) {
        Prefilter* child = children[i];
        if (KeepPart(child, level + 1)) {
          // Slide surviving children towards the front.
          children[kept] = child;
          kept++;
        } else {
          delete child;
        }
      }

      children.resize(kept);
      return kept > 0;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;
  }
}
|
|
68
|
+
|
|
69
|
+
// Registers the prefilter of the next regexp.  A prefilter that KeepPart()
// rejects is deleted and recorded as NULL, so one slot is still appended
// per call.  Calling this after Compile() is logged and ignored.
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL) {
    if (!KeepPart(kept, 0)) {
      delete kept;
      kept = NULL;
    }
  }

  prefilter_vec_.push_back(kept);
}
|
|
81
|
+
|
|
82
|
+
// Builds the entry table from the added prefilters and returns the atom
// strings through atom_vec.  A second call is logged and ignored.
void PrefilterTree::Compile(vector<string>* atom_vec) {
  if (compiled_) {
    LOG(DFATAL) << "Compile after Compile.";
    return;
  }

  // Some legacy users call Compile before adding any regexps and expect
  // it to do nothing, so bail out without marking the tree compiled.
  if (prefilter_vec_.empty())
    return;

  compiled_ = true;

  AssignUniqueIds(atom_vec);

  // Look for nodes that are so common that they trigger too many parents,
  // and drop them as triggers where that is safe.  Dropping a node only
  // means its parents no longer need it in order to trigger; no regexp
  // is missed because of it.
  for (int i = 0; i < entries_.size(); i++) {
    IntMap* parents = entries_[i].parents;
    if (parents->size() <= 8)
      continue;

    // The node may be dropped only if every parent still needs more than
    // one child, i.e. has something else guarding it.
    // TODO(vsri): Adjust the threshold appropriately,
    // make it a function of total number of nodes?
    bool all_guarded = true;
    for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) {
      if (entries_[it->index()].propagate_up_at_count <= 1) {
        all_guarded = false;
        break;
      }
    }
    if (!all_guarded)
      continue;

    // Each parent now waits for one child fewer.
    for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) {
      entries_[it->index()].propagate_up_at_count -= 1;
    }

    parents->clear();  // Forget the parents
  }

  PrintDebugInfo();
}
|
|
128
|
+
|
|
129
|
+
// Looks up the node registered in node_map_ under the NodeString() of
// `node`.  Returns NULL when no node with that string is registered.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  map<string, Prefilter*>::iterator found = node_map_.find(NodeString(node));
  if (found != node_map_.end())
    return found->second;
  return NULL;
}
|
|
136
|
+
|
|
137
|
+
// Formats n as a decimal string.
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof(digits), "%d", n);
  string text(digits);
  return text;
}
|
|
142
|
+
|
|
143
|
+
// Builds the key that identifies `node` in node_map_: the numeric op, a
// colon, then either the atom text (ATOM nodes) or the comma-separated
// unique ids of the children.  The op prefix keeps AND, OR and atom
// nodes apart.
string PrefilterTree::NodeString(Prefilter* node) const {
  string key = Itoa(node->op());
  key += ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  vector<Prefilter*>* children = node->subs();
  for (int i = 0; i < children->size(); i++) {
    if (i != 0)
      key += ',';
    key += Itoa((*children)[i]->unique_id());
  }
  return key;
}
|
|
157
|
+
|
|
158
|
+
// Numbers the nodes of all added prefilters so that nodes with the same
// NodeString() share one id, then fills entries_ (parents, trigger counts
// and regexp ids) for each unique node.  The atoms of the unique ATOM
// nodes are returned through atom_vec, with atom_index_to_id_ mapping
// each position in atom_vec to its node id.
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
  atom_vec->clear();

  // Every node, parents before their children.
  vector<Prefilter*> nodes;

  // Start with the top-level node of each regexp.  NULL roots are pushed
  // too so that index == regexp id holds for the top-level nodes.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      unfiltered_.push_back(i);
    nodes.push_back(root);
  }

  // Append the descendants; `nodes` grows while it is being walked.
  for (int i = 0; i < nodes.size(); i++) {
    Prefilter* current = nodes[i];
    if (current == NULL)
      continue;
    if (current->op() != Prefilter::AND && current->op() != Prefilter::OR)
      continue;
    const vector<Prefilter*>& children = *current->subs();
    for (int j = 0; j < children.size(); j++)
      nodes.push_back(children[j]);
  }

  // Walk back to front, so children are numbered before their parents,
  // and give each distinct node string one id.
  int next_id = 0;
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL)
      continue;
    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(node);
    if (canonical != NULL) {
      // Duplicate: reuse the id of the node registered earlier.
      node->set_unique_id(canonical->unique_id());
      continue;
    }
    // First node with this string; later ones will find it as canonical.
    node_map_[NodeString(node)] = node;
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id++);
  }
  entries_.resize(node_map_.size());

  // Allocate the parent map of every canonical node.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;
    entries_[node->unique_id()].parents = new IntMap(node_map_.size());
  }

  // Fill in trigger counts and register each node with its children.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;

    Entry& entry = entries_[node->unique_id()];
    const Prefilter::Op op = node->op();

    if (op == Prefilter::ATOM) {
      entry.propagate_up_at_count = 1;
    } else if (op == Prefilter::AND || op == Prefilter::OR) {
      IntMap distinct_children(node_map_.size());
      const int parent_id = node->unique_id();
      vector<Prefilter*>* children = node->subs();
      for (int j = 0; j < children->size(); j++) {
        Prefilter* canonical = CanonicalNode((*children)[j]);
        if (canonical == NULL) {
          LOG(DFATAL) << "Null canonical node";
          return;
        }
        int child_id = canonical->unique_id();
        if (!distinct_children.has_index(child_id))
          distinct_children.set_new(child_id, 1);
        // Record this node as a parent of the child.
        IntMap* child_parents = entries_[child_id].parents;
        if (!child_parents->has_index(parent_id))
          child_parents->set_new(parent_id, 1);
      }
      // An AND waits for all of its distinct children, an OR for one.
      entry.propagate_up_at_count =
          op == Prefilter::AND ? distinct_children.size() : 1;
    } else {
      LOG(DFATAL) << "Unexpected op: " << op;
      return;
    }
  }

  // Attach each regexp id to the entry of its top-level node.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      continue;
    int id = CanonicalNode(root)->unique_id();
    DCHECK_LE(0, id);
    entries_[id].regexps.push_back(i);
  }
}
|
|
281
|
+
|
|
282
|
+
// Functions for triggering during search.
|
|
283
|
+
void PrefilterTree::RegexpsGivenStrings(
|
|
284
|
+
const vector<int>& matched_atoms,
|
|
285
|
+
vector<int>* regexps) const {
|
|
286
|
+
regexps->clear();
|
|
287
|
+
if (!compiled_) {
|
|
288
|
+
LOG(WARNING) << "Compile() not called";
|
|
289
|
+
for (int i = 0; i < prefilter_vec_.size(); ++i)
|
|
290
|
+
regexps->push_back(i);
|
|
291
|
+
} else {
|
|
292
|
+
if (!prefilter_vec_.empty()) {
|
|
293
|
+
IntMap regexps_map(prefilter_vec_.size());
|
|
294
|
+
vector<int> matched_atom_ids;
|
|
295
|
+
for (int j = 0; j < matched_atoms.size(); j++) {
|
|
296
|
+
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
|
297
|
+
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
|
298
|
+
}
|
|
299
|
+
PropagateMatch(matched_atom_ids, ®exps_map);
|
|
300
|
+
for (IntMap::iterator it = regexps_map.begin();
|
|
301
|
+
it != regexps_map.end();
|
|
302
|
+
++it)
|
|
303
|
+
regexps->push_back(it->index());
|
|
304
|
+
|
|
305
|
+
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
sort(regexps->begin(), regexps->end());
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
|
312
|
+
IntMap* regexps) const {
|
|
313
|
+
IntMap count(entries_.size());
|
|
314
|
+
IntMap work(entries_.size());
|
|
315
|
+
for (int i = 0; i < atom_ids.size(); i++)
|
|
316
|
+
work.set(atom_ids[i], 1);
|
|
317
|
+
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
|
318
|
+
const Entry& entry = entries_[it->index()];
|
|
319
|
+
VLOG(10) << "Processing: " << it->index();
|
|
320
|
+
// Record regexps triggered.
|
|
321
|
+
for (int i = 0; i < entry.regexps.size(); i++) {
|
|
322
|
+
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
|
323
|
+
regexps->set(entry.regexps[i], 1);
|
|
324
|
+
}
|
|
325
|
+
int c;
|
|
326
|
+
// Pass trigger up to parents.
|
|
327
|
+
for (IntMap::iterator it = entry.parents->begin();
|
|
328
|
+
it != entry.parents->end();
|
|
329
|
+
++it) {
|
|
330
|
+
int j = it->index();
|
|
331
|
+
const Entry& parent = entries_[j];
|
|
332
|
+
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
|
333
|
+
// Delay until all the children have succeeded.
|
|
334
|
+
if (parent.propagate_up_at_count > 1) {
|
|
335
|
+
if (count.has_index(j)) {
|
|
336
|
+
c = count.get_existing(j) + 1;
|
|
337
|
+
count.set_existing(j, c);
|
|
338
|
+
} else {
|
|
339
|
+
c = 1;
|
|
340
|
+
count.set_new(j, c);
|
|
341
|
+
}
|
|
342
|
+
if (c < parent.propagate_up_at_count)
|
|
343
|
+
continue;
|
|
344
|
+
}
|
|
345
|
+
VLOG(10) << "Triggering: " << j;
|
|
346
|
+
// Trigger the parent.
|
|
347
|
+
work.set(j, 1);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
// Debugging help.
|
|
353
|
+
void PrefilterTree::PrintPrefilter(int regexpid) {
|
|
354
|
+
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
void PrefilterTree::PrintDebugInfo() {
|
|
358
|
+
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
|
359
|
+
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
|
360
|
+
|
|
361
|
+
for (int i = 0; i < entries_.size(); ++i) {
|
|
362
|
+
IntMap* parents = entries_[i].parents;
|
|
363
|
+
const vector<int>& regexps = entries_[i].regexps;
|
|
364
|
+
VLOG(10) << "EntryId: " << i
|
|
365
|
+
<< " N: " << parents->size() << " R: " << regexps.size();
|
|
366
|
+
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
|
367
|
+
VLOG(10) << it->index();
|
|
368
|
+
}
|
|
369
|
+
VLOG(10) << "Map:";
|
|
370
|
+
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
|
371
|
+
iter != node_map_.end(); ++iter)
|
|
372
|
+
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
|
373
|
+
<< " Str: " << (*iter).first;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
// Renders `node` recursively for debugging.  An ATOM is its own text;
// any other node is "AND(...)" or "OR(...)" with each child written as
// "<unique id>:<child string>", separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // The operator name tells AND and OR nodes apart.
  string text = node->op() == Prefilter::AND ? "AND" : "OR";
  text += "(";

  vector<Prefilter*>* children = node->subs();
  for (int i = 0; i < children->size(); i++) {
    Prefilter* child = (*children)[i];
    if (i > 0)
      text += ',';
    text += Itoa(child->unique_id());
    text += ":";
    text += DebugNodeString(child);
  }

  text += ")";
  return text;
}
|
|
397
|
+
|
|
398
|
+
} // namespace re2
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// The PrefilterTree class is used to form an AND-OR tree of strings
|
|
6
|
+
// that would trigger each regexp. The 'prefilter' of each regexp is
|
|
7
|
+
// added to PrefilterTree, and then PrefilterTree is used to find all
|
|
8
|
+
// the unique strings across the prefilters. During search, by using
|
|
9
|
+
// matches from a string matching engine, PrefilterTree deduces the
|
|
10
|
+
// set of regexps that are to be triggered. The 'string matching
|
|
11
|
+
// engine' itself is outside of this class, and the caller can use any
|
|
12
|
+
// favorite engine. PrefilterTree provides a set of strings (called
|
|
13
|
+
// atoms) that the user of this class should use to do the string
|
|
14
|
+
// matching.
|
|
15
|
+
//
|
|
16
|
+
#ifndef RE2_PREFILTER_TREE_H_
|
|
17
|
+
#define RE2_PREFILTER_TREE_H_
|
|
18
|
+
|
|
19
|
+
#include "util/util.h"
|
|
20
|
+
#include "util/sparse_array.h"
|
|
21
|
+
|
|
22
|
+
namespace re2 {
|
|
23
|
+
|
|
24
|
+
typedef SparseArray<int> IntMap;
|
|
25
|
+
|
|
26
|
+
class Prefilter;
|
|
27
|
+
|
|
28
|
+
class PrefilterTree {
|
|
29
|
+
public:
|
|
30
|
+
PrefilterTree();
|
|
31
|
+
~PrefilterTree();
|
|
32
|
+
|
|
33
|
+
// Adds the prefilter for the next regexp. Note that we assume that
|
|
34
|
+
// Add called sequentially for all regexps. All Add calls
|
|
35
|
+
// must precede Compile.
|
|
36
|
+
void Add(Prefilter* prefilter);
|
|
37
|
+
|
|
38
|
+
// The Compile returns a vector of string in atom_vec.
|
|
39
|
+
// Call this after all the prefilters are added through Add.
|
|
40
|
+
// No calls to Add after Compile are allowed.
|
|
41
|
+
// The caller should use the returned set of strings to do string matching.
|
|
42
|
+
// Each time a string matches, the corresponding index then has to be
|
|
43
|
+
// and passed to RegexpsGivenStrings below.
|
|
44
|
+
void Compile(vector<string>* atom_vec);
|
|
45
|
+
|
|
46
|
+
// Given the indices of the atoms that matched, returns the indexes
|
|
47
|
+
// of regexps that should be searched. The matched_atoms should
|
|
48
|
+
// contain all the ids of string atoms that were found to match the
|
|
49
|
+
// content. The caller can use any string match engine to perform
|
|
50
|
+
// this function. This function is thread safe.
|
|
51
|
+
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
|
52
|
+
vector<int>* regexps) const;
|
|
53
|
+
|
|
54
|
+
// Print debug prefilter. Also prints unique ids associated with
|
|
55
|
+
// nodes of the prefilter of the regexp.
|
|
56
|
+
void PrintPrefilter(int regexpid);
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
// Each unique node has a corresponding Entry that helps in
|
|
60
|
+
// passing the matching trigger information along the tree.
|
|
61
|
+
struct Entry {
|
|
62
|
+
public:
|
|
63
|
+
// How many children should match before this node triggers the
|
|
64
|
+
// parent. For an atom and an OR node, this is 1 and for an AND
|
|
65
|
+
// node, it is the number of unique children.
|
|
66
|
+
int propagate_up_at_count;
|
|
67
|
+
|
|
68
|
+
// When this node is ready to trigger the parent, what are the indices
|
|
69
|
+
// of the parent nodes to trigger. The reason there may be more than
|
|
70
|
+
// one is because of sharing. For example (abc | def) and (xyz | def)
|
|
71
|
+
// are two different nodes, but they share the atom 'def'. So when
|
|
72
|
+
// 'def' matches, it triggers two parents, corresponding to the two
|
|
73
|
+
// different OR nodes.
|
|
74
|
+
IntMap* parents;
|
|
75
|
+
|
|
76
|
+
// When this node is ready to trigger the parent, what are the
|
|
77
|
+
// regexps that are triggered.
|
|
78
|
+
vector<int> regexps;
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
private:
|
|
82
|
+
// This function assigns unique ids to various parts of the
|
|
83
|
+
// prefilter, by looking at if these nodes are already in the
|
|
84
|
+
// PrefilterTree.
|
|
85
|
+
void AssignUniqueIds(vector<string>* atom_vec);
|
|
86
|
+
|
|
87
|
+
// Given the matching atoms, find the regexps to be triggered.
|
|
88
|
+
void PropagateMatch(const vector<int>& atom_ids,
|
|
89
|
+
IntMap* regexps) const;
|
|
90
|
+
|
|
91
|
+
// Returns the prefilter node that has the same NodeString as this
|
|
92
|
+
// node. For the canonical node, returns node.
|
|
93
|
+
Prefilter* CanonicalNode(Prefilter* node);
|
|
94
|
+
|
|
95
|
+
// A string that uniquely identifies the node. Assumes that the
|
|
96
|
+
// children of node has already been assigned unique ids.
|
|
97
|
+
string NodeString(Prefilter* node) const;
|
|
98
|
+
|
|
99
|
+
// Recursively constructs a readable prefilter string.
|
|
100
|
+
string DebugNodeString(Prefilter* node) const;
|
|
101
|
+
|
|
102
|
+
// Used for debugging.
|
|
103
|
+
void PrintDebugInfo();
|
|
104
|
+
|
|
105
|
+
// These are all the nodes formed by Compile. Essentially, there is
|
|
106
|
+
// one node for each unique atom and each unique AND/OR node.
|
|
107
|
+
vector<Entry> entries_;
|
|
108
|
+
|
|
109
|
+
// Map node string to canonical Prefilter node.
|
|
110
|
+
map<string, Prefilter*> node_map_;
|
|
111
|
+
|
|
112
|
+
// indices of regexps that always pass through the filter (since we
|
|
113
|
+
// found no required literals in these regexps).
|
|
114
|
+
vector<int> unfiltered_;
|
|
115
|
+
|
|
116
|
+
// vector of Prefilter for all regexps.
|
|
117
|
+
vector<Prefilter*> prefilter_vec_;
|
|
118
|
+
|
|
119
|
+
// Atom index in returned strings to entry id mapping.
|
|
120
|
+
vector<int> atom_index_to_id_;
|
|
121
|
+
|
|
122
|
+
// Has the prefilter tree been compiled.
|
|
123
|
+
bool compiled_;
|
|
124
|
+
|
|
125
|
+
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
|
126
|
+
};
|
|
127
|
+
|
|
128
|
+
}  // namespace re2
|
|
129
|
+
|
|
130
|
+
#endif // RE2_PREFILTER_TREE_H_
|