chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/regexp.cc
ADDED
@@ -0,0 +1,920 @@
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Regular expression representation.
|
6
|
+
// Tested by parse_test.cc
|
7
|
+
|
8
|
+
#include "util/util.h"
|
9
|
+
#include "re2/regexp.h"
|
10
|
+
#include "re2/stringpiece.h"
|
11
|
+
#include "re2/walker-inl.h"
|
12
|
+
|
13
|
+
namespace re2 {
|
14
|
+
|
15
|
+
// Constructor. Allocates vectors as appropriate for operator.
|
16
|
+
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
17
|
+
: op_(op),
|
18
|
+
simple_(false),
|
19
|
+
parse_flags_(static_cast<uint16>(parse_flags)),
|
20
|
+
ref_(1),
|
21
|
+
nsub_(0),
|
22
|
+
down_(NULL) {
|
23
|
+
subone_ = NULL;
|
24
|
+
memset(the_union_, 0, sizeof the_union_);
|
25
|
+
}
|
26
|
+
|
27
|
+
// Destructor. Assumes already cleaned up children.
|
28
|
+
// Private: use Decref() instead of delete to destroy Regexps.
|
29
|
+
// Can't call Decref on the sub-Regexps here because
|
30
|
+
// that could cause arbitrarily deep recursion, so
|
31
|
+
// required Decref() to have handled them for us.
|
32
|
+
Regexp::~Regexp() {
|
33
|
+
if (nsub_ > 0)
|
34
|
+
LOG(DFATAL) << "Regexp not destroyed.";
|
35
|
+
|
36
|
+
switch (op_) {
|
37
|
+
default:
|
38
|
+
break;
|
39
|
+
case kRegexpCapture:
|
40
|
+
delete name_;
|
41
|
+
break;
|
42
|
+
case kRegexpLiteralString:
|
43
|
+
delete[] runes_;
|
44
|
+
break;
|
45
|
+
case kRegexpCharClass:
|
46
|
+
cc_->Delete();
|
47
|
+
delete ccb_;
|
48
|
+
break;
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
// If it's possible to destroy this regexp without recurring,
|
53
|
+
// do so and return true. Else return false.
|
54
|
+
bool Regexp::QuickDestroy() {
|
55
|
+
if (nsub_ == 0) {
|
56
|
+
delete this;
|
57
|
+
return true;
|
58
|
+
}
|
59
|
+
return false;
|
60
|
+
}
|
61
|
+
|
62
|
+
static map<Regexp*, int> ref_map;
|
63
|
+
static Mutex ref_mutex;
|
64
|
+
|
65
|
+
int Regexp::Ref() {
|
66
|
+
if (ref_ < kMaxRef)
|
67
|
+
return ref_;
|
68
|
+
|
69
|
+
MutexLock l(&ref_mutex);
|
70
|
+
return ref_map[this];
|
71
|
+
}
|
72
|
+
|
73
|
+
// Increments reference count, returns object as convenience.
|
74
|
+
Regexp* Regexp::Incref() {
|
75
|
+
if (ref_ >= kMaxRef-1) {
|
76
|
+
// Store ref count in overflow map.
|
77
|
+
MutexLock l(&ref_mutex);
|
78
|
+
if (ref_ == kMaxRef) { // already overflowed
|
79
|
+
ref_map[this]++;
|
80
|
+
return this;
|
81
|
+
}
|
82
|
+
// overflowing now
|
83
|
+
ref_map[this] = kMaxRef;
|
84
|
+
ref_ = kMaxRef;
|
85
|
+
return this;
|
86
|
+
}
|
87
|
+
|
88
|
+
ref_++;
|
89
|
+
return this;
|
90
|
+
}
|
91
|
+
|
92
|
+
// Decrements reference count and deletes this object if count reaches 0.
|
93
|
+
void Regexp::Decref() {
|
94
|
+
if (ref_ == kMaxRef) {
|
95
|
+
// Ref count is stored in overflow map.
|
96
|
+
MutexLock l(&ref_mutex);
|
97
|
+
int r = ref_map[this] - 1;
|
98
|
+
if (r < kMaxRef) {
|
99
|
+
ref_ = r;
|
100
|
+
ref_map.erase(this);
|
101
|
+
} else {
|
102
|
+
ref_map[this] = r;
|
103
|
+
}
|
104
|
+
return;
|
105
|
+
}
|
106
|
+
ref_--;
|
107
|
+
if (ref_ == 0)
|
108
|
+
Destroy();
|
109
|
+
}
|
110
|
+
|
111
|
+
// Deletes this object; ref count has count reached 0.
|
112
|
+
void Regexp::Destroy() {
|
113
|
+
if (QuickDestroy())
|
114
|
+
return;
|
115
|
+
|
116
|
+
// Handle recursive Destroy with explicit stack
|
117
|
+
// to avoid arbitrarily deep recursion on process stack [sigh].
|
118
|
+
down_ = NULL;
|
119
|
+
Regexp* stack = this;
|
120
|
+
while (stack != NULL) {
|
121
|
+
Regexp* re = stack;
|
122
|
+
stack = re->down_;
|
123
|
+
if (re->ref_ != 0)
|
124
|
+
LOG(DFATAL) << "Bad reference count " << re->ref_;
|
125
|
+
if (re->nsub_ > 0) {
|
126
|
+
Regexp** subs = re->sub();
|
127
|
+
for (int i = 0; i < re->nsub_; i++) {
|
128
|
+
Regexp* sub = subs[i];
|
129
|
+
if (sub == NULL)
|
130
|
+
continue;
|
131
|
+
if (sub->ref_ == kMaxRef)
|
132
|
+
sub->Decref();
|
133
|
+
else
|
134
|
+
--sub->ref_;
|
135
|
+
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
|
136
|
+
sub->down_ = stack;
|
137
|
+
stack = sub;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
if (re->nsub_ > 1)
|
141
|
+
delete[] subs;
|
142
|
+
re->nsub_ = 0;
|
143
|
+
}
|
144
|
+
delete re;
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
// Appends rune r to a kRegexpLiteralString node.  Storage starts at 8
// runes and doubles each time the length reaches a power of two >= 8.
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);

  if (nrunes_ == 0) {
    // First rune: allocate the initial buffer.
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    // Buffer is full (length is a power of two): grow to twice the size.
    Rune* grown = new Rune[nrunes_ * 2];
    for (int k = 0; k < nrunes_; k++)
      grown[k] = runes_[k];
    delete[] runes_;
    runes_ = grown;
  }

  runes_[nrunes_] = r;
  nrunes_++;
}
|
164
|
+
|
165
|
+
// Returns a new kRegexpHaveMatch node recording match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
|
170
|
+
|
171
|
+
// Returns a node for sub+.  If sub is already a plus node with the same
// flags it is returned unchanged; otherwise a new kRegexpPlus node is
// created with sub as its only subexpression.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  const bool already_plus =
      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
  if (already_plus)
    return sub;

  Regexp* node = new Regexp(kRegexpPlus, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
179
|
+
|
180
|
+
// Returns a node for sub*.  If sub is already a star node with the same
// flags it is returned unchanged; otherwise a new kRegexpStar node is
// created with sub as its only subexpression.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  const bool already_star =
      sub->op() == kRegexpStar && sub->parse_flags() == flags;
  if (already_star)
    return sub;

  Regexp* node = new Regexp(kRegexpStar, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
188
|
+
|
189
|
+
// Returns a node for sub?.  If sub is already a quest node with the same
// flags it is returned unchanged; otherwise a new kRegexpQuest node is
// created with sub as its only subexpression.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  const bool already_quest =
      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
  if (already_quest)
    return sub;

  Regexp* node = new Regexp(kRegexpQuest, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
197
|
+
|
198
|
+
// Shared implementation of Concat, Alternate and AlternateNoFactor.
// Builds an `op` node over sub[0..nsub).  A single subexpression is
// returned as-is.  When op is kRegexpAlternate and can_factor is set,
// the alternation is first run through FactorAlternation on a private
// copy of the array.  More than kMaxNsub subexpressions are arranged as
// a two-level tree.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  if (nsub == 1)
    return sub[0];

  Regexp** scratch = NULL;
  if (can_factor && op == kRegexpAlternate) {
    // Factoring edits the array, so work on a copy rather than
    // stepping on the caller's storage.
    scratch = new Regexp*[nsub];
    memmove(scratch, sub, nsub * sizeof sub[0]);
    sub = scratch;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* only = sub[0];
      delete[] scratch;
      return only;
    }
  }

  Regexp* result = new Regexp(op, flags);
  if (nsub > kMaxNsub) {
    // Too many subexpressions to fit in a single Regexp.
    // Make a two-level tree.  Two levels gets us to 65535^2.
    int nchunks = (nsub+kMaxNsub-1)/kMaxNsub;
    int last = nchunks - 1;
    result->AllocSub(nchunks);
    Regexp** slots = result->sub();
    for (int c = 0; c < last; c++)
      slots[c] = ConcatOrAlternate(op, sub+c*kMaxNsub, kMaxNsub, flags, false);
    // The final chunk takes whatever is left over.
    slots[last] = ConcatOrAlternate(op, sub+last*kMaxNsub,
                                    nsub - last*kMaxNsub, flags,
                                    false);
  } else {
    result->AllocSub(nsub);
    Regexp** slots = result->sub();
    for (int k = 0; k < nsub; k++)
      slots[k] = sub[k];
  }

  delete[] scratch;
  return result;
}
|
242
|
+
|
243
|
+
// Returns the concatenation of sub[0..nsub).  Factoring is never
// requested for concatenations.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
|
246
|
+
|
247
|
+
// Returns the alternation of sub[0..nsub), with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = true;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
250
|
+
|
251
|
+
// Returns the alternation of sub[0..nsub) without the factoring pass.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
254
|
+
|
255
|
+
// Returns a new capture node wrapping sub, with capture index cap.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  group->sub()[0] = sub;
  group->cap_ = cap;
  return group;
}
|
262
|
+
|
263
|
+
// Returns a new repeat node over sub with the bounds min and max.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  rep->sub()[0] = sub;
  rep->min_ = min;
  rep->max_ = max;
  return rep;
}
|
271
|
+
|
272
|
+
// Returns a new single-rune literal node.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
|
277
|
+
|
278
|
+
// Returns a node matching the rune sequence runes[0..nrunes).
// An empty sequence yields an empty-match node, a single rune yields a
// plain literal, and anything longer yields a literal-string node.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);

  Regexp* str = new Regexp(kRegexpLiteralString, flags);
  for (Rune* p = runes; p != runes + nrunes; ++p)
    str->AddRuneToString(*p);
  return str;
}
|
288
|
+
|
289
|
+
// Returns a new character-class node that stores cc.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
|
294
|
+
|
295
|
+
// Swaps this and that in place.
|
296
|
+
void Regexp::Swap(Regexp* that) {
|
297
|
+
// Can use memmove because Regexp is just a struct (no vtable).
|
298
|
+
char tmp[sizeof *this];
|
299
|
+
memmove(tmp, this, sizeof tmp);
|
300
|
+
memmove(this, that, sizeof tmp);
|
301
|
+
memmove(that, tmp, sizeof tmp);
|
302
|
+
}
|
303
|
+
|
304
|
+
// Tests equality of all top-level structure but not subregexps.
|
305
|
+
static bool TopEqual(Regexp* a, Regexp* b) {
|
306
|
+
if (a->op() != b->op())
|
307
|
+
return false;
|
308
|
+
|
309
|
+
switch (a->op()) {
|
310
|
+
case kRegexpNoMatch:
|
311
|
+
case kRegexpEmptyMatch:
|
312
|
+
case kRegexpAnyChar:
|
313
|
+
case kRegexpAnyByte:
|
314
|
+
case kRegexpBeginLine:
|
315
|
+
case kRegexpEndLine:
|
316
|
+
case kRegexpWordBoundary:
|
317
|
+
case kRegexpNoWordBoundary:
|
318
|
+
case kRegexpBeginText:
|
319
|
+
return true;
|
320
|
+
|
321
|
+
case kRegexpEndText:
|
322
|
+
// The parse flags remember whether it's \z or (?-m:$),
|
323
|
+
// which matters when testing against PCRE.
|
324
|
+
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
|
325
|
+
|
326
|
+
case kRegexpLiteral:
|
327
|
+
return a->rune() == b->rune() &&
|
328
|
+
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
|
329
|
+
|
330
|
+
case kRegexpLiteralString:
|
331
|
+
return a->nrunes() == b->nrunes() &&
|
332
|
+
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
|
333
|
+
memcmp(a->runes(), b->runes(),
|
334
|
+
a->nrunes() * sizeof a->runes()[0]) == 0;
|
335
|
+
|
336
|
+
case kRegexpAlternate:
|
337
|
+
case kRegexpConcat:
|
338
|
+
return a->nsub() == b->nsub();
|
339
|
+
|
340
|
+
case kRegexpStar:
|
341
|
+
case kRegexpPlus:
|
342
|
+
case kRegexpQuest:
|
343
|
+
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
|
344
|
+
|
345
|
+
case kRegexpRepeat:
|
346
|
+
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
|
347
|
+
a->min() == b->min() &&
|
348
|
+
a->max() == b->max();
|
349
|
+
|
350
|
+
case kRegexpCapture:
|
351
|
+
return a->cap() == b->cap() && a->name() == b->name();
|
352
|
+
|
353
|
+
case kRegexpHaveMatch:
|
354
|
+
return a->match_id() == b->match_id();
|
355
|
+
|
356
|
+
case kRegexpCharClass: {
|
357
|
+
CharClass* acc = a->cc();
|
358
|
+
CharClass* bcc = b->cc();
|
359
|
+
return acc->size() == bcc->size() &&
|
360
|
+
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
|
361
|
+
memcmp(acc->begin(), bcc->begin(),
|
362
|
+
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
|
363
|
+
}
|
364
|
+
}
|
365
|
+
|
366
|
+
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
|
367
|
+
return 0;
|
368
|
+
}
|
369
|
+
|
370
|
+
// Returns whether the trees rooted at a and b are structurally equal.
// Two NULL pointers are equal; a NULL and a non-NULL pointer are not.
// The comparison is iterative: pairs of subexpressions still to be
// checked are kept on an explicit stack.
bool Regexp::Equal(Regexp* a, Regexp* b) {
  if (a == NULL || b == NULL)
    return a == b;

  if (!TopEqual(a, b))
    return false;

  // Fast path: operators without subregexps are fully decided by
  // TopEqual, so return before allocating the vector.
  switch (a->op()) {
    default:
      return true;

    case kRegexpAlternate:
    case kRegexpConcat:
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
    case kRegexpCapture:
      break;
  }

  // Committed to doing real work.  `pending` holds pairs of regexps
  // (a's node first, then b's) waiting to be compared.  The regexps
  // are only equal if every pair ends up being equal.
  vector<Regexp*> pending;

  for (;;) {
    // Invariant: TopEqual(a, b) == true.
    switch (a->op()) {
      default:
        break;

      case kRegexpAlternate:
      case kRegexpConcat:
        for (int k = 0; k < a->nsub(); k++) {
          Regexp* asub = a->sub()[k];
          Regexp* bsub = b->sub()[k];
          if (!TopEqual(asub, bsub))
            return false;
          pending.push_back(asub);
          pending.push_back(bsub);
        }
        break;

      case kRegexpStar:
      case kRegexpPlus:
      case kRegexpQuest:
      case kRegexpRepeat:
      case kRegexpCapture: {
        Regexp* asub = a->sub()[0];
        Regexp* bsub = b->sub()[0];
        if (!TopEqual(asub, bsub))
          return false;
        // Logically this pushes the pair and pops it straight back;
        // assigning directly and looping is faster.
        a = asub;
        b = bsub;
        continue;
      }
    }

    if (pending.empty())
      break;

    // Pop the most recently pushed pair.
    b = pending.back();
    pending.pop_back();
    a = pending.back();
    pending.pop_back();
  }

  return true;
}
|
448
|
+
|
449
|
+
// Keep in sync with enum RegexpStatusCode in regexp.h
|
450
|
+
static const string kErrorStrings[] = {
|
451
|
+
"no error",
|
452
|
+
"unexpected error",
|
453
|
+
"invalid escape sequence",
|
454
|
+
"invalid character class",
|
455
|
+
"invalid character class range",
|
456
|
+
"missing ]",
|
457
|
+
"missing )",
|
458
|
+
"trailing \\",
|
459
|
+
"no argument for repetition operator",
|
460
|
+
"invalid repetition size",
|
461
|
+
"bad repetition operator",
|
462
|
+
"invalid perl operator",
|
463
|
+
"invalid UTF-8",
|
464
|
+
"invalid named capture group",
|
465
|
+
};
|
466
|
+
|
467
|
+
// Returns the message for `code`.  Codes outside the kErrorStrings
// table are reported with the kRegexpInternalError message.
const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = code >= 0 && code < arraysize(kErrorStrings);
  if (!in_table)
    code = kRegexpInternalError;
  return kErrorStrings[code];
}
|
472
|
+
|
473
|
+
// Returns the full status text: the message for code_, followed by
// ": " and the offending argument when one was recorded.
string RegexpStatus::Text() const {
  string text(CodeText(code_));
  if (!error_arg_.empty()) {
    text.append(": ");
    text.append(error_arg_.data(), error_arg_.size());
  }
  return text;
}
|
482
|
+
|
483
|
+
void RegexpStatus::Copy(const RegexpStatus& status) {
|
484
|
+
code_ = status.code_;
|
485
|
+
error_arg_ = status.error_arg_;
|
486
|
+
}
|
487
|
+
|
488
|
+
typedef int Ignored; // Walker<void> doesn't exist
|
489
|
+
|
490
|
+
// Walker subclass to count capturing parens in regexp.
|
491
|
+
class NumCapturesWalker : public Regexp::Walker<Ignored> {
|
492
|
+
public:
|
493
|
+
NumCapturesWalker() : ncapture_(0) {}
|
494
|
+
int ncapture() { return ncapture_; }
|
495
|
+
|
496
|
+
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
497
|
+
if (re->op() == kRegexpCapture)
|
498
|
+
ncapture_++;
|
499
|
+
return ignored;
|
500
|
+
}
|
501
|
+
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
502
|
+
// Should never be called: we use Walk not WalkExponential.
|
503
|
+
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
|
504
|
+
return ignored;
|
505
|
+
}
|
506
|
+
|
507
|
+
private:
|
508
|
+
int ncapture_;
|
509
|
+
DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
|
510
|
+
};
|
511
|
+
|
512
|
+
int Regexp::NumCaptures() {
|
513
|
+
NumCapturesWalker w;
|
514
|
+
w.Walk(this, 0);
|
515
|
+
return w.ncapture();
|
516
|
+
}
|
517
|
+
|
518
|
+
// Walker class to build map of named capture groups and their indices.
|
519
|
+
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
520
|
+
public:
|
521
|
+
NamedCapturesWalker() : map_(NULL) {}
|
522
|
+
~NamedCapturesWalker() { delete map_; }
|
523
|
+
|
524
|
+
map<string, int>* TakeMap() {
|
525
|
+
map<string, int>* m = map_;
|
526
|
+
map_ = NULL;
|
527
|
+
return m;
|
528
|
+
}
|
529
|
+
|
530
|
+
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
531
|
+
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
532
|
+
// Allocate map once we find a name.
|
533
|
+
if (map_ == NULL)
|
534
|
+
map_ = new map<string, int>;
|
535
|
+
|
536
|
+
// Record first occurrence of each name.
|
537
|
+
// (The rule is that if you have the same name
|
538
|
+
// multiple times, only the leftmost one counts.)
|
539
|
+
if (map_->find(*re->name()) == map_->end())
|
540
|
+
(*map_)[*re->name()] = re->cap();
|
541
|
+
}
|
542
|
+
return ignored;
|
543
|
+
}
|
544
|
+
|
545
|
+
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
546
|
+
// Should never be called: we use Walk not WalkExponential.
|
547
|
+
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
|
548
|
+
return ignored;
|
549
|
+
}
|
550
|
+
|
551
|
+
private:
|
552
|
+
map<string, int>* map_;
|
553
|
+
DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
|
554
|
+
};
|
555
|
+
|
556
|
+
// Walks this regexp and returns the name -> index map built by
// NamedCapturesWalker.  The result is NULL when no named group exists.
map<string, int>* Regexp::NamedCaptures() {
  NamedCapturesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
|
561
|
+
|
562
|
+
// Walker class to build map from capture group indices to their names.
|
563
|
+
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
564
|
+
public:
|
565
|
+
CaptureNamesWalker() : map_(NULL) {}
|
566
|
+
~CaptureNamesWalker() { delete map_; }
|
567
|
+
|
568
|
+
map<int, string>* TakeMap() {
|
569
|
+
map<int, string>* m = map_;
|
570
|
+
map_ = NULL;
|
571
|
+
return m;
|
572
|
+
}
|
573
|
+
|
574
|
+
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
575
|
+
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
576
|
+
// Allocate map once we find a name.
|
577
|
+
if (map_ == NULL)
|
578
|
+
map_ = new map<int, string>;
|
579
|
+
|
580
|
+
(*map_)[re->cap()] = *re->name();
|
581
|
+
}
|
582
|
+
return ignored;
|
583
|
+
}
|
584
|
+
|
585
|
+
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
586
|
+
// Should never be called: we use Walk not WalkExponential.
|
587
|
+
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
|
588
|
+
return ignored;
|
589
|
+
}
|
590
|
+
|
591
|
+
private:
|
592
|
+
map<int, string>* map_;
|
593
|
+
DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
|
594
|
+
};
|
595
|
+
|
596
|
+
// Walks this regexp and returns the index -> name map built by
// CaptureNamesWalker.  The result is NULL when no named group exists.
map<int, string>* Regexp::CaptureNames() {
  CaptureNamesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
|
601
|
+
|
602
|
+
// Determines whether regexp matches must be anchored
|
603
|
+
// with a fixed string prefix. If so, returns the prefix and
|
604
|
+
// the regexp that remains after the prefix. The prefix might
|
605
|
+
// be ASCII case-insensitive.
|
606
|
+
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
607
|
+
// No need for a walker: the regexp must be of the form
|
608
|
+
// 1. some number of ^ anchors
|
609
|
+
// 2. a literal char or string
|
610
|
+
// 3. the rest
|
611
|
+
prefix->clear();
|
612
|
+
*foldcase = false;
|
613
|
+
*suffix = NULL;
|
614
|
+
if (op_ != kRegexpConcat)
|
615
|
+
return false;
|
616
|
+
|
617
|
+
// Some number of anchors, then a literal or concatenation.
|
618
|
+
int i = 0;
|
619
|
+
Regexp** sub = this->sub();
|
620
|
+
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
|
621
|
+
i++;
|
622
|
+
if (i == 0 || i >= nsub_)
|
623
|
+
return false;
|
624
|
+
|
625
|
+
Regexp* re = sub[i];
|
626
|
+
switch (re->op_) {
|
627
|
+
default:
|
628
|
+
return false;
|
629
|
+
|
630
|
+
case kRegexpLiteralString:
|
631
|
+
// Convert to string in proper encoding.
|
632
|
+
if (re->parse_flags() & Latin1) {
|
633
|
+
prefix->resize(re->nrunes_);
|
634
|
+
for (int j = 0; j < re->nrunes_; j++)
|
635
|
+
(*prefix)[j] = re->runes_[j];
|
636
|
+
} else {
|
637
|
+
// Convert to UTF-8 in place.
|
638
|
+
// Assume worst-case space and then trim.
|
639
|
+
prefix->resize(re->nrunes_ * UTFmax);
|
640
|
+
char *p = &(*prefix)[0];
|
641
|
+
for (int j = 0; j < re->nrunes_; j++) {
|
642
|
+
Rune r = re->runes_[j];
|
643
|
+
if (r < Runeself)
|
644
|
+
*p++ = r;
|
645
|
+
else
|
646
|
+
p += runetochar(p, &r);
|
647
|
+
}
|
648
|
+
prefix->resize(p - &(*prefix)[0]);
|
649
|
+
}
|
650
|
+
break;
|
651
|
+
|
652
|
+
case kRegexpLiteral:
|
653
|
+
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
|
654
|
+
prefix->append(1, re->rune_);
|
655
|
+
} else {
|
656
|
+
char buf[UTFmax];
|
657
|
+
prefix->append(buf, runetochar(buf, &re->rune_));
|
658
|
+
}
|
659
|
+
break;
|
660
|
+
}
|
661
|
+
*foldcase = (sub[i]->parse_flags() & FoldCase);
|
662
|
+
i++;
|
663
|
+
|
664
|
+
// The rest.
|
665
|
+
if (i < nsub_) {
|
666
|
+
for (int j = i; j < nsub_; j++)
|
667
|
+
sub[j]->Incref();
|
668
|
+
re = Concat(sub + i, nsub_ - i, parse_flags());
|
669
|
+
} else {
|
670
|
+
re = new Regexp(kRegexpEmptyMatch, parse_flags());
|
671
|
+
}
|
672
|
+
*suffix = re;
|
673
|
+
return true;
|
674
|
+
}
|
675
|
+
|
676
|
+
// Character class builder is a balanced binary tree (STL set)
|
677
|
+
// containing non-overlapping, non-abutting RuneRanges.
|
678
|
+
// The less-than operator used in the tree treats two
|
679
|
+
// ranges as equal if they overlap at all, so that
|
680
|
+
// lookups for a particular Rune are possible.
|
681
|
+
|
682
|
+
// Creates an empty builder: no runes counted and both ASCII-letter
// bitmaps cleared.
CharClassBuilder::CharClassBuilder() {
  nrunes_ = 0;
  upper_ = lower_ = 0;
}
|
687
|
+
|
688
|
+
// Add lo-hi to the class; return whether class got bigger.
|
689
|
+
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
|
690
|
+
if (hi < lo)
|
691
|
+
return false;
|
692
|
+
|
693
|
+
if (lo <= 'z' && hi >= 'A') {
|
694
|
+
// Overlaps some alpha, maybe not all.
|
695
|
+
// Update bitmaps telling which ASCII letters are in the set.
|
696
|
+
Rune lo1 = max<Rune>(lo, 'A');
|
697
|
+
Rune hi1 = min<Rune>(hi, 'Z');
|
698
|
+
if (lo1 <= hi1)
|
699
|
+
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
|
700
|
+
|
701
|
+
lo1 = max<Rune>(lo, 'a');
|
702
|
+
hi1 = min<Rune>(hi, 'z');
|
703
|
+
if (lo1 <= hi1)
|
704
|
+
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
|
705
|
+
}
|
706
|
+
|
707
|
+
{ // Check whether lo, hi is already in the class.
|
708
|
+
iterator it = ranges_.find(RuneRange(lo, lo));
|
709
|
+
if (it != end() && it->lo <= lo && hi <= it->hi)
|
710
|
+
return false;
|
711
|
+
}
|
712
|
+
|
713
|
+
// Look for a range abutting lo on the left.
|
714
|
+
// If it exists, take it out and increase our range.
|
715
|
+
if (lo > 0) {
|
716
|
+
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
|
717
|
+
if (it != end()) {
|
718
|
+
lo = it->lo;
|
719
|
+
if (it->hi > hi)
|
720
|
+
hi = it->hi;
|
721
|
+
nrunes_ -= it->hi - it->lo + 1;
|
722
|
+
ranges_.erase(it);
|
723
|
+
}
|
724
|
+
}
|
725
|
+
|
726
|
+
// Look for a range abutting hi on the right.
|
727
|
+
// If it exists, take it out and increase our range.
|
728
|
+
if (hi < Runemax) {
|
729
|
+
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
|
730
|
+
if (it != end()) {
|
731
|
+
hi = it->hi;
|
732
|
+
nrunes_ -= it->hi - it->lo + 1;
|
733
|
+
ranges_.erase(it);
|
734
|
+
}
|
735
|
+
}
|
736
|
+
|
737
|
+
// Look for ranges between lo and hi. Take them out.
|
738
|
+
// This is only safe because the set has no overlapping ranges.
|
739
|
+
// We've already removed any ranges abutting lo and hi, so
|
740
|
+
// any that overlap [lo, hi] must be contained within it.
|
741
|
+
for (;;) {
|
742
|
+
iterator it = ranges_.find(RuneRange(lo, hi));
|
743
|
+
if (it == end())
|
744
|
+
break;
|
745
|
+
nrunes_ -= it->hi - it->lo + 1;
|
746
|
+
ranges_.erase(it);
|
747
|
+
}
|
748
|
+
|
749
|
+
// Finally, add [lo, hi].
|
750
|
+
nrunes_ += hi - lo + 1;
|
751
|
+
ranges_.insert(RuneRange(lo, hi));
|
752
|
+
return true;
|
753
|
+
}
|
754
|
+
|
755
|
+
// Adds every range of cc to this class.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator cur = cc->begin();
  while (cur != cc->end()) {
    AddRange(cur->lo, cur->hi);
    ++cur;
  }
}
|
759
|
+
|
760
|
+
// Returns whether rune r lies inside one of the stored ranges.
bool CharClassBuilder::Contains(Rune r) {
  iterator hit = ranges_.find(RuneRange(r, r));
  return hit != end();
}
|
763
|
+
|
764
|
+
// Does the character class behave the same on A-Z as on a-z?
|
765
|
+
bool CharClassBuilder::FoldsASCII() {
|
766
|
+
return ((upper_ ^ lower_) & AlphaMask) == 0;
|
767
|
+
}
|
768
|
+
|
769
|
+
// Returns a newly allocated builder holding the same ranges, the same
// upper_/lower_ masks and the same rune count as this one.
// The result comes from a plain `new`; releasing it is left to the caller.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->nrunes_ = nrunes_;
  dup->lower_ = lower_;
  dup->upper_ = upper_;
  iterator src = begin();
  while (src != end()) {
    dup->ranges_.insert(RuneRange(src->lo, src->hi));
    ++src;
  }
  return dup;
}
|
778
|
+
|
779
|
+
|
780
|
+
|
781
|
+
// Drops every rune greater than r from the class.
// Does nothing when r >= Runemax.
void CharClassBuilder::RemoveAbove(Rune r) {
  if (r >= Runemax)
    return;

  // Trim the lowercase mask: cleared entirely when r is below 'a',
  // otherwise narrowed by shifting AlphaMask right by ('z' - r).
  // NOTE(review): presumably bit i stands for letter 'a'+i -- confirm
  // against the definition of AlphaMask.
  if (r < 'a')
    lower_ = 0;
  else if (r < 'z')
    lower_ &= AlphaMask >> ('z' - r);

  // Same treatment for the uppercase mask.
  if (r < 'A')
    upper_ = 0;
  else if (r < 'Z')
    upper_ &= AlphaMask >> ('Z' - r);

  // Repeatedly pull out whatever range the set reports for [r+1, Runemax].
  // A range that starts at or below r is put back truncated to end at r;
  // the truncated range no longer reaches r+1, so the loop makes progress.
  const RuneRange above(r + 1, Runemax);
  for (iterator hit = ranges_.find(above); hit != end();
       hit = ranges_.find(above)) {
    const RuneRange old = *hit;
    ranges_.erase(hit);
    nrunes_ -= old.hi - old.lo + 1;
    if (old.lo <= r) {
      const RuneRange kept(old.lo, r);
      ranges_.insert(kept);
      nrunes_ += kept.hi - kept.lo + 1;
    }
  }
}
|
814
|
+
|
815
|
+
// Replaces the class with its complement over [0, Runemax].
// The complement is collected in a temporary vector first and then
// copied back, because the stored ranges cannot be edited in place.
void CharClassBuilder::Negate() {
  vector<RuneRange> v;
  // One spare slot beyond the current range count is reserved for the result.
  v.reserve(ranges_.size() + 1);

  // In negation, first range begins at 0, unless
  // the current class begins at 0.
  iterator it = begin();
  if (it == end()) {
    // Empty class: the complement is everything.
    v.push_back(RuneRange(0, Runemax));
  } else {
    int nextlo = 0;
    if (it->lo == 0) {
      nextlo = it->hi + 1;
      ++it;
    }
    // Each remaining range contributes the gap that precedes it.
    for (; it != end(); ++it) {
      v.push_back(RuneRange(nextlo, it->lo - 1));
      nextlo = it->hi + 1;
    }
    // Trailing gap, unless the last range already reached Runemax.
    if (nextlo <= Runemax)
      v.push_back(RuneRange(nextlo, Runemax));
  }

  // Swap in the complement.  The range insert replaces an index loop
  // that compared a signed int against the unsigned v.size().
  ranges_.clear();
  ranges_.insert(v.begin(), v.end());

  // Flip the letter masks within AlphaMask and recompute the rune count.
  upper_ = AlphaMask & ~upper_;
  lower_ = AlphaMask & ~lower_;
  nrunes_ = Runemax+1 - nrunes_;
}
|
848
|
+
|
849
|
+
// Character class is a sorted list of ranges.
|
850
|
+
// The ranges are allocated in the same block as the header,
|
851
|
+
// necessitating a special allocator and Delete method.
|
852
|
+
|
853
|
+
CharClass* CharClass::New(int maxranges) {
|
854
|
+
CharClass* cc;
|
855
|
+
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
|
856
|
+
cc = reinterpret_cast<CharClass*>(data);
|
857
|
+
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
|
858
|
+
cc->nranges_ = 0;
|
859
|
+
cc->folds_ascii_ = false;
|
860
|
+
cc->nrunes_ = 0;
|
861
|
+
return cc;
|
862
|
+
}
|
863
|
+
|
864
|
+
void CharClass::Delete() {
|
865
|
+
if (this == NULL)
|
866
|
+
return;
|
867
|
+
uint8 *data = reinterpret_cast<uint8*>(this);
|
868
|
+
delete[] data;
|
869
|
+
}
|
870
|
+
|
871
|
+
// Returns a newly allocated CharClass holding the complement of this one
// over [0, Runemax].  The complement of nranges_ sorted ranges has at most
// nranges_+1 gaps, which is the capacity requested from New.
CharClass* CharClass::Negate() {
  CharClass* neg = CharClass::New(nranges_+1);
  neg->nrunes_ = Runemax + 1 - nrunes_;
  neg->folds_ascii_ = folds_ascii_;

  int filled = 0;
  int gap_lo = 0;  // first rune not yet accounted for
  for (CharClass::iterator cur = begin(); cur != end(); ++cur) {
    // Emit the gap in front of this range, if there is one.
    if (cur->lo != gap_lo)
      neg->ranges_[filled++] = RuneRange(gap_lo, cur->lo - 1);
    gap_lo = cur->hi + 1;
  }
  // Whatever is left after the last range.
  if (gap_lo <= Runemax)
    neg->ranges_[filled++] = RuneRange(gap_lo, Runemax);

  neg->nranges_ = filled;
  return neg;
}
|
890
|
+
|
891
|
+
// Binary search over the ranges_ array for a range that covers r.
bool CharClass::Contains(Rune r) {
  int first = 0;          // index of the first candidate range
  int count = nranges_;   // number of candidates still in play
  while (count > 0) {
    const int half = count/2;
    const RuneRange& mid = ranges_[first + half];
    if (r > mid.hi) {
      // r lies beyond the middle range: continue in the upper part.
      first += half+1;
      count -= half+1;
    } else if (r < mid.lo) {
      // r lies before the middle range: continue in the lower part.
      count = half;
    } else {
      // mid.lo <= r && r <= mid.hi
      return true;
    }
  }
  return false;
}
|
907
|
+
|
908
|
+
// Builds a CharClass snapshot of this builder: the ranges are copied in
// iteration order into a block obtained from CharClass::New, together
// with the rune count and the FoldsASCII() result.
CharClass* CharClassBuilder::GetCharClass() {
  CharClass* out = CharClass::New(ranges_.size());
  out->folds_ascii_ = FoldsASCII();
  out->nrunes_ = nrunes_;

  int n = 0;
  iterator src = begin();
  while (src != end()) {
    out->ranges_[n] = *src;
    ++n;
    ++src;
  }
  out->nranges_ = n;
  DCHECK_LE(n, ranges_.size());
  return out;
}
|
919
|
+
|
920
|
+
} // namespace re2
|