chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/prog.cc
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Compiled regular expression representation.
|
|
6
|
+
// Tested by compile_test.cc
|
|
7
|
+
|
|
8
|
+
#include "util/util.h"
|
|
9
|
+
#include "util/sparse_set.h"
|
|
10
|
+
#include "re2/prog.h"
|
|
11
|
+
#include "re2/stringpiece.h"
|
|
12
|
+
|
|
13
|
+
namespace re2 {
|
|
14
|
+
|
|
15
|
+
// Constructors per Inst opcode
|
|
16
|
+
|
|
17
|
+
// Turns this instruction into a kInstAlt whose two successors are
// `out` and `out1`.
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstAlt);
  out1_ = out1;
}
|
|
22
|
+
|
|
23
|
+
// Turns this instruction into a kInstByteRange that continues at `out`.
// Only the low 8 bits of `lo` and `hi` are stored; `foldcase` is stored
// unchanged.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstByteRange);
  lo_ = lo & 0xFF;
  hi_ = hi & 0xFF;
  foldcase_ = foldcase;
}
|
|
30
|
+
|
|
31
|
+
// Turns this instruction into a kInstCapture that records `cap` and
// continues at `out`.
void Prog::Inst::InitCapture(int cap, uint32 out) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstCapture);
  cap_ = cap;
}
|
|
36
|
+
|
|
37
|
+
// Turns this instruction into a kInstEmptyWidth that stores the
// empty-width condition `empty` and continues at `out`.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstEmptyWidth);
  empty_ = empty;
}
|
|
42
|
+
|
|
43
|
+
// Turns this instruction into a kInstMatch tagged with match id `id`.
void Prog::Inst::InitMatch(int32 id) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_opcode(kInstMatch);
  match_id_ = id;
}
|
|
48
|
+
|
|
49
|
+
// Turns this instruction into a kInstNop that continues at `out`.
//
// The successor is recorded together with the opcode, matching the other
// Init* functions that take an `out` argument.  Previously `out` was
// silently ignored; for out == 0 the resulting instruction is unchanged.
void Prog::Inst::InitNop(uint32 out) {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstNop);
}
|
|
53
|
+
|
|
54
|
+
// Turns this instruction into a kInstFail.
void Prog::Inst::InitFail() {
  // Debug-only check that out_opcode_ is still zero before it is set.
  DCHECK_EQ(out_opcode_, 0);

  set_opcode(kInstFail);
}
|
|
58
|
+
|
|
59
|
+
// Returns a one-line, human-readable description of this instruction.
// The text depends on the opcode; opcodes without a dedicated format are
// printed as "opcode N".
string Prog::Inst::Dump() {
  switch (opcode()) {
    case kInstFail:
      return StringPrintf("fail");

    case kInstNop:
      return StringPrintf("nop -> %d", out());

    case kInstMatch:
      return StringPrintf("match! %d", match_id());

    case kInstEmptyWidth:
      return StringPrintf("emptywidth %#x -> %d",
                          static_cast<int>(empty_), out());

    case kInstCapture:
      return StringPrintf("capture %d -> %d", cap_, out());

    case kInstByteRange:
      // "/i" marks a case-folding byte range.
      return StringPrintf("byte%s [%02x-%02x] -> %d",
                          foldcase_ ? "/i" : "",
                          lo_, hi_, out());

    case kInstAltMatch:
      return StringPrintf("altmatch -> %d | %d", out(), out1_);

    case kInstAlt:
      return StringPrintf("alt -> %d | %d", out(), out1_);

    default:
      return StringPrintf("opcode %d", static_cast<int>(opcode()));
  }
}
|
|
92
|
+
|
|
93
|
+
// Builds an empty program: all flags false, all counters and sizes zero,
// and all pointers NULL.
Prog::Prog()
    // Boolean state.
    : anchor_start_(false),
      anchor_end_(false),
      reversed_(false),
      did_onepass_(false),
      // Entry points, sizes and counters.
      start_(0),
      start_unanchored_(0),
      size_(0),
      byte_inst_count_(0),
      bytemap_range_(0),
      flags_(0),
      onepass_statesize_(0),
      // Pointers and the DFA memory budget; the pointed-to data is
      // released by ~Prog().
      inst_(NULL),
      dfa_first_(NULL),
      dfa_longest_(NULL),
      dfa_mem_(0),
      delete_dfa_(NULL),
      unbytemap_(NULL),
      onepass_nodes_(NULL),
      onepass_start_(NULL) {
}
|
|
114
|
+
|
|
115
|
+
// Releases everything the program owns.  The DFAs are destroyed through
// the delete_dfa_ callback, and only when that callback has been set.
Prog::~Prog() {
  if (delete_dfa_ != NULL) {
    if (dfa_first_ != NULL) {
      delete_dfa_(dfa_first_);
    }
    if (dfa_longest_ != NULL) {
      delete_dfa_(dfa_longest_);
    }
  }

  // Array members; delete[] on a null pointer is a no-op.
  delete[] onepass_nodes_;
  delete[] inst_;
  delete[] unbytemap_;
}
|
|
126
|
+
|
|
127
|
+
typedef SparseSet Workq;
|
|
128
|
+
|
|
129
|
+
static inline void AddToQueue(Workq* q, int id) {
|
|
130
|
+
if (id != 0)
|
|
131
|
+
q->insert(id);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// Renders every instruction reachable from the ids already in `q`, one
// "<id>. <dump>" line per instruction.  `q` grows while it is being
// walked: each visited instruction queues its out() successor, and Alt /
// AltMatch instructions also queue out1().
static string ProgToString(Prog* prog, Workq* q) {
  string text;

  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
    const int id = *it;
    Prog::Inst* inst = prog->inst(id);

    StringAppendF(&text, "%d. %s\n", id, inst->Dump().c_str());

    AddToQueue(q, inst->out());
    const bool has_second_branch =
        inst->opcode() == kInstAlt || inst->opcode() == kInstAltMatch;
    if (has_second_branch)
      AddToQueue(q, inst->out1());
  }

  return text;
}
|
|
147
|
+
|
|
148
|
+
// Returns a textual listing of the instructions reachable from start_.
// The byte-map listing in front of it is compiled in but disabled.
string Prog::Dump() {
  string bytemap_text;
  if (false) {  // Debugging
    StringAppendF(&bytemap_text, "byte map:\n");
    int range_start = 0;
    for (int cls = 0; cls < bytemap_range_; cls++) {
      // unbytemap_[cls] is printed as the upper end of class `cls`.
      StringAppendF(&bytemap_text, "\t%d. [%02x-%02x]\n",
                    cls, range_start, unbytemap_[cls]);
      range_start = unbytemap_[cls] + 1;
    }
    StringAppendF(&bytemap_text, "\n");
  }

  Workq reachable(size_);
  AddToQueue(&reachable, start_);
  return bytemap_text + ProgToString(this, &reachable);
}
|
|
164
|
+
|
|
165
|
+
// Same listing as Dump(), but the traversal begins at start_unanchored_
// and no byte-map text is prepended.
string Prog::DumpUnanchored() {
  Workq reachable(size_);
  AddToQueue(&reachable, start_unanchored_);
  return ProgToString(this, &reachable);
}
|
|
170
|
+
|
|
171
|
+
static bool IsMatch(Prog*, Prog::Inst*);
|
|
172
|
+
|
|
173
|
+
// Peep-hole optimizer.
|
|
174
|
+
void Prog::Optimize() {
|
|
175
|
+
Workq q(size_);
|
|
176
|
+
|
|
177
|
+
// Eliminate nops. Most are taken out during compilation
|
|
178
|
+
// but a few are hard to avoid.
|
|
179
|
+
q.clear();
|
|
180
|
+
AddToQueue(&q, start_);
|
|
181
|
+
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
|
182
|
+
int id = *i;
|
|
183
|
+
|
|
184
|
+
Inst* ip = inst(id);
|
|
185
|
+
int j = ip->out();
|
|
186
|
+
Inst* jp;
|
|
187
|
+
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
|
188
|
+
j = jp->out();
|
|
189
|
+
}
|
|
190
|
+
ip->set_out(j);
|
|
191
|
+
AddToQueue(&q, ip->out());
|
|
192
|
+
|
|
193
|
+
if (ip->opcode() == kInstAlt) {
|
|
194
|
+
j = ip->out1();
|
|
195
|
+
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
|
196
|
+
j = jp->out();
|
|
197
|
+
}
|
|
198
|
+
ip->out1_ = j;
|
|
199
|
+
AddToQueue(&q, ip->out1());
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Insert kInstAltMatch instructions
|
|
204
|
+
// Look for
|
|
205
|
+
// ip: Alt -> j | k
|
|
206
|
+
// j: ByteRange [00-FF] -> ip
|
|
207
|
+
// k: Match
|
|
208
|
+
// or the reverse (the above is the greedy one).
|
|
209
|
+
// Rewrite Alt to AltMatch.
|
|
210
|
+
q.clear();
|
|
211
|
+
AddToQueue(&q, start_);
|
|
212
|
+
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
|
213
|
+
int id = *i;
|
|
214
|
+
Inst* ip = inst(id);
|
|
215
|
+
AddToQueue(&q, ip->out());
|
|
216
|
+
if (ip->opcode() == kInstAlt)
|
|
217
|
+
AddToQueue(&q, ip->out1());
|
|
218
|
+
|
|
219
|
+
if (ip->opcode() == kInstAlt) {
|
|
220
|
+
Inst* j = inst(ip->out());
|
|
221
|
+
Inst* k = inst(ip->out1());
|
|
222
|
+
if (j->opcode() == kInstByteRange && j->out() == id &&
|
|
223
|
+
j->lo() == 0x00 && j->hi() == 0xFF &&
|
|
224
|
+
IsMatch(this, k)) {
|
|
225
|
+
ip->set_opcode(kInstAltMatch);
|
|
226
|
+
continue;
|
|
227
|
+
}
|
|
228
|
+
if (IsMatch(this, j) &&
|
|
229
|
+
k->opcode() == kInstByteRange && k->out() == id &&
|
|
230
|
+
k->lo() == 0x00 && k->hi() == 0xFF) {
|
|
231
|
+
ip->set_opcode(kInstAltMatch);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Is ip a guaranteed match at end of text, perhaps after some capturing?
|
|
238
|
+
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
|
|
239
|
+
for (;;) {
|
|
240
|
+
switch (ip->opcode()) {
|
|
241
|
+
default:
|
|
242
|
+
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
|
|
243
|
+
return false;
|
|
244
|
+
|
|
245
|
+
case kInstAlt:
|
|
246
|
+
case kInstAltMatch:
|
|
247
|
+
case kInstByteRange:
|
|
248
|
+
case kInstFail:
|
|
249
|
+
case kInstEmptyWidth:
|
|
250
|
+
return false;
|
|
251
|
+
|
|
252
|
+
case kInstCapture:
|
|
253
|
+
case kInstNop:
|
|
254
|
+
ip = prog->inst(ip->out());
|
|
255
|
+
break;
|
|
256
|
+
|
|
257
|
+
case kInstMatch:
|
|
258
|
+
return true;
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
// Computes the empty-width flags that hold at position `p` of `text`:
// begin/end of text, begin/end of line, and word boundary versus
// non-word boundary.  p[-1] is read only when p is not text.begin(), and
// p[0] is read only when p is not text.end().
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin) {
    flags |= kEmptyBeginText | kEmptyBeginLine;
  } else if (p[-1] == '\n') {
    flags |= kEmptyBeginLine;
  }

  // $ and \z
  if (at_end) {
    flags |= kEmptyEndText | kEmptyEndLine;
  } else if (p < text.end() && p[0] == '\n') {
    flags |= kEmptyEndLine;
  }

  // \b and \B
  bool boundary;
  if (at_begin && at_end) {
    boundary = false;  // empty text: no word boundary here
  } else if (at_begin) {
    boundary = IsWordChar(p[0]);
  } else if (at_end) {
    boundary = IsWordChar(p[-1]);
  } else {
    boundary = (IsWordChar(p[-1]) != IsWordChar(p[0]));
  }
  if (boundary)
    flags |= kEmptyWordBoundary;
  if (!(flags & kEmptyWordBoundary))
    flags |= kEmptyNonWordBoundary;

  return flags;
}
|
|
296
|
+
|
|
297
|
+
// Records the byte range [lo, hi] in byterange_ by setting the bit for
// the byte just below the range (when there is one) and the bit for the
// last byte of the range.  Both bounds must lie in 0..255.
void Prog::MarkByteRange(int lo, int hi) {
  CHECK_GE(lo, 0);
  CHECK_GE(hi, 0);
  CHECK_LE(lo, 255);
  CHECK_LE(hi, 255);

  // lo == 0 has no preceding byte to mark.
  if (lo >= 1) {
    byterange_.Set(lo - 1);
  }
  byterange_.Set(hi);
}
|
|
306
|
+
|
|
307
|
+
void Prog::ComputeByteMap() {
|
|
308
|
+
// Fill in bytemap with byte classes for prog_.
|
|
309
|
+
// Ranges of bytes that are treated as indistinguishable
|
|
310
|
+
// by the regexp program are mapped to a single byte class.
|
|
311
|
+
// The vector prog_->byterange() marks the end of each
|
|
312
|
+
// such range.
|
|
313
|
+
const Bitmap<256>& v = byterange();
|
|
314
|
+
|
|
315
|
+
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
|
|
316
|
+
uint8 n = 0;
|
|
317
|
+
uint32 bits = 0;
|
|
318
|
+
for (int i = 0; i < 256; i++) {
|
|
319
|
+
if ((i&31) == 0)
|
|
320
|
+
bits = v.Word(i >> 5);
|
|
321
|
+
bytemap_[i] = n;
|
|
322
|
+
n += bits & 1;
|
|
323
|
+
bits >>= 1;
|
|
324
|
+
}
|
|
325
|
+
bytemap_range_ = bytemap_[255] + 1;
|
|
326
|
+
unbytemap_ = new uint8[bytemap_range_];
|
|
327
|
+
for (int i = 0; i < 256; i++)
|
|
328
|
+
unbytemap_[bytemap_[i]] = i;
|
|
329
|
+
|
|
330
|
+
if (0) { // For debugging: use trivial byte map.
|
|
331
|
+
for (int i = 0; i < 256; i++) {
|
|
332
|
+
bytemap_[i] = i;
|
|
333
|
+
unbytemap_[i] = i;
|
|
334
|
+
}
|
|
335
|
+
bytemap_range_ = 256;
|
|
336
|
+
LOG(INFO) << "Using trivial bytemap.";
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
} // namespace re2
|
|
341
|
+
|