chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
/* This file was generated automatically by the Snowball to ANSI C compiler */
|
3
|
+
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
extern struct SN_env * turkish_UTF_8_create_env(void);
|
9
|
+
extern void turkish_UTF_8_close_env(struct SN_env * z);
|
10
|
+
|
11
|
+
extern int turkish_UTF_8_stem(struct SN_env * z);
|
12
|
+
|
13
|
+
#ifdef __cplusplus
|
14
|
+
}
|
15
|
+
#endif
|
16
|
+
|
data/ext/re2/bitstate.cc
ADDED
@@ -0,0 +1,378 @@
|
|
1
|
+
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
6
|
+
|
7
|
+
// Prog::SearchBitState is a regular expression search with submatch
|
8
|
+
// tracking for small regular expressions and texts. Like
|
9
|
+
// testing/backtrack.cc, it allocates a bit vector with (length of
|
10
|
+
// text) * (length of prog) bits, to make sure it never explores the
|
11
|
+
// same (character position, instruction) state multiple times. This
|
12
|
+
// limits the search to run in time linear in the length of the text.
|
13
|
+
//
|
14
|
+
// Unlike testing/backtrack.cc, SearchBitState is not recursive
|
15
|
+
// on the text.
|
16
|
+
//
|
17
|
+
// SearchBitState is a fast replacement for the NFA code on small
|
18
|
+
// regexps and texts when SearchOnePass cannot be used.
|
19
|
+
|
20
|
+
#include "re2/prog.h"
|
21
|
+
#include "re2/regexp.h"
|
22
|
+
|
23
|
+
namespace re2 {
|
24
|
+
|
25
|
+
struct Job {
|
26
|
+
int id;
|
27
|
+
int arg;
|
28
|
+
const char* p;
|
29
|
+
};
|
30
|
+
|
31
|
+
class BitState {
|
32
|
+
public:
|
33
|
+
explicit BitState(Prog* prog);
|
34
|
+
~BitState();
|
35
|
+
|
36
|
+
// The usual Search prototype.
|
37
|
+
// Can only call Search once per BitState.
|
38
|
+
bool Search(const StringPiece& text, const StringPiece& context,
|
39
|
+
bool anchored, bool longest,
|
40
|
+
StringPiece* submatch, int nsubmatch);
|
41
|
+
|
42
|
+
private:
|
43
|
+
inline bool ShouldVisit(int id, const char* p);
|
44
|
+
void Push(int id, const char* p, int arg);
|
45
|
+
bool GrowStack();
|
46
|
+
bool TrySearch(int id, const char* p);
|
47
|
+
|
48
|
+
// Search parameters
|
49
|
+
Prog* prog_; // program being run
|
50
|
+
StringPiece text_; // text being searched
|
51
|
+
StringPiece context_; // greater context of text being searched
|
52
|
+
bool anchored_; // whether search is anchored at text.begin()
|
53
|
+
bool longest_; // whether search wants leftmost-longest match
|
54
|
+
bool endmatch_; // whether match must end at text.end()
|
55
|
+
StringPiece *submatch_; // submatches to fill in
|
56
|
+
int nsubmatch_; // # of submatches to fill in
|
57
|
+
|
58
|
+
// Search state
|
59
|
+
const char** cap_; // capture registers
|
60
|
+
int ncap_;
|
61
|
+
|
62
|
+
static const int VisitedBits = 32;
|
63
|
+
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
64
|
+
int nvisited_; // # of words in bitmap
|
65
|
+
|
66
|
+
Job *job_; // stack of text positions to explore
|
67
|
+
int njob_;
|
68
|
+
int maxjob_;
|
69
|
+
};
|
70
|
+
|
71
|
+
BitState::BitState(Prog* prog)
|
72
|
+
: prog_(prog),
|
73
|
+
anchored_(false),
|
74
|
+
longest_(false),
|
75
|
+
endmatch_(false),
|
76
|
+
submatch_(NULL),
|
77
|
+
nsubmatch_(0),
|
78
|
+
cap_(NULL),
|
79
|
+
ncap_(0),
|
80
|
+
visited_(NULL),
|
81
|
+
nvisited_(0),
|
82
|
+
job_(NULL),
|
83
|
+
njob_(0),
|
84
|
+
maxjob_(0) {
|
85
|
+
}
|
86
|
+
|
87
|
+
BitState::~BitState() {
|
88
|
+
delete[] visited_;
|
89
|
+
delete[] job_;
|
90
|
+
delete[] cap_;
|
91
|
+
}
|
92
|
+
|
93
|
+
// Should the search visit the pair ip, p?
|
94
|
+
// If so, remember that it was visited so that the next time,
|
95
|
+
// we don't repeat the visit.
|
96
|
+
bool BitState::ShouldVisit(int id, const char* p) {
|
97
|
+
uint n = id * (text_.size() + 1) + (p - text_.begin());
|
98
|
+
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
99
|
+
return false;
|
100
|
+
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
101
|
+
return true;
|
102
|
+
}
|
103
|
+
|
104
|
+
// Grow the stack.
|
105
|
+
bool BitState::GrowStack() {
|
106
|
+
// VLOG(0) << "Reallocate.";
|
107
|
+
maxjob_ *= 2;
|
108
|
+
Job* newjob = new Job[maxjob_];
|
109
|
+
memmove(newjob, job_, njob_*sizeof job_[0]);
|
110
|
+
delete[] job_;
|
111
|
+
job_ = newjob;
|
112
|
+
if (njob_ >= maxjob_) {
|
113
|
+
LOG(DFATAL) << "Job stack overflow.";
|
114
|
+
return false;
|
115
|
+
}
|
116
|
+
return true;
|
117
|
+
}
|
118
|
+
|
119
|
+
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
|
120
|
+
void BitState::Push(int id, const char* p, int arg) {
|
121
|
+
if (njob_ >= maxjob_) {
|
122
|
+
if (!GrowStack())
|
123
|
+
return;
|
124
|
+
}
|
125
|
+
int op = prog_->inst(id)->opcode();
|
126
|
+
if (op == kInstFail)
|
127
|
+
return;
|
128
|
+
|
129
|
+
// Only check ShouldVisit when arg == 0.
|
130
|
+
// When arg > 0, we are continuing a previous visit.
|
131
|
+
if (arg == 0 && !ShouldVisit(id, p))
|
132
|
+
return;
|
133
|
+
|
134
|
+
Job* j = &job_[njob_++];
|
135
|
+
j->id = id;
|
136
|
+
j->p = p;
|
137
|
+
j->arg = arg;
|
138
|
+
}
|
139
|
+
|
140
|
+
// Try a search from instruction id0 in state p0.
|
141
|
+
// Return whether it succeeded.
|
142
|
+
bool BitState::TrySearch(int id0, const char* p0) {
|
143
|
+
bool matched = false;
|
144
|
+
const char* end = text_.end();
|
145
|
+
njob_ = 0;
|
146
|
+
Push(id0, p0, 0);
|
147
|
+
while (njob_ > 0) {
|
148
|
+
// Pop job off stack.
|
149
|
+
--njob_;
|
150
|
+
int id = job_[njob_].id;
|
151
|
+
const char* p = job_[njob_].p;
|
152
|
+
int arg = job_[njob_].arg;
|
153
|
+
|
154
|
+
// Optimization: rather than push and pop,
|
155
|
+
// code that is going to Push and continue
|
156
|
+
// the loop simply updates ip, p, and arg
|
157
|
+
// and jumps to CheckAndLoop. We have to
|
158
|
+
// do the ShouldVisit check that Push
|
159
|
+
// would have, but we avoid the stack
|
160
|
+
// manipulation.
|
161
|
+
if (0) {
|
162
|
+
CheckAndLoop:
|
163
|
+
if (!ShouldVisit(id, p))
|
164
|
+
continue;
|
165
|
+
}
|
166
|
+
|
167
|
+
// Visit ip, p.
|
168
|
+
// VLOG(0) << "Job: " << ip->id() << " "
|
169
|
+
// << (p - text_.begin()) << " " << arg;
|
170
|
+
Prog::Inst* ip = prog_->inst(id);
|
171
|
+
switch (ip->opcode()) {
|
172
|
+
case kInstFail:
|
173
|
+
default:
|
174
|
+
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
175
|
+
return false;
|
176
|
+
|
177
|
+
case kInstAlt:
|
178
|
+
// Cannot just
|
179
|
+
// Push(ip->out1(), p, 0);
|
180
|
+
// Push(ip->out(), p, 0);
|
181
|
+
// If, during the processing of ip->out(), we encounter
|
182
|
+
// ip->out1() via another path, we want to process it then.
|
183
|
+
// Pushing it here will inhibit that. Instead, re-push
|
184
|
+
// ip with arg==1 as a reminder to push ip->out1() later.
|
185
|
+
switch (arg) {
|
186
|
+
case 0:
|
187
|
+
Push(id, p, 1); // come back when we're done
|
188
|
+
id = ip->out();
|
189
|
+
goto CheckAndLoop;
|
190
|
+
|
191
|
+
case 1:
|
192
|
+
// Finished ip->out(); try ip->out1().
|
193
|
+
arg = 0;
|
194
|
+
id = ip->out1();
|
195
|
+
goto CheckAndLoop;
|
196
|
+
}
|
197
|
+
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
198
|
+
continue;
|
199
|
+
|
200
|
+
case kInstAltMatch:
|
201
|
+
// One opcode is byte range; the other leads to match.
|
202
|
+
if (ip->greedy(prog_)) {
|
203
|
+
// out1 is the match
|
204
|
+
Push(ip->out1(), p, 0);
|
205
|
+
id = ip->out1();
|
206
|
+
p = end;
|
207
|
+
goto CheckAndLoop;
|
208
|
+
}
|
209
|
+
// out is the match - non-greedy
|
210
|
+
Push(ip->out(), end, 0);
|
211
|
+
id = ip->out();
|
212
|
+
goto CheckAndLoop;
|
213
|
+
|
214
|
+
case kInstByteRange: {
|
215
|
+
int c = -1;
|
216
|
+
if (p < end)
|
217
|
+
c = *p & 0xFF;
|
218
|
+
if (ip->Matches(c)) {
|
219
|
+
id = ip->out();
|
220
|
+
p++;
|
221
|
+
goto CheckAndLoop;
|
222
|
+
}
|
223
|
+
continue;
|
224
|
+
}
|
225
|
+
|
226
|
+
case kInstCapture:
|
227
|
+
switch (arg) {
|
228
|
+
case 0:
|
229
|
+
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
230
|
+
// Capture p to register, but save old value.
|
231
|
+
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
232
|
+
cap_[ip->cap()] = p;
|
233
|
+
}
|
234
|
+
// Continue on.
|
235
|
+
id = ip->out();
|
236
|
+
goto CheckAndLoop;
|
237
|
+
case 1:
|
238
|
+
// Finished ip->out(); restore the old value.
|
239
|
+
cap_[ip->cap()] = p;
|
240
|
+
continue;
|
241
|
+
}
|
242
|
+
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
243
|
+
continue;
|
244
|
+
|
245
|
+
case kInstEmptyWidth:
|
246
|
+
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
247
|
+
continue;
|
248
|
+
id = ip->out();
|
249
|
+
goto CheckAndLoop;
|
250
|
+
|
251
|
+
case kInstNop:
|
252
|
+
id = ip->out();
|
253
|
+
goto CheckAndLoop;
|
254
|
+
|
255
|
+
case kInstMatch: {
|
256
|
+
if (endmatch_ && p != text_.end())
|
257
|
+
continue;
|
258
|
+
|
259
|
+
// VLOG(0) << "Found match.";
|
260
|
+
// We found a match. If the caller doesn't care
|
261
|
+
// where the match is, no point going further.
|
262
|
+
if (nsubmatch_ == 0)
|
263
|
+
return true;
|
264
|
+
|
265
|
+
// Record best match so far.
|
266
|
+
// Only need to check end point, because this entire
|
267
|
+
// call is only considering one start position.
|
268
|
+
matched = true;
|
269
|
+
cap_[1] = p;
|
270
|
+
if (submatch_[0].data() == NULL ||
|
271
|
+
(longest_ && p > submatch_[0].end())) {
|
272
|
+
for (int i = 0; i < nsubmatch_; i++)
|
273
|
+
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
|
274
|
+
}
|
275
|
+
|
276
|
+
// If going for first match, we're done.
|
277
|
+
if (!longest_)
|
278
|
+
return true;
|
279
|
+
|
280
|
+
// If we used the entire text, no longer match is possible.
|
281
|
+
if (p == text_.end())
|
282
|
+
return true;
|
283
|
+
|
284
|
+
// Otherwise, continue on in hope of a longer match.
|
285
|
+
continue;
|
286
|
+
}
|
287
|
+
}
|
288
|
+
}
|
289
|
+
return matched;
|
290
|
+
}
|
291
|
+
|
292
|
+
// Search text (within context) for prog_.
|
293
|
+
bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
294
|
+
bool anchored, bool longest,
|
295
|
+
StringPiece* submatch, int nsubmatch) {
|
296
|
+
// Search parameters.
|
297
|
+
text_ = text;
|
298
|
+
context_ = context;
|
299
|
+
if (context_.begin() == NULL)
|
300
|
+
context_ = text;
|
301
|
+
if (prog_->anchor_start() && context_.begin() != text.begin())
|
302
|
+
return false;
|
303
|
+
if (prog_->anchor_end() && context_.end() != text.end())
|
304
|
+
return false;
|
305
|
+
anchored_ = anchored || prog_->anchor_start();
|
306
|
+
longest_ = longest || prog_->anchor_end();
|
307
|
+
endmatch_ = prog_->anchor_end();
|
308
|
+
submatch_ = submatch;
|
309
|
+
nsubmatch_ = nsubmatch;
|
310
|
+
for (int i = 0; i < nsubmatch_; i++)
|
311
|
+
submatch_[i] = NULL;
|
312
|
+
|
313
|
+
// Allocate scratch space.
|
314
|
+
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
315
|
+
visited_ = new uint32[nvisited_];
|
316
|
+
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
317
|
+
// VLOG(0) << "nvisited_ = " << nvisited_;
|
318
|
+
|
319
|
+
ncap_ = 2*nsubmatch;
|
320
|
+
if (ncap_ < 2)
|
321
|
+
ncap_ = 2;
|
322
|
+
cap_ = new const char*[ncap_];
|
323
|
+
memset(cap_, 0, ncap_*sizeof cap_[0]);
|
324
|
+
|
325
|
+
maxjob_ = 256;
|
326
|
+
job_ = new Job[maxjob_];
|
327
|
+
|
328
|
+
// Anchored search must start at text.begin().
|
329
|
+
if (anchored_) {
|
330
|
+
cap_[0] = text.begin();
|
331
|
+
return TrySearch(prog_->start(), text.begin());
|
332
|
+
}
|
333
|
+
|
334
|
+
// Unanchored search, starting from each possible text position.
|
335
|
+
// Notice that we have to try the empty string at the end of
|
336
|
+
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
337
|
+
// This looks like it's quadratic in the size of the text,
|
338
|
+
// but we are not clearing visited_ between calls to TrySearch,
|
339
|
+
// so no work is duplicated and it ends up still being linear.
|
340
|
+
for (const char* p = text.begin(); p <= text.end(); p++) {
|
341
|
+
cap_[0] = p;
|
342
|
+
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
343
|
+
return true;
|
344
|
+
}
|
345
|
+
return false;
|
346
|
+
}
|
347
|
+
|
348
|
+
// Bit-state search.
|
349
|
+
bool Prog::SearchBitState(const StringPiece& text,
|
350
|
+
const StringPiece& context,
|
351
|
+
Anchor anchor,
|
352
|
+
MatchKind kind,
|
353
|
+
StringPiece* match,
|
354
|
+
int nmatch) {
|
355
|
+
// If full match, we ask for an anchored longest match
|
356
|
+
// and then check that match[0] == text.
|
357
|
+
// So make sure match[0] exists.
|
358
|
+
StringPiece sp0;
|
359
|
+
if (kind == kFullMatch) {
|
360
|
+
anchor = kAnchored;
|
361
|
+
if (nmatch < 1) {
|
362
|
+
match = &sp0;
|
363
|
+
nmatch = 1;
|
364
|
+
}
|
365
|
+
}
|
366
|
+
|
367
|
+
// Run the search.
|
368
|
+
BitState b(this);
|
369
|
+
bool anchored = anchor == kAnchored;
|
370
|
+
bool longest = kind != kFirstMatch;
|
371
|
+
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
372
|
+
return false;
|
373
|
+
if (kind == kFullMatch && match[0].end() != text.end())
|
374
|
+
return false;
|
375
|
+
return true;
|
376
|
+
}
|
377
|
+
|
378
|
+
} // namespace re2
|