chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/re2.cc
ADDED
@@ -0,0 +1,1180 @@
|
|
1
|
+
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Regular expression interface RE2.
|
6
|
+
//
|
7
|
+
// Originally the PCRE C++ wrapper, but adapted to use
|
8
|
+
// the new automata-based regular expression engines.
|
9
|
+
|
10
|
+
#include "re2/re2.h"
|
11
|
+
|
12
|
+
#include <stdio.h>
|
13
|
+
#include <string>
|
14
|
+
#include <pthread.h>
|
15
|
+
#include <errno.h>
|
16
|
+
#include "util/util.h"
|
17
|
+
#include "util/flags.h"
|
18
|
+
#include "re2/prog.h"
|
19
|
+
#include "re2/regexp.h"
|
20
|
+
|
21
|
+
DEFINE_bool(trace_re2, false, "trace RE2 execution");
|
22
|
+
|
23
|
+
namespace re2 {
|
24
|
+
|
25
|
+
// Maximum number of args we can set
|
26
|
+
static const int kMaxArgs = 16;
|
27
|
+
static const int kVecSize = 1+kMaxArgs;
|
28
|
+
|
29
|
+
const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
|
30
|
+
const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
|
31
|
+
const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
|
32
|
+
const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
|
33
|
+
|
34
|
+
const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
|
35
|
+
|
36
|
+
// Commonly-used option sets; arguments to constructor are:
|
37
|
+
// utf8 input
|
38
|
+
// posix syntax
|
39
|
+
// longest match
|
40
|
+
// log errors
|
41
|
+
const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true
|
42
|
+
const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);
|
43
|
+
const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
|
44
|
+
const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
|
45
|
+
|
46
|
+
// If a regular expression has no error, its error_ field points here
|
47
|
+
static const string empty_string;
|
48
|
+
|
49
|
+
// Converts from Regexp error code to RE2 error code.
|
50
|
+
// Maybe some day they will diverge. In any event, this
|
51
|
+
// hides the existence of Regexp from RE2 users.
|
52
|
+
static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
|
53
|
+
switch (code) {
|
54
|
+
case re2::kRegexpSuccess:
|
55
|
+
return RE2::NoError;
|
56
|
+
case re2::kRegexpInternalError:
|
57
|
+
return RE2::ErrorInternal;
|
58
|
+
case re2::kRegexpBadEscape:
|
59
|
+
return RE2::ErrorBadEscape;
|
60
|
+
case re2::kRegexpBadCharClass:
|
61
|
+
return RE2::ErrorBadCharClass;
|
62
|
+
case re2::kRegexpBadCharRange:
|
63
|
+
return RE2::ErrorBadCharRange;
|
64
|
+
case re2::kRegexpMissingBracket:
|
65
|
+
return RE2::ErrorMissingBracket;
|
66
|
+
case re2::kRegexpMissingParen:
|
67
|
+
return RE2::ErrorMissingParen;
|
68
|
+
case re2::kRegexpTrailingBackslash:
|
69
|
+
return RE2::ErrorTrailingBackslash;
|
70
|
+
case re2::kRegexpRepeatArgument:
|
71
|
+
return RE2::ErrorRepeatArgument;
|
72
|
+
case re2::kRegexpRepeatSize:
|
73
|
+
return RE2::ErrorRepeatSize;
|
74
|
+
case re2::kRegexpRepeatOp:
|
75
|
+
return RE2::ErrorRepeatOp;
|
76
|
+
case re2::kRegexpBadPerlOp:
|
77
|
+
return RE2::ErrorBadPerlOp;
|
78
|
+
case re2::kRegexpBadUTF8:
|
79
|
+
return RE2::ErrorBadUTF8;
|
80
|
+
case re2::kRegexpBadNamedCapture:
|
81
|
+
return RE2::ErrorBadNamedCapture;
|
82
|
+
}
|
83
|
+
return RE2::ErrorInternal;
|
84
|
+
}
|
85
|
+
|
86
|
+
// Returns a copy of pattern that is safe to put in a log message: patterns
// shorter than 100 bytes are returned whole, longer ones are cut to their
// first 100 bytes followed by "...".
static string trunc(const StringPiece& pattern) {
  const bool short_enough = pattern.size() < 100;
  if (!short_enough) {
    string head = pattern.substr(0, 100).as_string();
    return head + "...";
  }
  return pattern.as_string();
}
|
91
|
+
|
92
|
+
|
93
|
+
// Builds a regexp from a C string, using DefaultOptions.
RE2::RE2(const char* pattern) {
  Init(pattern, DefaultOptions);
}

// Builds a regexp from a string, using DefaultOptions.
RE2::RE2(const string& pattern) {
  Init(pattern, DefaultOptions);
}

// Builds a regexp from a StringPiece, using DefaultOptions.
RE2::RE2(const StringPiece& pattern) {
  Init(pattern, DefaultOptions);
}

// Builds a regexp from a StringPiece with caller-supplied options.
RE2::RE2(const StringPiece& pattern, const Options& options) {
  Init(pattern, options);
}
|
108
|
+
|
109
|
+
int RE2::Options::ParseFlags() const {
|
110
|
+
int flags = Regexp::ClassNL;
|
111
|
+
switch (encoding()) {
|
112
|
+
default:
|
113
|
+
LOG(ERROR) << "Unknown encoding " << encoding();
|
114
|
+
break;
|
115
|
+
case RE2::Options::EncodingUTF8:
|
116
|
+
break;
|
117
|
+
case RE2::Options::EncodingLatin1:
|
118
|
+
flags |= Regexp::Latin1;
|
119
|
+
break;
|
120
|
+
}
|
121
|
+
|
122
|
+
if (!posix_syntax())
|
123
|
+
flags |= Regexp::LikePerl;
|
124
|
+
|
125
|
+
if (literal())
|
126
|
+
flags |= Regexp::Literal;
|
127
|
+
|
128
|
+
if (never_nl())
|
129
|
+
flags |= Regexp::NeverNL;
|
130
|
+
|
131
|
+
if (!case_sensitive())
|
132
|
+
flags |= Regexp::FoldCase;
|
133
|
+
|
134
|
+
if (perl_classes())
|
135
|
+
flags |= Regexp::PerlClasses;
|
136
|
+
|
137
|
+
if (word_boundary())
|
138
|
+
flags |= Regexp::PerlB;
|
139
|
+
|
140
|
+
if (one_line())
|
141
|
+
flags |= Regexp::OneLine;
|
142
|
+
|
143
|
+
return flags;
|
144
|
+
}
|
145
|
+
|
146
|
+
void RE2::Init(const StringPiece& pattern, const Options& options) {
|
147
|
+
mutex_ = new Mutex;
|
148
|
+
pattern_ = pattern.as_string();
|
149
|
+
options_.Copy(options);
|
150
|
+
error_ = &empty_string;
|
151
|
+
error_code_ = NoError;
|
152
|
+
suffix_regexp_ = NULL;
|
153
|
+
entire_regexp_ = NULL;
|
154
|
+
prog_ = NULL;
|
155
|
+
rprog_ = NULL;
|
156
|
+
named_groups_ = NULL;
|
157
|
+
group_names_ = NULL;
|
158
|
+
num_captures_ = -1;
|
159
|
+
|
160
|
+
RegexpStatus status;
|
161
|
+
entire_regexp_ = Regexp::Parse(
|
162
|
+
pattern_,
|
163
|
+
static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
|
164
|
+
&status);
|
165
|
+
if (entire_regexp_ == NULL) {
|
166
|
+
if (error_ == &empty_string)
|
167
|
+
error_ = new string(status.Text());
|
168
|
+
if (options_.log_errors()) {
|
169
|
+
LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
|
170
|
+
<< status.Text();
|
171
|
+
}
|
172
|
+
error_arg_ = status.error_arg().as_string();
|
173
|
+
error_code_ = RegexpErrorToRE2(status.code());
|
174
|
+
return;
|
175
|
+
}
|
176
|
+
|
177
|
+
prefix_.clear();
|
178
|
+
prefix_foldcase_ = false;
|
179
|
+
re2::Regexp* suffix;
|
180
|
+
if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
|
181
|
+
suffix_regexp_ = suffix;
|
182
|
+
else
|
183
|
+
suffix_regexp_ = entire_regexp_->Incref();
|
184
|
+
|
185
|
+
// Two thirds of the memory goes to the forward Prog,
|
186
|
+
// one third to the reverse prog, because the forward
|
187
|
+
// Prog has two DFAs but the reverse prog has one.
|
188
|
+
prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
|
189
|
+
if (prog_ == NULL) {
|
190
|
+
if (options_.log_errors())
|
191
|
+
LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
|
192
|
+
error_ = new string("pattern too large - compile failed");
|
193
|
+
error_code_ = RE2::ErrorPatternTooLarge;
|
194
|
+
return;
|
195
|
+
}
|
196
|
+
|
197
|
+
// Could delay this until the first match call that
|
198
|
+
// cares about submatch information, but the one-pass
|
199
|
+
// machine's memory gets cut from the DFA memory budget,
|
200
|
+
// and that is harder to do if the DFA has already
|
201
|
+
// been built.
|
202
|
+
is_one_pass_ = prog_->IsOnePass();
|
203
|
+
}
|
204
|
+
|
205
|
+
// Returns rprog_, computing it if needed.
|
206
|
+
re2::Prog* RE2::ReverseProg() const {
|
207
|
+
MutexLock l(mutex_);
|
208
|
+
if (rprog_ == NULL && error_ == &empty_string) {
|
209
|
+
rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
|
210
|
+
if (rprog_ == NULL) {
|
211
|
+
if (options_.log_errors())
|
212
|
+
LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
|
213
|
+
error_ = new string("pattern too large - reverse compile failed");
|
214
|
+
error_code_ = RE2::ErrorPatternTooLarge;
|
215
|
+
return NULL;
|
216
|
+
}
|
217
|
+
}
|
218
|
+
return rprog_;
|
219
|
+
}
|
220
|
+
|
221
|
+
static const map<string, int> empty_named_groups;
|
222
|
+
static const map<int, string> empty_group_names;
|
223
|
+
|
224
|
+
// Releases everything owned by this RE2.  error_, named_groups_ and
// group_names_ may point at shared file-level statics instead of heap
// objects; those sentinels are recognized and never deleted.
RE2::~RE2() {
  if (suffix_regexp_ != NULL) {
    suffix_regexp_->Decref();
  }
  if (entire_regexp_ != NULL) {
    entire_regexp_->Decref();
  }

  delete mutex_;
  delete prog_;
  delete rprog_;

  const bool owns_error = (error_ != &empty_string);
  if (owns_error) {
    delete error_;
  }

  const bool owns_named_groups =
      (named_groups_ != NULL) && (named_groups_ != &empty_named_groups);
  if (owns_named_groups) {
    delete named_groups_;
  }

  const bool owns_group_names =
      (group_names_ != NULL) && (group_names_ != &empty_group_names);
  if (owns_group_names) {
    delete group_names_;
  }
}
|
239
|
+
|
240
|
+
int RE2::ProgramSize() const {
|
241
|
+
if (prog_ == NULL)
|
242
|
+
return -1;
|
243
|
+
return prog_->size();
|
244
|
+
}
|
245
|
+
|
246
|
+
// Returns named_groups_, computing it if needed.
|
247
|
+
const map<string, int>& RE2::NamedCapturingGroups() const {
|
248
|
+
MutexLock l(mutex_);
|
249
|
+
if (!ok())
|
250
|
+
return empty_named_groups;
|
251
|
+
if (named_groups_ == NULL) {
|
252
|
+
named_groups_ = suffix_regexp_->NamedCaptures();
|
253
|
+
if (named_groups_ == NULL)
|
254
|
+
named_groups_ = &empty_named_groups;
|
255
|
+
}
|
256
|
+
return *named_groups_;
|
257
|
+
}
|
258
|
+
|
259
|
+
// Returns group_names_, computing it if needed.
|
260
|
+
const map<int, string>& RE2::CapturingGroupNames() const {
|
261
|
+
MutexLock l(mutex_);
|
262
|
+
if (!ok())
|
263
|
+
return empty_group_names;
|
264
|
+
if (group_names_ == NULL) {
|
265
|
+
group_names_ = suffix_regexp_->CaptureNames();
|
266
|
+
if (group_names_ == NULL)
|
267
|
+
group_names_ = &empty_group_names;
|
268
|
+
}
|
269
|
+
return *group_names_;
|
270
|
+
}
|
271
|
+
|
272
|
+
/***** Convenience interfaces *****/
|
273
|
+
|
274
|
+
bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
|
275
|
+
const Arg* const args[], int n) {
|
276
|
+
return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
|
277
|
+
}
|
278
|
+
|
279
|
+
bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
|
280
|
+
const Arg* const args[], int n) {
|
281
|
+
return re.DoMatch(text, UNANCHORED, NULL, args, n);
|
282
|
+
}
|
283
|
+
|
284
|
+
// Matches re at the start of *input.  On success the bytes DoMatch reports
// as consumed are removed from the front of *input and true is returned; on
// failure *input is left untouched.
bool RE2::ConsumeN(StringPiece* input, const RE2& re,
                   const Arg* const args[], int n) {
  int consumed;
  const bool matched = re.DoMatch(*input, ANCHOR_START, &consumed, args, n);
  if (matched)
    input->remove_prefix(consumed);
  return matched;
}
|
294
|
+
|
295
|
+
// Searches for re anywhere in *input.  On success the bytes DoMatch reports
// as consumed are removed from the front of *input and true is returned; on
// failure *input is left untouched.
bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
                          const Arg* const args[], int n) {
  int consumed;
  const bool matched = re.DoMatch(*input, UNANCHORED, &consumed, args, n);
  if (matched)
    input->remove_prefix(consumed);
  return matched;
}
|
305
|
+
|
306
|
+
// Returns the maximum submatch needed for the rewrite to be done by Replace().
|
307
|
+
// E.g. if rewrite == "foo \\2,\\1", returns 2.
|
308
|
+
static int MaxSubmatch(const StringPiece& rewrite) {
|
309
|
+
int max = 0;
|
310
|
+
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
311
|
+
s < end; s++) {
|
312
|
+
if (*s == '\\') {
|
313
|
+
s++;
|
314
|
+
int c = (s < end) ? *s : -1;
|
315
|
+
if (isdigit(c)) {
|
316
|
+
int n = (c - '0');
|
317
|
+
if (n > max)
|
318
|
+
max = n;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
}
|
322
|
+
return max;
|
323
|
+
}
|
324
|
+
|
325
|
+
// Replaces the first match of re in *str with the expansion of rewrite.
// Returns false, leaving *str unchanged, if rewrite refers to more groups
// than the fixed-size submatch vector can hold, if there is no match, or if
// Rewrite fails.
bool RE2::Replace(string *str,
                  const RE2& re,
                  const StringPiece& rewrite) {
  StringPiece vec[kVecSize];
  const int nvec = 1 + MaxSubmatch(rewrite);
  if (nvec > arraysize(vec))
    return false;

  const bool matched =
      re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec);
  if (!matched)
    return false;

  string replacement;
  if (!re.Rewrite(&replacement, rewrite, vec, nvec))
    return false;

  // vec[0] is the overall match and must lie inside *str.
  assert(vec[0].begin() >= str->data());
  assert(vec[0].end() <= str->data()+str->size());
  str->replace(vec[0].data() - str->data(), vec[0].size(), replacement);
  return true;
}
|
344
|
+
|
345
|
+
// Replaces every non-overlapping match of re in *str with the expansion of
// rewrite and returns the number of replacements made.  Returns 0, leaving
// *str unchanged, when nothing matches or when rewrite refers to more groups
// than the fixed-size submatch vector can hold.
int RE2::GlobalReplace(string *str,
                       const RE2& re,
                       const StringPiece& rewrite) {
  StringPiece vec[kVecSize];
  const int nvec = 1 + MaxSubmatch(rewrite);
  if (nvec > arraysize(vec))
    return 0;

  const char* cursor = str->data();
  const char* const text_end = cursor + str->size();
  const char* prev_match_end = NULL;
  string result;
  int replacements = 0;

  // cursor may equal text_end so that an empty match at the very end of the
  // text is still considered.
  while (cursor <= text_end) {
    const bool matched = re.Match(*str, cursor - str->data(), str->size(),
                                  UNANCHORED, vec, nvec);
    if (!matched)
      break;

    // Copy the unmatched text between the cursor and the match.
    const char* const match_begin = vec[0].begin();
    if (cursor < match_begin)
      result.append(cursor, match_begin - cursor);

    if (match_begin == prev_match_end && vec[0].size() == 0) {
      // Disallow empty match at end of last match: skip ahead.
      if (cursor < text_end)
        result.append(cursor, 1);
      cursor++;
      continue;
    }

    re.Rewrite(&result, rewrite, vec, nvec);
    cursor = vec[0].end();
    prev_match_end = cursor;
    replacements++;
  }

  if (replacements == 0)
    return 0;

  // Copy whatever follows the last match, then publish the result.
  if (cursor < text_end)
    result.append(cursor, text_end - cursor);
  swap(result, *str);
  return replacements;
}
|
384
|
+
|
385
|
+
bool RE2::Extract(const StringPiece &text,
|
386
|
+
const RE2& re,
|
387
|
+
const StringPiece &rewrite,
|
388
|
+
string *out) {
|
389
|
+
StringPiece vec[kVecSize];
|
390
|
+
int nvec = 1 + MaxSubmatch(rewrite);
|
391
|
+
if (nvec > arraysize(vec))
|
392
|
+
return false;
|
393
|
+
|
394
|
+
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
|
395
|
+
return false;
|
396
|
+
|
397
|
+
out->clear();
|
398
|
+
return re.Rewrite(out, rewrite, vec, nvec);
|
399
|
+
}
|
400
|
+
|
401
|
+
// Returns a copy of unquoted in which every ASCII byte outside
// [A-Za-z_0-9] is preceded by a backslash, so that the result matches the
// input literally when used as a regexp.
//
// Escaping a character that has no special meaning is legal, which is why
// no attempt is made to escape only the metacharacters.  This mirrors
// Perl's quotemeta (see `perldoc -f quotemeta`) except for how the null
// character is handled.
string RE2::QuoteMeta(const StringPiece& unquoted) {
  string result;
  result.reserve(2 * unquoted.size());

  for (int ii = 0; ii < unquoted.length(); ++ii) {
    const char ch = unquoted[ii];

    // Explicit range tests rather than isalnum: using isalnum here raised
    // the benchmark time from 32ns to 58ns.
    const bool is_word_char = (ch >= 'a' && ch <= 'z') ||
                              (ch >= 'A' && ch <= 'Z') ||
                              (ch >= '0' && ch <= '9') ||
                              ch == '_';
    // Bytes with the high bit set are part of a UTF8 or Latin1 character
    // and must be copied without escaping.  Experimentally this is what
    // works correctly with the regexp library.
    const bool has_high_bit = (ch & 128) != 0;

    if (is_word_char || has_high_bit) {
      result += ch;
      continue;
    }

    if (ch == '\0') {
      // Special handling for null chars.  Not strictly required for RE2,
      // but needed by other regexp libraries such as PCRE.  "\\0" cannot be
      // used because the next character might be a digit.
      result += "\\x00";
      continue;
    }

    result += '\\';
    result += ch;
  }

  return result;
}
|
438
|
+
|
439
|
+
// Computes strings *min and *max, built from the required literal prefix
// and from what the compiled program reports, such that any match is
// expected to fall within [*min, *max].  maxlen caps how much of the prefix
// plus program range is used.  Returns false when there is no compiled
// program, or (after setting both outputs to "") when neither the prefix
// nor the program yields any information.
bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
  if (prog_ == NULL)
    return false;

  int n = prefix_.size();
  if (n > maxlen)
    n = maxlen;

  // Seed both bounds with the (possibly truncated) literal prefix.
  string pmax = prefix_.substr(0, n);
  string pmin = prefix_.substr(0, n);
  if (prefix_foldcase_) {
    // prefix is ASCII lowercase; change pmin to uppercase.
    for (int i = 0; i < n; i++) {
      const bool is_lower = ('a' <= pmin[i]) && (pmin[i] <= 'z');
      if (is_lower)
        pmin[i] += 'A' - 'a';
    }
  }

  // Extend the bounds with whatever the program can tell us, using the
  // length budget left over after the prefix.
  string dmin, dmax;
  maxlen -= n;
  const bool have_prog_range =
      maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen);
  if (have_prog_range) {
    pmin += dmin;
    pmax += dmax;
  } else if (!pmax.empty()) {
    // prog_->PossibleMatchRange has failed us,
    // but we still have useful information from prefix_.
    // Round up pmax to allow any possible suffix.
    pmax = PrefixSuccessor(pmax);
  } else {
    // Nothing useful.
    *min = "";
    *max = "";
    return false;
  }

  *min = pmin;
  *max = pmax;
  return true;
}
|
481
|
+
|
482
|
+
// Locale-independent, ASCII-only case-insensitive comparison of the first
// len bytes of a and b; it exists to sidestep the locale behavior of the
// standard strcasecmp.  The string a is known to be all lowercase, so only
// the bytes of b are folded (A-Z to a-z) before comparing.  Returns 0 if
// the ranges are equal, otherwise the difference of the first mismatching
// pair of bytes, taken as unsigned values.
static int ascii_strcasecmp(const char* a, const char* b, int len) {
  for (int i = 0; i < len; i++) {
    const unsigned char lhs = a[i];
    unsigned char rhs = b[i];
    if (rhs >= 'A' && rhs <= 'Z')
      rhs += 'a' - 'A';
    if (lhs != rhs)
      return lhs - rhs;
  }
  return 0;
}
|
497
|
+
|
498
|
+
|
499
|
+
/***** Actual matching and rewriting code *****/
|
500
|
+
|
501
|
+
bool RE2::Match(const StringPiece& text,
|
502
|
+
int startpos,
|
503
|
+
int endpos,
|
504
|
+
Anchor re_anchor,
|
505
|
+
StringPiece* submatch,
|
506
|
+
int nsubmatch) const {
|
507
|
+
if (!ok() || suffix_regexp_ == NULL) {
|
508
|
+
if (options_.log_errors())
|
509
|
+
LOG(ERROR) << "Invalid RE2: " << *error_;
|
510
|
+
return false;
|
511
|
+
}
|
512
|
+
|
513
|
+
if (startpos < 0 || startpos > endpos || endpos > text.size()) {
|
514
|
+
LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
|
515
|
+
return false;
|
516
|
+
}
|
517
|
+
|
518
|
+
StringPiece subtext = text;
|
519
|
+
subtext.remove_prefix(startpos);
|
520
|
+
subtext.remove_suffix(text.size() - endpos);
|
521
|
+
|
522
|
+
// Use DFAs to find exact location of match, filter out non-matches.
|
523
|
+
|
524
|
+
// Don't ask for the location if we won't use it.
|
525
|
+
// SearchDFA can do extra optimizations in that case.
|
526
|
+
StringPiece match;
|
527
|
+
StringPiece* matchp = &match;
|
528
|
+
if (nsubmatch == 0)
|
529
|
+
matchp = NULL;
|
530
|
+
|
531
|
+
int ncap = 1 + NumberOfCapturingGroups();
|
532
|
+
if (ncap > nsubmatch)
|
533
|
+
ncap = nsubmatch;
|
534
|
+
|
535
|
+
// If the regexp is anchored explicitly, must not be in middle of text.
|
536
|
+
if (prog_->anchor_start() && startpos != 0)
|
537
|
+
return false;
|
538
|
+
|
539
|
+
// If the regexp is anchored explicitly, update re_anchor
|
540
|
+
// so that we can potentially fall into a faster case below.
|
541
|
+
if (prog_->anchor_start() && prog_->anchor_end())
|
542
|
+
re_anchor = ANCHOR_BOTH;
|
543
|
+
else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
|
544
|
+
re_anchor = ANCHOR_START;
|
545
|
+
|
546
|
+
// Check for the required prefix, if any.
|
547
|
+
int prefixlen = 0;
|
548
|
+
if (!prefix_.empty()) {
|
549
|
+
if (startpos != 0)
|
550
|
+
return false;
|
551
|
+
prefixlen = prefix_.size();
|
552
|
+
if (prefixlen > subtext.size())
|
553
|
+
return false;
|
554
|
+
if (prefix_foldcase_) {
|
555
|
+
if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
|
556
|
+
return false;
|
557
|
+
} else {
|
558
|
+
if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
|
559
|
+
return false;
|
560
|
+
}
|
561
|
+
subtext.remove_prefix(prefixlen);
|
562
|
+
// If there is a required prefix, the anchor must be at least ANCHOR_START.
|
563
|
+
if (re_anchor != ANCHOR_BOTH)
|
564
|
+
re_anchor = ANCHOR_START;
|
565
|
+
}
|
566
|
+
|
567
|
+
Prog::Anchor anchor = Prog::kUnanchored;
|
568
|
+
Prog::MatchKind kind = Prog::kFirstMatch;
|
569
|
+
if (options_.longest_match())
|
570
|
+
kind = Prog::kLongestMatch;
|
571
|
+
bool skipped_test = false;
|
572
|
+
|
573
|
+
bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
|
574
|
+
|
575
|
+
// SearchBitState allocates a bit vector of size prog_->size() * text.size().
|
576
|
+
// It also allocates a stack of 3-word structures which could potentially
|
577
|
+
// grow as large as prog_->size() * text.size() but in practice is much
|
578
|
+
// smaller.
|
579
|
+
// Conditions for using SearchBitState:
|
580
|
+
const int MaxBitStateProg = 500; // prog_->size() <= Max.
|
581
|
+
const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
|
582
|
+
bool can_bit_state = prog_->size() <= MaxBitStateProg;
|
583
|
+
int bit_state_text_max = MaxBitStateVector / prog_->size();
|
584
|
+
|
585
|
+
bool dfa_failed = false;
|
586
|
+
switch (re_anchor) {
|
587
|
+
default:
|
588
|
+
case UNANCHORED: {
|
589
|
+
if (!prog_->SearchDFA(subtext, text, anchor, kind,
|
590
|
+
matchp, &dfa_failed, NULL)) {
|
591
|
+
if (dfa_failed) {
|
592
|
+
// Fall back to NFA below.
|
593
|
+
skipped_test = true;
|
594
|
+
if (FLAGS_trace_re2)
|
595
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
596
|
+
<< " [" << CEscape(subtext) << "]"
|
597
|
+
<< " DFA failed.";
|
598
|
+
break;
|
599
|
+
}
|
600
|
+
if (FLAGS_trace_re2)
|
601
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
602
|
+
<< " [" << CEscape(subtext) << "]"
|
603
|
+
<< " used DFA - no match.";
|
604
|
+
return false;
|
605
|
+
}
|
606
|
+
if (FLAGS_trace_re2)
|
607
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
608
|
+
<< " [" << CEscape(subtext) << "]"
|
609
|
+
<< " used DFA - match";
|
610
|
+
if (matchp == NULL) // Matched. Don't care where
|
611
|
+
return true;
|
612
|
+
// SearchDFA set match[0].end() but didn't know where the
|
613
|
+
// match started. Run the regexp backward from match[0].end()
|
614
|
+
// to find the longest possible match -- that's where it started.
|
615
|
+
Prog* prog = ReverseProg();
|
616
|
+
if (prog == NULL)
|
617
|
+
return false;
|
618
|
+
if (!prog->SearchDFA(match, text, Prog::kAnchored,
|
619
|
+
Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
|
620
|
+
if (dfa_failed) {
|
621
|
+
// Fall back to NFA below.
|
622
|
+
skipped_test = true;
|
623
|
+
if (FLAGS_trace_re2)
|
624
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
625
|
+
<< " [" << CEscape(subtext) << "]"
|
626
|
+
<< " reverse DFA failed.";
|
627
|
+
break;
|
628
|
+
}
|
629
|
+
if (FLAGS_trace_re2)
|
630
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
631
|
+
<< " [" << CEscape(subtext) << "]"
|
632
|
+
<< " DFA inconsistency.";
|
633
|
+
LOG(ERROR) << "DFA inconsistency";
|
634
|
+
return false;
|
635
|
+
}
|
636
|
+
if (FLAGS_trace_re2)
|
637
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
638
|
+
<< " [" << CEscape(subtext) << "]"
|
639
|
+
<< " used reverse DFA.";
|
640
|
+
break;
|
641
|
+
}
|
642
|
+
|
643
|
+
case ANCHOR_BOTH:
|
644
|
+
case ANCHOR_START:
|
645
|
+
if (re_anchor == ANCHOR_BOTH)
|
646
|
+
kind = Prog::kFullMatch;
|
647
|
+
anchor = Prog::kAnchored;
|
648
|
+
|
649
|
+
// If only a small amount of text and need submatch
|
650
|
+
// information anyway and we're going to use OnePass or BitState
|
651
|
+
// to get it, we might as well not even bother with the DFA:
|
652
|
+
// OnePass or BitState will be fast enough.
|
653
|
+
// On tiny texts, OnePass outruns even the DFA, and
|
654
|
+
// it doesn't have the shared state and occasional mutex that
|
655
|
+
// the DFA does.
|
656
|
+
if (can_one_pass && text.size() <= 4096 &&
|
657
|
+
(ncap > 1 || text.size() <= 8)) {
|
658
|
+
if (FLAGS_trace_re2)
|
659
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
660
|
+
<< " [" << CEscape(subtext) << "]"
|
661
|
+
<< " skipping DFA for OnePass.";
|
662
|
+
skipped_test = true;
|
663
|
+
break;
|
664
|
+
}
|
665
|
+
if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
|
666
|
+
if (FLAGS_trace_re2)
|
667
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
668
|
+
<< " [" << CEscape(subtext) << "]"
|
669
|
+
<< " skipping DFA for BitState.";
|
670
|
+
skipped_test = true;
|
671
|
+
break;
|
672
|
+
}
|
673
|
+
if (!prog_->SearchDFA(subtext, text, anchor, kind,
|
674
|
+
&match, &dfa_failed, NULL)) {
|
675
|
+
if (dfa_failed) {
|
676
|
+
if (FLAGS_trace_re2)
|
677
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
678
|
+
<< " [" << CEscape(subtext) << "]"
|
679
|
+
<< " DFA failed.";
|
680
|
+
skipped_test = true;
|
681
|
+
break;
|
682
|
+
}
|
683
|
+
if (FLAGS_trace_re2)
|
684
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
685
|
+
<< " [" << CEscape(subtext) << "]"
|
686
|
+
<< " used DFA - no match.";
|
687
|
+
return false;
|
688
|
+
}
|
689
|
+
break;
|
690
|
+
}
|
691
|
+
|
692
|
+
if (!skipped_test && ncap <= 1) {
|
693
|
+
// We know exactly where it matches. That's enough.
|
694
|
+
if (ncap == 1)
|
695
|
+
submatch[0] = match;
|
696
|
+
} else {
|
697
|
+
StringPiece subtext1;
|
698
|
+
if (skipped_test) {
|
699
|
+
// DFA ran out of memory or was skipped:
|
700
|
+
// need to search in entire original text.
|
701
|
+
subtext1 = subtext;
|
702
|
+
} else {
|
703
|
+
// DFA found the exact match location:
|
704
|
+
// let NFA run an anchored, full match search
|
705
|
+
// to find submatch locations.
|
706
|
+
subtext1 = match;
|
707
|
+
anchor = Prog::kAnchored;
|
708
|
+
kind = Prog::kFullMatch;
|
709
|
+
}
|
710
|
+
|
711
|
+
if (can_one_pass && anchor != Prog::kUnanchored) {
|
712
|
+
if (FLAGS_trace_re2)
|
713
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
714
|
+
<< " [" << CEscape(subtext) << "]"
|
715
|
+
<< " using OnePass.";
|
716
|
+
if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
|
717
|
+
if (!skipped_test)
|
718
|
+
LOG(ERROR) << "SearchOnePass inconsistency";
|
719
|
+
return false;
|
720
|
+
}
|
721
|
+
} else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
|
722
|
+
if (FLAGS_trace_re2)
|
723
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
724
|
+
<< " [" << CEscape(subtext) << "]"
|
725
|
+
<< " using BitState.";
|
726
|
+
if (!prog_->SearchBitState(subtext1, text, anchor,
|
727
|
+
kind, submatch, ncap)) {
|
728
|
+
if (!skipped_test)
|
729
|
+
LOG(ERROR) << "SearchBitState inconsistency";
|
730
|
+
return false;
|
731
|
+
}
|
732
|
+
} else {
|
733
|
+
if (FLAGS_trace_re2)
|
734
|
+
LOG(INFO) << "Match " << trunc(pattern_)
|
735
|
+
<< " [" << CEscape(subtext) << "]"
|
736
|
+
<< " using NFA.";
|
737
|
+
if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
|
738
|
+
if (!skipped_test)
|
739
|
+
LOG(ERROR) << "SearchNFA inconsistency";
|
740
|
+
return false;
|
741
|
+
}
|
742
|
+
}
|
743
|
+
}
|
744
|
+
|
745
|
+
// Adjust overall match for required prefix that we stripped off.
|
746
|
+
if (prefixlen > 0 && nsubmatch > 0)
|
747
|
+
submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
|
748
|
+
submatch[0].size() + prefixlen);
|
749
|
+
|
750
|
+
// Zero submatches that don't exist in the regexp.
|
751
|
+
for (int i = ncap; i < nsubmatch; i++)
|
752
|
+
submatch[i] = NULL;
|
753
|
+
return true;
|
754
|
+
}
|
755
|
+
|
756
|
+
// Internal matcher - like Match() but takes Args not StringPieces.
|
757
|
+
bool RE2::DoMatch(const StringPiece& text,
|
758
|
+
Anchor anchor,
|
759
|
+
int* consumed,
|
760
|
+
const Arg* const* args,
|
761
|
+
int n) const {
|
762
|
+
if (!ok()) {
|
763
|
+
if (options_.log_errors())
|
764
|
+
LOG(ERROR) << "Invalid RE2: " << *error_;
|
765
|
+
return false;
|
766
|
+
}
|
767
|
+
|
768
|
+
// Count number of capture groups needed.
|
769
|
+
int nvec;
|
770
|
+
if (n == 0 && consumed == NULL)
|
771
|
+
nvec = 0;
|
772
|
+
else
|
773
|
+
nvec = n+1;
|
774
|
+
|
775
|
+
StringPiece* vec;
|
776
|
+
StringPiece stkvec[kVecSize];
|
777
|
+
StringPiece* heapvec = NULL;
|
778
|
+
|
779
|
+
if (nvec <= arraysize(stkvec)) {
|
780
|
+
vec = stkvec;
|
781
|
+
} else {
|
782
|
+
vec = new StringPiece[nvec];
|
783
|
+
heapvec = vec;
|
784
|
+
}
|
785
|
+
|
786
|
+
if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
|
787
|
+
delete[] heapvec;
|
788
|
+
return false;
|
789
|
+
}
|
790
|
+
|
791
|
+
if(consumed != NULL)
|
792
|
+
*consumed = vec[0].end() - text.begin();
|
793
|
+
|
794
|
+
if (n == 0 || args == NULL) {
|
795
|
+
// We are not interested in results
|
796
|
+
delete[] heapvec;
|
797
|
+
return true;
|
798
|
+
}
|
799
|
+
|
800
|
+
int ncap = NumberOfCapturingGroups();
|
801
|
+
if (ncap < n) {
|
802
|
+
// RE has fewer capturing groups than number of arg pointers passed in
|
803
|
+
VLOG(1) << "Asked for " << n << " but only have " << ncap;
|
804
|
+
delete[] heapvec;
|
805
|
+
return false;
|
806
|
+
}
|
807
|
+
|
808
|
+
// If we got here, we must have matched the whole pattern.
|
809
|
+
for (int i = 0; i < n; i++) {
|
810
|
+
const StringPiece& s = vec[i+1];
|
811
|
+
if (!args[i]->Parse(s.data(), s.size())) {
|
812
|
+
// TODO: Should we indicate what the error was?
|
813
|
+
VLOG(1) << "Parse error on #" << i << " " << s << " "
|
814
|
+
<< (void*)s.data() << "/" << s.size();
|
815
|
+
delete[] heapvec;
|
816
|
+
return false;
|
817
|
+
}
|
818
|
+
}
|
819
|
+
|
820
|
+
delete[] heapvec;
|
821
|
+
return true;
|
822
|
+
}
|
823
|
+
|
824
|
+
// Append the "rewrite" string, with backslash subsitutions from "vec",
|
825
|
+
// to string "out".
|
826
|
+
bool RE2::Rewrite(string *out, const StringPiece &rewrite,
|
827
|
+
const StringPiece *vec, int veclen) const {
|
828
|
+
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
829
|
+
s < end; s++) {
|
830
|
+
int c = *s;
|
831
|
+
if (c == '\\') {
|
832
|
+
s++;
|
833
|
+
c = (s < end) ? *s : -1;
|
834
|
+
if (isdigit(c)) {
|
835
|
+
int n = (c - '0');
|
836
|
+
if (n >= veclen) {
|
837
|
+
LOG(ERROR) << "requested group " << n
|
838
|
+
<< " in regexp " << rewrite.data();
|
839
|
+
return false;
|
840
|
+
}
|
841
|
+
StringPiece snip = vec[n];
|
842
|
+
if (snip.size() > 0)
|
843
|
+
out->append(snip.data(), snip.size());
|
844
|
+
} else if (c == '\\') {
|
845
|
+
out->push_back('\\');
|
846
|
+
} else {
|
847
|
+
LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
|
848
|
+
return false;
|
849
|
+
}
|
850
|
+
} else {
|
851
|
+
out->push_back(c);
|
852
|
+
}
|
853
|
+
}
|
854
|
+
return true;
|
855
|
+
}
|
856
|
+
|
857
|
+
// Return the number of capturing subpatterns, or -1 if the
|
858
|
+
// regexp wasn't valid on construction.
|
859
|
+
int RE2::NumberOfCapturingGroups() const {
|
860
|
+
if (suffix_regexp_ == NULL)
|
861
|
+
return -1;
|
862
|
+
ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
|
863
|
+
" multiple threads end up doing the same work in parallel.");
|
864
|
+
if (num_captures_ == -1)
|
865
|
+
num_captures_ = suffix_regexp_->NumCaptures();
|
866
|
+
return num_captures_;
|
867
|
+
}
|
868
|
+
|
869
|
+
// Checks that the rewrite string is well-formed with respect to this
|
870
|
+
// regular expression.
|
871
|
+
bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
|
872
|
+
int max_token = -1;
|
873
|
+
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
874
|
+
s < end; s++) {
|
875
|
+
int c = *s;
|
876
|
+
if (c != '\\') {
|
877
|
+
continue;
|
878
|
+
}
|
879
|
+
if (++s == end) {
|
880
|
+
*error = "Rewrite schema error: '\\' not allowed at end.";
|
881
|
+
return false;
|
882
|
+
}
|
883
|
+
c = *s;
|
884
|
+
if (c == '\\') {
|
885
|
+
continue;
|
886
|
+
}
|
887
|
+
if (!isdigit(c)) {
|
888
|
+
*error = "Rewrite schema error: "
|
889
|
+
"'\\' must be followed by a digit or '\\'.";
|
890
|
+
return false;
|
891
|
+
}
|
892
|
+
int n = (c - '0');
|
893
|
+
if (max_token < n) {
|
894
|
+
max_token = n;
|
895
|
+
}
|
896
|
+
}
|
897
|
+
|
898
|
+
if (max_token > NumberOfCapturingGroups()) {
|
899
|
+
SStringPrintf(error, "Rewrite schema requests %d matches, "
|
900
|
+
"but the regexp only has %d parenthesized subexpressions.",
|
901
|
+
max_token, NumberOfCapturingGroups());
|
902
|
+
return false;
|
903
|
+
}
|
904
|
+
return true;
|
905
|
+
}
|
906
|
+
|
907
|
+
/***** Parsers for various types *****/
|
908
|
+
|
909
|
+
bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
|
910
|
+
// We fail if somebody asked us to store into a non-NULL void* pointer
|
911
|
+
return (dest == NULL);
|
912
|
+
}
|
913
|
+
|
914
|
+
bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
|
915
|
+
if (dest == NULL) return true;
|
916
|
+
reinterpret_cast<string*>(dest)->assign(str, n);
|
917
|
+
return true;
|
918
|
+
}
|
919
|
+
|
920
|
+
bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
|
921
|
+
if (dest == NULL) return true;
|
922
|
+
reinterpret_cast<StringPiece*>(dest)->set(str, n);
|
923
|
+
return true;
|
924
|
+
}
|
925
|
+
|
926
|
+
bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
|
927
|
+
if (n != 1) return false;
|
928
|
+
if (dest == NULL) return true;
|
929
|
+
*(reinterpret_cast<char*>(dest)) = str[0];
|
930
|
+
return true;
|
931
|
+
}
|
932
|
+
|
933
|
+
bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
|
934
|
+
if (n != 1) return false;
|
935
|
+
if (dest == NULL) return true;
|
936
|
+
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
|
937
|
+
return true;
|
938
|
+
}
|
939
|
+
|
940
|
+
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// REQUIRES "buf" must have length at least kMaxNumberLength+1
// Copies "str" into "buf" and null-terminates.
// Overwrites *np with the new length.
//
// Returns "buf" on success.  Returns "" and leaves *np untouched when the
// input is empty, begins with whitespace, or is still longer than
// kMaxNumberLength after runs of leading zeros have been collapsed.
static const char* TerminateNumber(char* buf, const char* str, int* np) {
  int n = *np;
  if (n <= 0) return "";

  // We are less forgiving than the strtoxxx() routines and do not
  // allow leading spaces.  isspace() is only defined for unsigned char
  // values (and EOF), so convert before calling it.
  if (isspace(static_cast<unsigned char>(*str))) {
    return "";
  }

  // Although buf has a fixed maximum size, we can still handle
  // arbitrarily large integers correctly by omitting leading zeros.
  // (Numbers that are still too long will be out of range.)
  // Before deciding whether str is too long,
  // remove leading zeros with s/000+/00/.
  // Leaving the leading two zeros in place means that
  // we don't change 0000x123 (invalid) into 0x123 (valid).
  // Skip over leading - before replacing.
  bool neg = false;
  if (str[0] == '-') {  // n >= 1 is guaranteed by the check above
    neg = true;
    n--;
    str++;
  }

  if (n >= 3 && str[0] == '0' && str[1] == '0') {
    while (n >= 3 && str[2] == '0') {
      n--;
      str++;
    }
  }

  if (neg) {  // make room in buf for -
    n++;
    str--;
  }

  if (n > kMaxNumberLength) return "";

  memmove(buf, str, n);
  if (neg) {
    // After zeros were dropped, str[0] may be a '0' rather than the
    // original sign, so write the sign explicitly.
    buf[0] = '-';
  }
  buf[n] = '\0';
  *np = n;
  return buf;
}
|
992
|
+
|
993
|
+
bool RE2::Arg::parse_long_radix(const char* str,
|
994
|
+
int n,
|
995
|
+
void* dest,
|
996
|
+
int radix) {
|
997
|
+
if (n == 0) return false;
|
998
|
+
char buf[kMaxNumberLength+1];
|
999
|
+
str = TerminateNumber(buf, str, &n);
|
1000
|
+
char* end;
|
1001
|
+
errno = 0;
|
1002
|
+
long r = strtol(str, &end, radix);
|
1003
|
+
if (end != str + n) return false; // Leftover junk
|
1004
|
+
if (errno) return false;
|
1005
|
+
if (dest == NULL) return true;
|
1006
|
+
*(reinterpret_cast<long*>(dest)) = r;
|
1007
|
+
return true;
|
1008
|
+
}
|
1009
|
+
|
1010
|
+
bool RE2::Arg::parse_ulong_radix(const char* str,
|
1011
|
+
int n,
|
1012
|
+
void* dest,
|
1013
|
+
int radix) {
|
1014
|
+
if (n == 0) return false;
|
1015
|
+
char buf[kMaxNumberLength+1];
|
1016
|
+
str = TerminateNumber(buf, str, &n);
|
1017
|
+
if (str[0] == '-') {
|
1018
|
+
// strtoul() will silently accept negative numbers and parse
|
1019
|
+
// them. This module is more strict and treats them as errors.
|
1020
|
+
return false;
|
1021
|
+
}
|
1022
|
+
|
1023
|
+
char* end;
|
1024
|
+
errno = 0;
|
1025
|
+
unsigned long r = strtoul(str, &end, radix);
|
1026
|
+
if (end != str + n) return false; // Leftover junk
|
1027
|
+
if (errno) return false;
|
1028
|
+
if (dest == NULL) return true;
|
1029
|
+
*(reinterpret_cast<unsigned long*>(dest)) = r;
|
1030
|
+
return true;
|
1031
|
+
}
|
1032
|
+
|
1033
|
+
bool RE2::Arg::parse_short_radix(const char* str,
|
1034
|
+
int n,
|
1035
|
+
void* dest,
|
1036
|
+
int radix) {
|
1037
|
+
long r;
|
1038
|
+
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
1039
|
+
if ((short)r != r) return false; // Out of range
|
1040
|
+
if (dest == NULL) return true;
|
1041
|
+
*(reinterpret_cast<short*>(dest)) = r;
|
1042
|
+
return true;
|
1043
|
+
}
|
1044
|
+
|
1045
|
+
bool RE2::Arg::parse_ushort_radix(const char* str,
|
1046
|
+
int n,
|
1047
|
+
void* dest,
|
1048
|
+
int radix) {
|
1049
|
+
unsigned long r;
|
1050
|
+
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
1051
|
+
if ((ushort)r != r) return false; // Out of range
|
1052
|
+
if (dest == NULL) return true;
|
1053
|
+
*(reinterpret_cast<unsigned short*>(dest)) = r;
|
1054
|
+
return true;
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
bool RE2::Arg::parse_int_radix(const char* str,
|
1058
|
+
int n,
|
1059
|
+
void* dest,
|
1060
|
+
int radix) {
|
1061
|
+
long r;
|
1062
|
+
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
1063
|
+
if ((int)r != r) return false; // Out of range
|
1064
|
+
if (dest == NULL) return true;
|
1065
|
+
*(reinterpret_cast<int*>(dest)) = r;
|
1066
|
+
return true;
|
1067
|
+
}
|
1068
|
+
|
1069
|
+
bool RE2::Arg::parse_uint_radix(const char* str,
|
1070
|
+
int n,
|
1071
|
+
void* dest,
|
1072
|
+
int radix) {
|
1073
|
+
unsigned long r;
|
1074
|
+
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
1075
|
+
if ((uint)r != r) return false; // Out of range
|
1076
|
+
if (dest == NULL) return true;
|
1077
|
+
*(reinterpret_cast<unsigned int*>(dest)) = r;
|
1078
|
+
return true;
|
1079
|
+
}
|
1080
|
+
|
1081
|
+
bool RE2::Arg::parse_longlong_radix(const char* str,
|
1082
|
+
int n,
|
1083
|
+
void* dest,
|
1084
|
+
int radix) {
|
1085
|
+
if (n == 0) return false;
|
1086
|
+
char buf[kMaxNumberLength+1];
|
1087
|
+
str = TerminateNumber(buf, str, &n);
|
1088
|
+
char* end;
|
1089
|
+
errno = 0;
|
1090
|
+
int64 r = strtoll(str, &end, radix);
|
1091
|
+
if (end != str + n) return false; // Leftover junk
|
1092
|
+
if (errno) return false;
|
1093
|
+
if (dest == NULL) return true;
|
1094
|
+
*(reinterpret_cast<int64*>(dest)) = r;
|
1095
|
+
return true;
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
bool RE2::Arg::parse_ulonglong_radix(const char* str,
|
1099
|
+
int n,
|
1100
|
+
void* dest,
|
1101
|
+
int radix) {
|
1102
|
+
if (n == 0) return false;
|
1103
|
+
char buf[kMaxNumberLength+1];
|
1104
|
+
str = TerminateNumber(buf, str, &n);
|
1105
|
+
if (str[0] == '-') {
|
1106
|
+
// strtoull() will silently accept negative numbers and parse
|
1107
|
+
// them. This module is more strict and treats them as errors.
|
1108
|
+
return false;
|
1109
|
+
}
|
1110
|
+
char* end;
|
1111
|
+
errno = 0;
|
1112
|
+
uint64 r = strtoull(str, &end, radix);
|
1113
|
+
if (end != str + n) return false; // Leftover junk
|
1114
|
+
if (errno) return false;
|
1115
|
+
if (dest == NULL) return true;
|
1116
|
+
*(reinterpret_cast<uint64*>(dest)) = r;
|
1117
|
+
return true;
|
1118
|
+
}
|
1119
|
+
|
1120
|
+
// Shared implementation of parse_double() and parse_float().
// Copies the n bytes at "str" into a NUL-terminated scratch buffer,
// converts them with strtof() (isfloat) or strtod() (otherwise), and
// stores the value through "dest" as float or double respectively unless
// "dest" is NULL.  Returns false when n is not in [1, kMaxLength), when
// the conversion leaves trailing junk, or when it sets errno.
static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
  static const int kMaxLength = 200;
  // A negative n would otherwise slip past the upper-bound check and be
  // handed to memcpy() as a huge unsigned count.
  if (n <= 0) return false;
  if (n >= kMaxLength) return false;
  char buf[kMaxLength];
  memcpy(buf, str, n);
  buf[n] = '\0';
  errno = 0;
  char* end;
  double r;
  if (isfloat) {
    r = strtof(buf, &end);
  } else {
    r = strtod(buf, &end);
  }
  if (end != buf + n) return false;   // Leftover junk
  if (errno) return false;
  if (dest == NULL) return true;
  if (isfloat) {
    *(reinterpret_cast<float*>(dest)) = r;
  } else {
    *(reinterpret_cast<double*>(dest)) = r;
  }
  return true;
}
|
1145
|
+
|
1146
|
+
bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
|
1147
|
+
return parse_double_float(str, n, false, dest);
|
1148
|
+
}
|
1149
|
+
|
1150
|
+
bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
|
1151
|
+
return parse_double_float(str, n, true, dest);
|
1152
|
+
}
|
1153
|
+
|
1154
|
+
|
1155
|
+
#define DEFINE_INTEGER_PARSERS(name) \
|
1156
|
+
bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
|
1157
|
+
return parse_##name##_radix(str, n, dest, 10); \
|
1158
|
+
} \
|
1159
|
+
bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
|
1160
|
+
return parse_##name##_radix(str, n, dest, 16); \
|
1161
|
+
} \
|
1162
|
+
bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
|
1163
|
+
return parse_##name##_radix(str, n, dest, 8); \
|
1164
|
+
} \
|
1165
|
+
bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
|
1166
|
+
return parse_##name##_radix(str, n, dest, 0); \
|
1167
|
+
}
|
1168
|
+
|
1169
|
+
DEFINE_INTEGER_PARSERS(short);
|
1170
|
+
DEFINE_INTEGER_PARSERS(ushort);
|
1171
|
+
DEFINE_INTEGER_PARSERS(int);
|
1172
|
+
DEFINE_INTEGER_PARSERS(uint);
|
1173
|
+
DEFINE_INTEGER_PARSERS(long);
|
1174
|
+
DEFINE_INTEGER_PARSERS(ulong);
|
1175
|
+
DEFINE_INTEGER_PARSERS(longlong);
|
1176
|
+
DEFINE_INTEGER_PARSERS(ulonglong);
|
1177
|
+
|
1178
|
+
#undef DEFINE_INTEGER_PARSERS
|
1179
|
+
|
1180
|
+
} // namespace re2
|