chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/simplify.cc
ADDED
@@ -0,0 +1,393 @@
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Rewrite POSIX and other features in re
|
6
|
+
// to use simple extended regular expression features.
|
7
|
+
// Also sort and simplify character classes.
|
8
|
+
|
9
|
+
#include "util/util.h"
|
10
|
+
#include "re2/regexp.h"
|
11
|
+
#include "re2/walker-inl.h"
|
12
|
+
|
13
|
+
namespace re2 {
|
14
|
+
|
15
|
+
// Parses the regexp src and then simplifies it and sets *dst to the
|
16
|
+
// string representation of the simplified form. Returns true on success.
|
17
|
+
// Returns false and sets *error (if error != NULL) on error.
|
18
|
+
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
19
|
+
string* dst,
|
20
|
+
RegexpStatus* status) {
|
21
|
+
Regexp* re = Parse(src, flags, status);
|
22
|
+
if (re == NULL)
|
23
|
+
return false;
|
24
|
+
Regexp* sre = re->Simplify();
|
25
|
+
re->Decref();
|
26
|
+
if (sre == NULL) {
|
27
|
+
// Should not happen, since Simplify never fails.
|
28
|
+
LOG(ERROR) << "Simplify failed on " << src;
|
29
|
+
if (status) {
|
30
|
+
status->set_code(kRegexpInternalError);
|
31
|
+
status->set_error_arg(src);
|
32
|
+
}
|
33
|
+
return false;
|
34
|
+
}
|
35
|
+
*dst = sre->ToString();
|
36
|
+
sre->Decref();
|
37
|
+
return true;
|
38
|
+
}
|
39
|
+
|
40
|
+
// Assuming the simple_ flags on the children are accurate,
|
41
|
+
// is this Regexp* simple?
|
42
|
+
bool Regexp::ComputeSimple() {
|
43
|
+
Regexp** subs;
|
44
|
+
switch (op_) {
|
45
|
+
case kRegexpNoMatch:
|
46
|
+
case kRegexpEmptyMatch:
|
47
|
+
case kRegexpLiteral:
|
48
|
+
case kRegexpLiteralString:
|
49
|
+
case kRegexpBeginLine:
|
50
|
+
case kRegexpEndLine:
|
51
|
+
case kRegexpBeginText:
|
52
|
+
case kRegexpWordBoundary:
|
53
|
+
case kRegexpNoWordBoundary:
|
54
|
+
case kRegexpEndText:
|
55
|
+
case kRegexpAnyChar:
|
56
|
+
case kRegexpAnyByte:
|
57
|
+
case kRegexpHaveMatch:
|
58
|
+
return true;
|
59
|
+
case kRegexpConcat:
|
60
|
+
case kRegexpAlternate:
|
61
|
+
// These are simple as long as the subpieces are simple.
|
62
|
+
subs = sub();
|
63
|
+
for (int i = 0; i < nsub_; i++)
|
64
|
+
if (!subs[i]->simple_)
|
65
|
+
return false;
|
66
|
+
return true;
|
67
|
+
case kRegexpCharClass:
|
68
|
+
// Simple as long as the char class is not empty, not full.
|
69
|
+
if (ccb_ != NULL)
|
70
|
+
return !ccb_->empty() && !ccb_->full();
|
71
|
+
return !cc_->empty() && !cc_->full();
|
72
|
+
case kRegexpCapture:
|
73
|
+
subs = sub();
|
74
|
+
return subs[0]->simple_;
|
75
|
+
case kRegexpStar:
|
76
|
+
case kRegexpPlus:
|
77
|
+
case kRegexpQuest:
|
78
|
+
subs = sub();
|
79
|
+
if (!subs[0]->simple_)
|
80
|
+
return false;
|
81
|
+
switch (subs[0]->op_) {
|
82
|
+
case kRegexpStar:
|
83
|
+
case kRegexpPlus:
|
84
|
+
case kRegexpQuest:
|
85
|
+
case kRegexpEmptyMatch:
|
86
|
+
case kRegexpNoMatch:
|
87
|
+
return false;
|
88
|
+
default:
|
89
|
+
break;
|
90
|
+
}
|
91
|
+
return true;
|
92
|
+
case kRegexpRepeat:
|
93
|
+
return false;
|
94
|
+
}
|
95
|
+
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
96
|
+
return false;
|
97
|
+
}
|
98
|
+
|
99
|
+
// Walker subclass used by Simplify.
|
100
|
+
// The simplify walk is purely post-recursive: given the simplified children,
|
101
|
+
// PostVisit creates the simplified result.
|
102
|
+
// The child_args are simplified Regexp*s.
|
103
|
+
class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
104
|
+
public:
|
105
|
+
SimplifyWalker() {}
|
106
|
+
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
107
|
+
virtual Regexp* PostVisit(Regexp* re,
|
108
|
+
Regexp* parent_arg,
|
109
|
+
Regexp* pre_arg,
|
110
|
+
Regexp** child_args, int nchild_args);
|
111
|
+
virtual Regexp* Copy(Regexp* re);
|
112
|
+
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
113
|
+
|
114
|
+
private:
|
115
|
+
// These functions are declared inside SimplifyWalker so that
|
116
|
+
// they can edit the private fields of the Regexps they construct.
|
117
|
+
|
118
|
+
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
119
|
+
// Caller must Decref return value when done with it.
|
120
|
+
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
|
121
|
+
|
122
|
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
123
|
+
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
124
|
+
// Caller must Decref return value when done with it.
|
125
|
+
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
|
126
|
+
Regexp::ParseFlags parse_flags);
|
127
|
+
|
128
|
+
// Simplifies a character class by expanding any named classes
|
129
|
+
// into rune ranges. Does not edit re. Does not consume ref to re.
|
130
|
+
// Caller must Decref return value when done with it.
|
131
|
+
static Regexp* SimplifyCharClass(Regexp* re);
|
132
|
+
|
133
|
+
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
|
134
|
+
};
|
135
|
+
|
136
|
+
// Simplifies a regular expression, returning a new regexp.
|
137
|
+
// The new regexp uses traditional Unix egrep features only,
|
138
|
+
// plus the Perl (?:) non-capturing parentheses.
|
139
|
+
// Otherwise, no POSIX or Perl additions. The new regexp
|
140
|
+
// captures exactly the same subexpressions (with the same indices)
|
141
|
+
// as the original.
|
142
|
+
// Does not edit current object.
|
143
|
+
// Caller must Decref() return value when done with it.
|
144
|
+
|
145
|
+
Regexp* Regexp::Simplify() {
|
146
|
+
if (simple_)
|
147
|
+
return Incref();
|
148
|
+
SimplifyWalker w;
|
149
|
+
return w.Walk(this, NULL);
|
150
|
+
}
|
151
|
+
|
152
|
+
#define Simplify DontCallSimplify // Avoid accidental recursion
|
153
|
+
|
154
|
+
// Walk results are reference-counted nodes, so "copying" one simply means
// taking an additional reference to it.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* another_ref = re->Incref();
  return another_ref;
}
|
157
|
+
|
158
|
+
// Fallback visit.  Simplification uses Walk rather than WalkExponential,
// so reaching this function is unexpected: it is logged at DFATAL and the
// node is returned unchanged (with a new reference).
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* unchanged = re->Incref();
  return unchanged;
}
|
164
|
+
|
165
|
+
// Called before re's children are visited.  A node already marked simple
// needs no work: set *stop so the walk does not descend, and return a new
// reference to it.  Otherwise return NULL and let the walk continue into
// the children.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple_)
    return NULL;

  *stop = true;
  return re->Incref();
}
|
172
|
+
|
173
|
+
// Called after re's children have been simplified; child_args[i] is the
// simplified form of re's i-th child and this function owns one reference
// to each of them.  Returns the simplified form of re (a reference the
// caller owns).  Whenever the children are unchanged, the child references
// are released and re itself is marked simple and returned; otherwise a
// fresh node is built that adopts the child references.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      // Leaf operators are always simple.
      re->simple_ = true;
      return re->Incref();

    case kRegexpConcat:
    case kRegexpAlternate: {
      // Simple as long as every piece is.  First scan for any child that
      // was rewritten, so the common unchanged case allocates nothing.
      const int count = re->nsub_;
      Regexp** old_subs = re->sub();
      int same = 0;
      while (same < count && child_args[same] == old_subs[same])
        same++;

      if (same == count) {
        // Nothing changed: drop the extra child references and reuse re.
        for (int k = 0; k < count; k++)
          child_args[k]->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // At least one child changed: build a new node that adopts the
      // simplified children.
      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(count);
      Regexp** new_subs = rebuilt->sub();
      for (int k = 0; k < count; k++)
        new_subs[k] = child_args[k];
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpCapture: {
      Regexp* child = child_args[0];
      if (child != re->sub()[0]) {
        // The body was rewritten: wrap it in a new capture that keeps the
        // original capture index.
        Regexp* wrapped = new Regexp(kRegexpCapture, re->parse_flags());
        wrapped->AllocSub(1);
        wrapped->sub()[0] = child;
        wrapped->cap_ = re->cap_;
        wrapped->simple_ = true;
        return wrapped;
      }
      child->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* child = child_args[0];

      // Special case: repeat the empty string as much as
      // you want, but it's still the empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // Unchanged operand: the node is simple as it stands.
      if (child == re->sub()[0]) {
        child->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // Applying the same operator twice with identical flags is
      // idempotent, so the simplified operand already is the answer.
      const bool repeated_operator =
          re->op() == child->op() &&
          re->parse_flags() == child->parse_flags();
      if (repeated_operator)
        return child;

      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(1);
      rebuilt->sub()[0] = child;
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpRepeat: {
      Regexp* child = child_args[0];

      // Special case: repeat the empty string as much as
      // you want, but it's still the empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // SimplifyRepeat does not consume child, so release our reference
      // once the expansion has been built.
      Regexp* expanded = SimplifyRepeat(child, re->min_, re->max_,
                                        re->parse_flags());
      child->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* simplified = SimplifyCharClass(re);
      simplified->simple_ = true;
      return simplified;
    }
  }

  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
294
|
+
|
295
|
+
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
296
|
+
// Returns a new Regexp, handing the ref to the caller.
|
297
|
+
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
|
298
|
+
Regexp::ParseFlags parse_flags) {
|
299
|
+
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
|
300
|
+
re->AllocSub(2);
|
301
|
+
Regexp** subs = re->sub();
|
302
|
+
subs[0] = re1;
|
303
|
+
subs[1] = re2;
|
304
|
+
return re;
|
305
|
+
}
|
306
|
+
|
307
|
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
308
|
+
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
309
|
+
// Caller must Decref return value when done with it.
|
310
|
+
// The result will *not* necessarily have the right capturing parens
|
311
|
+
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
312
|
+
// but in the Regexp* representation, both (x) are marked as $1.
|
313
|
+
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
314
|
+
Regexp::ParseFlags f) {
|
315
|
+
// x{n,} means at least n matches of x.
|
316
|
+
if (max == -1) {
|
317
|
+
// Special case: x{0,} is x*
|
318
|
+
if (min == 0)
|
319
|
+
return Regexp::Star(re->Incref(), f);
|
320
|
+
|
321
|
+
// Special case: x{1,} is x+
|
322
|
+
if (min == 1)
|
323
|
+
return Regexp::Plus(re->Incref(), f);
|
324
|
+
|
325
|
+
// General case: x{4,} is xxxx+
|
326
|
+
Regexp* nre = new Regexp(kRegexpConcat, f);
|
327
|
+
nre->AllocSub(min);
|
328
|
+
VLOG(1) << "Simplify " << min;
|
329
|
+
Regexp** nre_subs = nre->sub();
|
330
|
+
for (int i = 0; i < min-1; i++)
|
331
|
+
nre_subs[i] = re->Incref();
|
332
|
+
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
333
|
+
return nre;
|
334
|
+
}
|
335
|
+
|
336
|
+
// Special case: (x){0} matches only empty string.
|
337
|
+
if (min == 0 && max == 0)
|
338
|
+
return new Regexp(kRegexpEmptyMatch, f);
|
339
|
+
|
340
|
+
// Special case: x{1} is just x.
|
341
|
+
if (min == 1 && max == 1)
|
342
|
+
return re->Incref();
|
343
|
+
|
344
|
+
// General case: x{n,m} means n copies of x and m copies of x?.
|
345
|
+
// The machine will do less work if we nest the final m copies,
|
346
|
+
// so that x{2,5} = xx(x(x(x)?)?)?
|
347
|
+
|
348
|
+
// Build leading prefix: xx. Capturing only on the last one.
|
349
|
+
Regexp* nre = NULL;
|
350
|
+
if (min > 0) {
|
351
|
+
nre = new Regexp(kRegexpConcat, f);
|
352
|
+
nre->AllocSub(min);
|
353
|
+
Regexp** nre_subs = nre->sub();
|
354
|
+
for (int i = 0; i < min; i++)
|
355
|
+
nre_subs[i] = re->Incref();
|
356
|
+
}
|
357
|
+
|
358
|
+
// Build and attach suffix: (x(x(x)?)?)?
|
359
|
+
if (max > min) {
|
360
|
+
Regexp* suf = Regexp::Quest(re->Incref(), f);
|
361
|
+
for (int i = min+1; i < max; i++)
|
362
|
+
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
|
363
|
+
if (nre == NULL)
|
364
|
+
nre = suf;
|
365
|
+
else
|
366
|
+
nre = Concat2(nre, suf, f);
|
367
|
+
}
|
368
|
+
|
369
|
+
if (nre == NULL) {
|
370
|
+
// Some degenerate case, like min > max, or min < max < 0.
|
371
|
+
// This shouldn't happen, because the parser rejects such regexps.
|
372
|
+
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
|
373
|
+
return new Regexp(kRegexpNoMatch, f);
|
374
|
+
}
|
375
|
+
|
376
|
+
return nre;
|
377
|
+
}
|
378
|
+
|
379
|
+
// Simplifies a character class.
|
380
|
+
// Caller must Decref return value when done with it.
|
381
|
+
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
|
382
|
+
CharClass* cc = re->cc();
|
383
|
+
|
384
|
+
// Special cases
|
385
|
+
if (cc->empty())
|
386
|
+
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
387
|
+
if (cc->full())
|
388
|
+
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
389
|
+
|
390
|
+
return re->Incref();
|
391
|
+
}
|
392
|
+
|
393
|
+
} // namespace re2
|
@@ -0,0 +1,87 @@
|
|
1
|
+
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "re2/stringpiece.h"
|
6
|
+
#include "util/util.h"
|
7
|
+
|
8
|
+
using re2::StringPiece;
|
9
|
+
|
10
|
+
// Streams the bytes viewed by piece.  write() is used instead of the
// const char* inserter so exactly size() bytes are emitted, regardless of
// any NUL bytes in the data.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  const char* bytes = piece.data();
  const int count = piece.size();
  o.write(bytes, count);
  return o;
}
|
14
|
+
|
15
|
+
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
|
16
|
+
int len = x.size();
|
17
|
+
if (len != y.size()) {
|
18
|
+
return false;
|
19
|
+
}
|
20
|
+
const char* p = x.data();
|
21
|
+
const char* p2 = y.data();
|
22
|
+
// Test last byte in case strings share large common prefix
|
23
|
+
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
|
24
|
+
const char* p_limit = p + len;
|
25
|
+
for (; p < p_limit; p++, p2++) {
|
26
|
+
if (*p != *p2)
|
27
|
+
return false;
|
28
|
+
}
|
29
|
+
return true;
|
30
|
+
}
|
31
|
+
|
32
|
+
// Replaces the contents of *target with a copy of the viewed bytes.
void StringPiece::CopyToString(string* target) const {
  string& out = *target;
  out.assign(ptr_, length_);
}
|
35
|
+
|
36
|
+
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
|
37
|
+
int ret = min(length_ - pos, n);
|
38
|
+
memcpy(buf, ptr_ + pos, ret);
|
39
|
+
return ret;
|
40
|
+
}
|
41
|
+
|
42
|
+
int StringPiece::find(const StringPiece& s, size_type pos) const {
|
43
|
+
if (length_ < 0 || pos > static_cast<size_type>(length_))
|
44
|
+
return npos;
|
45
|
+
|
46
|
+
const char* result = std::search(ptr_ + pos, ptr_ + length_,
|
47
|
+
s.ptr_, s.ptr_ + s.length_);
|
48
|
+
const size_type xpos = result - ptr_;
|
49
|
+
return xpos + s.length_ <= length_ ? xpos : npos;
|
50
|
+
}
|
51
|
+
|
52
|
+
int StringPiece::find(char c, size_type pos) const {
|
53
|
+
if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
|
54
|
+
return npos;
|
55
|
+
}
|
56
|
+
const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
|
57
|
+
return result != ptr_ + length_ ? result - ptr_ : npos;
|
58
|
+
}
|
59
|
+
|
60
|
+
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
|
61
|
+
if (length_ < s.length_) return npos;
|
62
|
+
const size_t ulen = length_;
|
63
|
+
if (s.length_ == 0) return min(ulen, pos);
|
64
|
+
|
65
|
+
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
|
66
|
+
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
|
67
|
+
return result != last ? result - ptr_ : npos;
|
68
|
+
}
|
69
|
+
|
70
|
+
int StringPiece::rfind(char c, size_type pos) const {
|
71
|
+
if (length_ <= 0) return npos;
|
72
|
+
for (int i = min(pos, static_cast<size_type>(length_ - 1));
|
73
|
+
i >= 0; --i) {
|
74
|
+
if (ptr_[i] == c) {
|
75
|
+
return i;
|
76
|
+
}
|
77
|
+
}
|
78
|
+
return npos;
|
79
|
+
}
|
80
|
+
|
81
|
+
// Returns the sub-piece starting at pos and spanning at most n bytes.
// Both values are clamped: pos to the length of the piece, n to the number
// of bytes remaining after pos.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type total = length_;
  if (pos > total)
    pos = total;

  const size_type remaining = total - pos;
  if (n > remaining)
    n = remaining;

  return StringPiece(ptr_ + pos, n);
}
|
86
|
+
|
87
|
+
const StringPiece::size_type StringPiece::npos = size_type(-1);
|
@@ -0,0 +1,182 @@
|
|
1
|
+
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// A string-like object that points to a sized piece of memory.
|
6
|
+
//
|
7
|
+
// Functions or methods may use const StringPiece& parameters to accept either
|
8
|
+
// a "const char*" or a "string" value that will be implicitly converted to
|
9
|
+
// a StringPiece. The implicit conversion means that it is often appropriate
|
10
|
+
// to include this .h file in other files rather than forward-declaring
|
11
|
+
// StringPiece as would be appropriate for most other Google classes.
|
12
|
+
//
|
13
|
+
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
14
|
+
// conversions from "const char*" to "string" and back again.
|
15
|
+
//
|
16
|
+
//
|
17
|
+
// Arghh! I wish C++ literals were "string".
|
18
|
+
|
19
|
+
#ifndef STRINGS_STRINGPIECE_H__
|
20
|
+
#define STRINGS_STRINGPIECE_H__
|
21
|
+
|
22
|
+
#include <string.h>
#include <algorithm>
#include <cstddef>
#include <iosfwd>
#include <iterator>
#include <string>
|
26
|
+
|
27
|
+
namespace re2 {
|
28
|
+
|
29
|
+
// A non-owning view of a run of bytes: a pointer plus a length.
// The viewed memory is not copied; a StringPiece only stores the pointer
// it was given.  The data may contain embedded NULs and need not be NUL
// terminated.
class StringPiece {
 private:
  const char* ptr_;  // First viewed byte; NULL for a default/cleared piece.
  int length_;       // Number of viewed bytes.

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  // A NULL str yields an empty piece.
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets the piece to the empty, NULL-pointer state.
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points the piece at len bytes starting at data.
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  // Re-points the piece at the NUL-terminated string str (NULL => empty).
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  // Same as set(const char*, int) for untyped memory.
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked element access.
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes from the view.  No bounds checking is done.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view.  No bounds checking is done.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way bytewise comparison: negative, zero or positive when this
  // piece orders before, equal to, or after x.  When one piece is a prefix
  // of the other, the shorter one orders first.
  int compare(const StringPiece& x) const {
    const int common = std::min(length_, x.length_);
    // memcmp must not be handed a NULL pointer, even with a zero count, so
    // skip the call when there are no common bytes (e.g. an empty piece).
    int r = (common > 0) ? memcmp(ptr_, x.ptr_, common) : 0;
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"?  An empty x is a prefix of everything; it
  // is handled without calling memcmp because its pointer may be NULL.
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (x.length_ == 0 ||
             memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"?  An empty x is a suffix of everything; it is
  // handled without calling memcmp because its pointer may be NULL.
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (x.length_ == 0 ||
             memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // The STL says these return size_type, but Google style says return int.
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  // Copies bytes out of the piece into buf; defined out of line.
  int copy(char* buf, size_type n, size_type pos = 0) const;

  // Forward and reverse searches; defined out of line.
  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  // Sub-view of this piece; defined out of line.
  StringPiece substr(size_type pos, size_type n = npos) const;

  // Bytewise equality helper used by operator==.
  static bool _equal(const StringPiece&, const StringPiece&);
};
|
150
|
+
|
151
|
+
// Two pieces are equal when StringPiece::_equal says so.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  const bool same = StringPiece::_equal(x, y);
  return same;
}
|
154
|
+
|
155
|
+
// Inequality is defined as the negation of operator==.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  if (x == y)
    return false;
  return true;
}
|
158
|
+
|
159
|
+
// Bytewise lexicographic ordering: x < y when x's bytes compare lower
// over the common length, or when they tie there and x is the shorter
// piece.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int common = std::min(x.size(), y.size());
  // memcmp must not be handed a NULL pointer, even with a zero count, and
  // an empty piece may carry a NULL data(); skip the call in that case.
  const int r = (common > 0) ? memcmp(x.data(), y.data(), common) : 0;
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}
|
164
|
+
|
165
|
+
// x > y is answered by asking operator< with the operands swapped.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  const bool y_before_x = (y < x);
  return y_before_x;
}
|
168
|
+
|
169
|
+
// x <= y holds exactly when x > y does not.
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  if (x > y)
    return false;
  return true;
}
|
172
|
+
|
173
|
+
// x >= y holds exactly when x < y does not.
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  if (x < y)
    return false;
  return true;
}
|
176
|
+
|
177
|
+
} // namespace re2
|
178
|
+
|
179
|
+
// allow StringPiece to be logged
|
180
|
+
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
181
|
+
|
182
|
+
#endif // STRINGS_STRINGPIECE_H__
|