chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/simplify.cc
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Rewrite POSIX and other features in re
|
|
6
|
+
// to use simple extended regular expression features.
|
|
7
|
+
// Also sort and simplify character classes.
|
|
8
|
+
|
|
9
|
+
#include "util/util.h"
|
|
10
|
+
#include "re2/regexp.h"
|
|
11
|
+
#include "re2/walker-inl.h"
|
|
12
|
+
|
|
13
|
+
namespace re2 {
|
|
14
|
+
|
|
15
|
+
// Parses the regexp src and then simplifies it and sets *dst to the
|
|
16
|
+
// string representation of the simplified form. Returns true on success.
|
|
17
|
+
// Returns false and sets *error (if error != NULL) on error.
|
|
18
|
+
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
|
19
|
+
string* dst,
|
|
20
|
+
RegexpStatus* status) {
|
|
21
|
+
Regexp* re = Parse(src, flags, status);
|
|
22
|
+
if (re == NULL)
|
|
23
|
+
return false;
|
|
24
|
+
Regexp* sre = re->Simplify();
|
|
25
|
+
re->Decref();
|
|
26
|
+
if (sre == NULL) {
|
|
27
|
+
// Should not happen, since Simplify never fails.
|
|
28
|
+
LOG(ERROR) << "Simplify failed on " << src;
|
|
29
|
+
if (status) {
|
|
30
|
+
status->set_code(kRegexpInternalError);
|
|
31
|
+
status->set_error_arg(src);
|
|
32
|
+
}
|
|
33
|
+
return false;
|
|
34
|
+
}
|
|
35
|
+
*dst = sre->ToString();
|
|
36
|
+
sre->Decref();
|
|
37
|
+
return true;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Assuming the simple_ flags on the children are accurate,
|
|
41
|
+
// is this Regexp* simple?
|
|
42
|
+
bool Regexp::ComputeSimple() {
|
|
43
|
+
Regexp** subs;
|
|
44
|
+
switch (op_) {
|
|
45
|
+
case kRegexpNoMatch:
|
|
46
|
+
case kRegexpEmptyMatch:
|
|
47
|
+
case kRegexpLiteral:
|
|
48
|
+
case kRegexpLiteralString:
|
|
49
|
+
case kRegexpBeginLine:
|
|
50
|
+
case kRegexpEndLine:
|
|
51
|
+
case kRegexpBeginText:
|
|
52
|
+
case kRegexpWordBoundary:
|
|
53
|
+
case kRegexpNoWordBoundary:
|
|
54
|
+
case kRegexpEndText:
|
|
55
|
+
case kRegexpAnyChar:
|
|
56
|
+
case kRegexpAnyByte:
|
|
57
|
+
case kRegexpHaveMatch:
|
|
58
|
+
return true;
|
|
59
|
+
case kRegexpConcat:
|
|
60
|
+
case kRegexpAlternate:
|
|
61
|
+
// These are simple as long as the subpieces are simple.
|
|
62
|
+
subs = sub();
|
|
63
|
+
for (int i = 0; i < nsub_; i++)
|
|
64
|
+
if (!subs[i]->simple_)
|
|
65
|
+
return false;
|
|
66
|
+
return true;
|
|
67
|
+
case kRegexpCharClass:
|
|
68
|
+
// Simple as long as the char class is not empty, not full.
|
|
69
|
+
if (ccb_ != NULL)
|
|
70
|
+
return !ccb_->empty() && !ccb_->full();
|
|
71
|
+
return !cc_->empty() && !cc_->full();
|
|
72
|
+
case kRegexpCapture:
|
|
73
|
+
subs = sub();
|
|
74
|
+
return subs[0]->simple_;
|
|
75
|
+
case kRegexpStar:
|
|
76
|
+
case kRegexpPlus:
|
|
77
|
+
case kRegexpQuest:
|
|
78
|
+
subs = sub();
|
|
79
|
+
if (!subs[0]->simple_)
|
|
80
|
+
return false;
|
|
81
|
+
switch (subs[0]->op_) {
|
|
82
|
+
case kRegexpStar:
|
|
83
|
+
case kRegexpPlus:
|
|
84
|
+
case kRegexpQuest:
|
|
85
|
+
case kRegexpEmptyMatch:
|
|
86
|
+
case kRegexpNoMatch:
|
|
87
|
+
return false;
|
|
88
|
+
default:
|
|
89
|
+
break;
|
|
90
|
+
}
|
|
91
|
+
return true;
|
|
92
|
+
case kRegexpRepeat:
|
|
93
|
+
return false;
|
|
94
|
+
}
|
|
95
|
+
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
|
96
|
+
return false;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Walker subclass used by Simplify.
|
|
100
|
+
// The simplify walk is purely post-recursive: given the simplified children,
|
|
101
|
+
// PostVisit creates the simplified result.
|
|
102
|
+
// The child_args are simplified Regexp*s.
|
|
103
|
+
class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
|
104
|
+
public:
|
|
105
|
+
SimplifyWalker() {}
|
|
106
|
+
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
|
107
|
+
virtual Regexp* PostVisit(Regexp* re,
|
|
108
|
+
Regexp* parent_arg,
|
|
109
|
+
Regexp* pre_arg,
|
|
110
|
+
Regexp** child_args, int nchild_args);
|
|
111
|
+
virtual Regexp* Copy(Regexp* re);
|
|
112
|
+
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
|
113
|
+
|
|
114
|
+
private:
|
|
115
|
+
// These functions are declared inside SimplifyWalker so that
|
|
116
|
+
// they can edit the private fields of the Regexps they construct.
|
|
117
|
+
|
|
118
|
+
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
119
|
+
// Caller must Decref return value when done with it.
|
|
120
|
+
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
|
|
121
|
+
|
|
122
|
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
123
|
+
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
124
|
+
// Caller must Decref return value when done with it.
|
|
125
|
+
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
|
|
126
|
+
Regexp::ParseFlags parse_flags);
|
|
127
|
+
|
|
128
|
+
// Simplifies a character class by expanding any named classes
|
|
129
|
+
// into rune ranges. Does not edit re. Does not consume ref to re.
|
|
130
|
+
// Caller must Decref return value when done with it.
|
|
131
|
+
static Regexp* SimplifyCharClass(Regexp* re);
|
|
132
|
+
|
|
133
|
+
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
|
|
134
|
+
};
|
|
135
|
+
|
|
136
|
+
// Simplifies a regular expression, returning a new regexp.
|
|
137
|
+
// The new regexp uses traditional Unix egrep features only,
|
|
138
|
+
// plus the Perl (?:) non-capturing parentheses.
|
|
139
|
+
// Otherwise, no POSIX or Perl additions. The new regexp
|
|
140
|
+
// captures exactly the same subexpressions (with the same indices)
|
|
141
|
+
// as the original.
|
|
142
|
+
// Does not edit current object.
|
|
143
|
+
// Caller must Decref() return value when done with it.
|
|
144
|
+
|
|
145
|
+
Regexp* Regexp::Simplify() {
|
|
146
|
+
if (simple_)
|
|
147
|
+
return Incref();
|
|
148
|
+
SimplifyWalker w;
|
|
149
|
+
return w.Walk(this, NULL);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// After this line every occurrence of the identifier Simplify is rewritten
// by the preprocessor to DontCallSimplify, so the walker code below cannot
// call Regexp::Simplify by that name.
// NOTE(review): presumably DontCallSimplify is declared nowhere, turning such
// a call into a compile error -- confirm.
#define Simplify DontCallSimplify // Avoid accidental recursion
|
|
153
|
+
|
|
154
|
+
// Copying a result means handing out one more reference to the same Regexp.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
|
157
|
+
|
|
158
|
+
// Not expected to run: Simplify uses Walk rather than WalkExponential.
// If it is reached anyway, the call is logged and re is returned unchanged
// with an extra reference.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* unchanged = re->Incref();
  return unchanged;
}
|
|
164
|
+
|
|
165
|
+
// Stops the descent at any node already marked simple, returning that node
// with an extra reference.  For every other node returns NULL and leaves
// *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple_)
    return NULL;

  *stop = true;
  return re->Incref();
}
|
|
172
|
+
|
|
173
|
+
// Assembles the simplified form of re from its simplified children.
// Every entry of child_args is either stored in the returned node, returned
// directly, or released with Decref.  When nothing changed, re itself is
// marked simple and returned with an extra reference.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      // Leaves are always simple.
      re->simple_ = true;
      return re->Incref();

    case kRegexpConcat:
    case kRegexpAlternate: {
      // Simple as long as the pieces are simple.  First look for any
      // child that was replaced, so the common unchanged case allocates
      // nothing.
      const int count = re->nsub_;
      Regexp** old_subs = re->sub();
      int first_diff = 0;
      while (first_diff < count &&
             child_args[first_diff] == old_subs[first_diff])
        first_diff++;

      if (first_diff == count) {
        // Nothing changed: release the children and reuse re.
        for (int i = 0; i < count; i++)
          child_args[i]->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // Something changed: build a fresh node that takes over the
      // references held in child_args.
      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(count);
      Regexp** new_subs = rebuilt->sub();
      for (int i = 0; i < count; i++)
        new_subs[i] = child_args[i];
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpCapture: {
      Regexp* child = child_args[0];
      if (child != re->sub()[0]) {
        // The body changed: wrap the new body in a capture with the
        // same capture index.
        Regexp* capture = new Regexp(kRegexpCapture, re->parse_flags());
        capture->AllocSub(1);
        capture->sub()[0] = child;
        capture->cap_ = re->cap_;
        capture->simple_ = true;
        return capture;
      }
      child->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* child = child_args[0];

      // Repeating the empty string any number of times is still the
      // empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // Unchanged operand: the node is simple as it stands.
      if (child == re->sub()[0]) {
        child->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // The operators are idempotent when the flags agree, so applying
      // the same one twice collapses to the operand.
      const bool same_op = re->op() == child->op();
      if (same_op && re->parse_flags() == child->parse_flags())
        return child;

      Regexp* repeated = new Regexp(re->op(), re->parse_flags());
      repeated->AllocSub(1);
      repeated->sub()[0] = child;
      repeated->simple_ = true;
      return repeated;
    }

    case kRegexpRepeat: {
      Regexp* child = child_args[0];

      // Repeating the empty string any number of times is still the
      // empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // SimplifyRepeat does not consume child, so it is released here.
      Regexp* expanded = SimplifyRepeat(child, re->min_, re->max_,
                                        re->parse_flags());
      child->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* simplified = SimplifyCharClass(re);
      simplified->simple_ = true;
      return simplified;
    }
  }

  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
|
294
|
+
|
|
295
|
+
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
296
|
+
// Returns a new Regexp, handing the ref to the caller.
|
|
297
|
+
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
|
|
298
|
+
Regexp::ParseFlags parse_flags) {
|
|
299
|
+
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
|
|
300
|
+
re->AllocSub(2);
|
|
301
|
+
Regexp** subs = re->sub();
|
|
302
|
+
subs[0] = re1;
|
|
303
|
+
subs[1] = re2;
|
|
304
|
+
return re;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
308
|
+
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
309
|
+
// Caller must Decref return value when done with it.
|
|
310
|
+
// The result will *not* necessarily have the right capturing parens
|
|
311
|
+
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
|
312
|
+
// but in the Regexp* representation, both (x) are marked as $1.
|
|
313
|
+
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
|
314
|
+
Regexp::ParseFlags f) {
|
|
315
|
+
// x{n,} means at least n matches of x.
|
|
316
|
+
if (max == -1) {
|
|
317
|
+
// Special case: x{0,} is x*
|
|
318
|
+
if (min == 0)
|
|
319
|
+
return Regexp::Star(re->Incref(), f);
|
|
320
|
+
|
|
321
|
+
// Special case: x{1,} is x+
|
|
322
|
+
if (min == 1)
|
|
323
|
+
return Regexp::Plus(re->Incref(), f);
|
|
324
|
+
|
|
325
|
+
// General case: x{4,} is xxxx+
|
|
326
|
+
Regexp* nre = new Regexp(kRegexpConcat, f);
|
|
327
|
+
nre->AllocSub(min);
|
|
328
|
+
VLOG(1) << "Simplify " << min;
|
|
329
|
+
Regexp** nre_subs = nre->sub();
|
|
330
|
+
for (int i = 0; i < min-1; i++)
|
|
331
|
+
nre_subs[i] = re->Incref();
|
|
332
|
+
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
|
333
|
+
return nre;
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
// Special case: (x){0} matches only empty string.
|
|
337
|
+
if (min == 0 && max == 0)
|
|
338
|
+
return new Regexp(kRegexpEmptyMatch, f);
|
|
339
|
+
|
|
340
|
+
// Special case: x{1} is just x.
|
|
341
|
+
if (min == 1 && max == 1)
|
|
342
|
+
return re->Incref();
|
|
343
|
+
|
|
344
|
+
// General case: x{n,m} means n copies of x and m copies of x?.
|
|
345
|
+
// The machine will do less work if we nest the final m copies,
|
|
346
|
+
// so that x{2,5} = xx(x(x(x)?)?)?
|
|
347
|
+
|
|
348
|
+
// Build leading prefix: xx. Capturing only on the last one.
|
|
349
|
+
Regexp* nre = NULL;
|
|
350
|
+
if (min > 0) {
|
|
351
|
+
nre = new Regexp(kRegexpConcat, f);
|
|
352
|
+
nre->AllocSub(min);
|
|
353
|
+
Regexp** nre_subs = nre->sub();
|
|
354
|
+
for (int i = 0; i < min; i++)
|
|
355
|
+
nre_subs[i] = re->Incref();
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
// Build and attach suffix: (x(x(x)?)?)?
|
|
359
|
+
if (max > min) {
|
|
360
|
+
Regexp* suf = Regexp::Quest(re->Incref(), f);
|
|
361
|
+
for (int i = min+1; i < max; i++)
|
|
362
|
+
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
|
|
363
|
+
if (nre == NULL)
|
|
364
|
+
nre = suf;
|
|
365
|
+
else
|
|
366
|
+
nre = Concat2(nre, suf, f);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
if (nre == NULL) {
|
|
370
|
+
// Some degenerate case, like min > max, or min < max < 0.
|
|
371
|
+
// This shouldn't happen, because the parser rejects such regexps.
|
|
372
|
+
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
|
|
373
|
+
return new Regexp(kRegexpNoMatch, f);
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
return nre;
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
// Simplifies a character class.
|
|
380
|
+
// Caller must Decref return value when done with it.
|
|
381
|
+
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
|
|
382
|
+
CharClass* cc = re->cc();
|
|
383
|
+
|
|
384
|
+
// Special cases
|
|
385
|
+
if (cc->empty())
|
|
386
|
+
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
|
387
|
+
if (cc->full())
|
|
388
|
+
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
|
389
|
+
|
|
390
|
+
return re->Incref();
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
} // namespace re2
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "re2/stringpiece.h"
|
|
6
|
+
#include "util/util.h"
|
|
7
|
+
|
|
8
|
+
using re2::StringPiece;
|
|
9
|
+
|
|
10
|
+
// Writes exactly piece.size() bytes starting at piece.data() to the stream
// and returns the stream.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  const char* const bytes = piece.data();
  const int count = piece.size();
  o.write(bytes, count);
  return o;
}
|
|
14
|
+
|
|
15
|
+
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
|
|
16
|
+
int len = x.size();
|
|
17
|
+
if (len != y.size()) {
|
|
18
|
+
return false;
|
|
19
|
+
}
|
|
20
|
+
const char* p = x.data();
|
|
21
|
+
const char* p2 = y.data();
|
|
22
|
+
// Test last byte in case strings share large common prefix
|
|
23
|
+
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
|
|
24
|
+
const char* p_limit = p + len;
|
|
25
|
+
for (; p < p_limit; p++, p2++) {
|
|
26
|
+
if (*p != *p2)
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
return true;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// Replaces the contents of *target with the bytes this piece refers to.
void StringPiece::CopyToString(string* target) const {
  const char* const first = ptr_;
  const int count = length_;
  target->assign(first, count);
}
|
|
35
|
+
|
|
36
|
+
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
|
|
37
|
+
int ret = min(length_ - pos, n);
|
|
38
|
+
memcpy(buf, ptr_ + pos, ret);
|
|
39
|
+
return ret;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
int StringPiece::find(const StringPiece& s, size_type pos) const {
|
|
43
|
+
if (length_ < 0 || pos > static_cast<size_type>(length_))
|
|
44
|
+
return npos;
|
|
45
|
+
|
|
46
|
+
const char* result = std::search(ptr_ + pos, ptr_ + length_,
|
|
47
|
+
s.ptr_, s.ptr_ + s.length_);
|
|
48
|
+
const size_type xpos = result - ptr_;
|
|
49
|
+
return xpos + s.length_ <= length_ ? xpos : npos;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
int StringPiece::find(char c, size_type pos) const {
|
|
53
|
+
if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
|
|
54
|
+
return npos;
|
|
55
|
+
}
|
|
56
|
+
const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
|
|
57
|
+
return result != ptr_ + length_ ? result - ptr_ : npos;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
|
|
61
|
+
if (length_ < s.length_) return npos;
|
|
62
|
+
const size_t ulen = length_;
|
|
63
|
+
if (s.length_ == 0) return min(ulen, pos);
|
|
64
|
+
|
|
65
|
+
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
|
|
66
|
+
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
|
|
67
|
+
return result != last ? result - ptr_ : npos;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
int StringPiece::rfind(char c, size_type pos) const {
|
|
71
|
+
if (length_ <= 0) return npos;
|
|
72
|
+
for (int i = min(pos, static_cast<size_type>(length_ - 1));
|
|
73
|
+
i >= 0; --i) {
|
|
74
|
+
if (ptr_[i] == c) {
|
|
75
|
+
return i;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
return npos;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Returns the piece that starts at offset pos and spans at most n bytes.
// pos is clamped to the length of this piece and n to the bytes remaining
// after pos.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = (pos > length_) ? length_ : pos;
  const size_type remaining = length_ - start;
  const size_type count = (n > remaining) ? remaining : n;
  return StringPiece(ptr_ + start, count);
}
|
|
86
|
+
|
|
87
|
+
// Definition of the static member npos: the largest value of size_type.
const StringPiece::size_type StringPiece::npos =
    static_cast<StringPiece::size_type>(-1);
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// A string-like object that points to a sized piece of memory.
|
|
6
|
+
//
|
|
7
|
+
// Functions or methods may use const StringPiece& parameters to accept either
|
|
8
|
+
// a "const char*" or a "string" value that will be implicitly converted to
|
|
9
|
+
// a StringPiece. The implicit conversion means that it is often appropriate
|
|
10
|
+
// to include this .h file in other files rather than forward-declaring
|
|
11
|
+
// StringPiece as would be appropriate for most other Google classes.
|
|
12
|
+
//
|
|
13
|
+
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
|
14
|
+
// conversions from "const char*" to "string" and back again.
|
|
15
|
+
//
|
|
16
|
+
//
|
|
17
|
+
// Arghh! I wish C++ literals were "string".
|
|
18
|
+
|
|
19
|
+
#ifndef STRINGS_STRINGPIECE_H__
|
|
20
|
+
#define STRINGS_STRINGPIECE_H__
|
|
21
|
+
|
|
22
|
+
#include <string.h>
|
|
23
|
+
#include <cstddef>
|
|
24
|
+
#include <iosfwd>
|
|
25
|
+
#include <string>
|
|
26
|
+
|
|
27
|
+
namespace re2 {
|
|
28
|
+
|
|
29
|
+
// A view of length_ bytes starting at ptr_.  The class only stores the
// pointer and the length; none of its members allocate or free the bytes.
class StringPiece {
 private:
  const char*   ptr_;     // first byte of the view (NULL for the empty default)
  int           length_;  // number of bytes in the view

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  // A NULL str produces a piece of length 0; otherwise strlen(str) is used.
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets to the empty piece.
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points the piece at len bytes starting at data.
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  // Re-points the piece at a NUL-terminated string; NULL gives length 0.
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  // Same as set(const char*, int) for untyped memory.
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked byte access.
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes.  n is not checked against the length.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes.  n is not checked against the length.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way comparison: memcmp over the common prefix, then the shorter
  // piece orders first.  Returns <0, 0 or >0.
  int compare(const StringPiece& x) const {
    int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  // Replaces the contents of *target with the bytes of this piece.
  void CopyToString(std::string* target) const;
  // Appends the bytes of this piece to *target.  Defined here so that the
  // declaration has a definition; an empty piece (whose pointer may be
  // NULL) appends nothing.
  void AppendToString(std::string* target) const {
    if (length_ > 0)
      target->append(ptr_, length_);
  }

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  // Copies bytes of this piece into buf; returns the number copied.
  int copy(char* buf, size_type n, size_type pos = 0) const;

  // Forward and reverse searches; each returns an offset into this piece,
  // or npos when nothing is found.
  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  // Returns the sub-piece starting at pos and spanning at most n bytes.
  StringPiece substr(size_type pos, size_type n = npos) const;

  // Byte-wise equality; used by operator==.
  static bool _equal(const StringPiece&, const StringPiece&);
};
|
|
150
|
+
|
|
151
|
+
// Two pieces are equal when StringPiece::_equal says so.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  const bool same = StringPiece::_equal(x, y);
  return same;
}
|
|
154
|
+
|
|
155
|
+
// Negation of operator==.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  if (x == y)
    return false;
  return true;
}
|
|
158
|
+
|
|
159
|
+
// Orders pieces by memcmp over their common prefix; when the prefixes are
// equal the shorter piece orders first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int shared = std::min(x.size(), y.size());
  const int order = memcmp(x.data(), y.data(), shared);
  if (order != 0)
    return order < 0;
  return x.size() < y.size();
}
|
|
164
|
+
|
|
165
|
+
// x > y exactly when y < x.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  const bool y_before_x = (y < x);
  return y_before_x;
}
|
|
168
|
+
|
|
169
|
+
// x <= y exactly when x is not greater than y.
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  if (x > y)
    return false;
  return true;
}
|
|
172
|
+
|
|
173
|
+
// x >= y exactly when x is not less than y.
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  const bool x_before_y = (x < y);
  return !x_before_y;
}
|
|
176
|
+
|
|
177
|
+
} // namespace re2
|
|
178
|
+
|
|
179
|
+
// allow StringPiece to be logged
|
|
180
|
+
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
|
181
|
+
|
|
182
|
+
#endif // STRINGS_STRINGPIECE_H__
|