chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/regexp.h
ADDED
@@ -0,0 +1,632 @@
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// --- SPONSORED LINK --------------------------------------------------
|
6
|
+
// If you want to use this library for regular expression matching,
|
7
|
+
// you should use re2/re2.h, which provides a class RE2 that
|
8
|
+
// mimics the PCRE interface provided by PCRE's C++ wrappers.
|
9
|
+
// This header describes the low-level interface used to implement RE2
|
10
|
+
// and may change in backwards-incompatible ways from time to time.
|
11
|
+
// In contrast, RE2's interface will not.
|
12
|
+
// ---------------------------------------------------------------------
|
13
|
+
|
14
|
+
// Regular expression library: parsing, execution, and manipulation
|
15
|
+
// of regular expressions.
|
16
|
+
//
|
17
|
+
// Any operation that traverses the Regexp structures should be written
|
18
|
+
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
|
19
|
+
// regular expressions such as x++++++++++++++++++++... might cause recursive
|
20
|
+
// traversals to overflow the stack.
|
21
|
+
//
|
22
|
+
// It is the caller's responsibility to provide appropriate mutual exclusion
|
23
|
+
// around manipulation of the regexps. RE2 does this.
|
24
|
+
//
|
25
|
+
// PARSING
|
26
|
+
//
|
27
|
+
// Regexp::Parse parses regular expressions encoded in UTF-8.
|
28
|
+
// The default syntax is POSIX extended regular expressions,
|
29
|
+
// with the following changes:
|
30
|
+
//
|
31
|
+
// 1. Backreferences (optional in POSIX EREs) are not supported.
|
32
|
+
// (Supporting them precludes the use of DFA-based
|
33
|
+
// matching engines.)
|
34
|
+
//
|
35
|
+
// 2. Collating elements and collation classes are not supported.
|
36
|
+
// (No one has needed or wanted them.)
|
37
|
+
//
|
38
|
+
// The exact syntax accepted can be modified by passing flags to
|
39
|
+
// Regexp::Parse. In particular, many of the basic Perl additions
|
40
|
+
// are available. The flags are documented below (search for LikePerl).
|
41
|
+
//
|
42
|
+
// If parsed with the flag Regexp::Latin1, both the regular expression
|
43
|
+
// and the input to the matching routines are assumed to be encoded in
|
44
|
+
// Latin-1, not UTF-8.
|
45
|
+
//
|
46
|
+
// EXECUTION
|
47
|
+
//
|
48
|
+
// Once Regexp has parsed a regular expression, it provides methods
|
49
|
+
// to search text using that regular expression. These methods are
|
50
|
+
// implemented via calling out to other regular expression libraries.
|
51
|
+
// (Let's call them the sublibraries.)
|
52
|
+
//
|
53
|
+
// To call a sublibrary, Regexp does not simply prepare a
|
54
|
+
// string version of the regular expression and hand it to the
|
55
|
+
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
|
56
|
+
// corresponding internal representation used by the sublibrary.
|
57
|
+
// This has the drawback of needing to know the internal representation
|
58
|
+
// used by the sublibrary, but it has two important benefits:
|
59
|
+
//
|
60
|
+
// 1. The syntax and meaning of regular expressions is guaranteed
|
61
|
+
// to be that used by Regexp's parser, not the syntax expected
|
62
|
+
// by the sublibrary. Regexp might accept a restricted or
|
63
|
+
// expanded syntax for regular expressions as compared with
|
64
|
+
// the sublibrary. As long as Regexp can translate from its
|
65
|
+
// internal form into the sublibrary's, clients need not know
|
66
|
+
// exactly which sublibrary they are using.
|
67
|
+
//
|
68
|
+
// 2. The sublibrary parsers are bypassed. For whatever reason,
|
69
|
+
// sublibrary regular expression parsers often have security
|
70
|
+
// problems. For example, plan9grep's regular expression parser
|
71
|
+
// has a buffer overflow in its handling of large character
|
72
|
+
// classes, and PCRE's parser has had buffer overflow problems
|
73
|
+
// in the past. Security-team requires sandboxing of sublibrary
|
74
|
+
// regular expression parsers. Avoiding the sublibrary parsers
|
75
|
+
// avoids the sandbox.
|
76
|
+
//
|
77
|
+
// The execution methods we use now are provided by the compiled form,
|
78
|
+
// Prog, described in prog.h
|
79
|
+
//
|
80
|
+
// MANIPULATION
|
81
|
+
//
|
82
|
+
// Unlike other regular expression libraries, Regexp makes its parsed
|
83
|
+
// form accessible to clients, so that client code can analyze the
|
84
|
+
// parsed regular expressions.
|
85
|
+
|
86
|
+
#ifndef RE2_REGEXP_H__
|
87
|
+
#define RE2_REGEXP_H__
|
88
|
+
|
89
|
+
#include "util/util.h"
|
90
|
+
#include "re2/stringpiece.h"
|
91
|
+
|
92
|
+
namespace re2 {
|
93
|
+
|
94
|
+
// Operator codes for the nodes of a parsed regular expression.
// Numbering starts at 1; kMaxRegexpOp is an alias for the last operator.
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
  // Matches no strings.
  kRegexpNoMatch = 1,

  // Matches empty string.
  kRegexpEmptyMatch,

  // Matches rune_.
  kRegexpLiteral,

  // Matches runes_.
  kRegexpLiteralString,

  // Matches concatenation of sub_[0..nsub-1].
  kRegexpConcat,
  // Matches union of sub_[0..nsub-1].
  kRegexpAlternate,

  // Matches sub_[0] zero or more times.
  kRegexpStar,
  // Matches sub_[0] one or more times.
  kRegexpPlus,
  // Matches sub_[0] zero or one times.
  kRegexpQuest,

  // Matches sub_[0] at least min_ times, at most max_ times.
  // max_ == -1 means no upper limit.
  kRegexpRepeat,

  // Parenthesized (capturing) subexpression.  Index is cap_.
  // Optionally, capturing name is name_.
  kRegexpCapture,

  // Matches any character.
  kRegexpAnyChar,

  // Matches any byte [sic].
  kRegexpAnyByte,

  // Matches empty string at beginning of line.
  kRegexpBeginLine,
  // Matches empty string at end of line.
  kRegexpEndLine,

  // Matches word boundary "\b".
  kRegexpWordBoundary,
  // Matches not-a-word boundary "\B".
  kRegexpNoWordBoundary,

  // Matches empty string at beginning of text.
  kRegexpBeginText,
  // Matches empty string at end of text.
  kRegexpEndText,

  // Matches character class given by cc_.
  kRegexpCharClass,

  // Forces match of entire expression right now,
  // with match ID match_id_ (used by RE2::Set).
  kRegexpHaveMatch,

  kMaxRegexpOp = kRegexpHaveMatch,
};
|
158
|
+
|
159
|
+
// Status codes reported through RegexpStatus.
// kRegexpSuccess is 0; every other value denotes an error.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
  // No error
  kRegexpSuccess = 0,

  // Unexpected error
  kRegexpInternalError,

  // Parse errors
  kRegexpBadEscape,           // bad escape sequence
  kRegexpBadCharClass,        // bad character class
  kRegexpBadCharRange,        // bad character class range
  kRegexpMissingBracket,      // missing closing ]
  kRegexpMissingParen,        // missing closing )
  kRegexpTrailingBackslash,   // at end of regexp
  kRegexpRepeatArgument,      // repeat argument missing, e.g. "*"
  kRegexpRepeatSize,          // bad repetition argument
  kRegexpRepeatOp,            // bad repetition operator
  kRegexpBadPerlOp,           // bad perl operator
  kRegexpBadUTF8,             // invalid UTF-8 in regexp
  kRegexpBadNamedCapture,     // bad named capture
};
|
181
|
+
|
182
|
+
// Error status for certain operations.
|
183
|
+
class RegexpStatus {
|
184
|
+
public:
|
185
|
+
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
186
|
+
~RegexpStatus() { delete tmp_; }
|
187
|
+
|
188
|
+
void set_code(enum RegexpStatusCode code) { code_ = code; }
|
189
|
+
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
190
|
+
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
191
|
+
enum RegexpStatusCode code() const { return code_; }
|
192
|
+
const StringPiece& error_arg() const { return error_arg_; }
|
193
|
+
bool ok() const { return code() == kRegexpSuccess; }
|
194
|
+
|
195
|
+
// Copies state from status.
|
196
|
+
void Copy(const RegexpStatus& status);
|
197
|
+
|
198
|
+
// Returns text equivalent of code, e.g.:
|
199
|
+
// "Bad character class"
|
200
|
+
static const string& CodeText(enum RegexpStatusCode code);
|
201
|
+
|
202
|
+
// Returns text describing error, e.g.:
|
203
|
+
// "Bad character class: [z-a]"
|
204
|
+
string Text() const;
|
205
|
+
|
206
|
+
private:
|
207
|
+
enum RegexpStatusCode code_; // Kind of error
|
208
|
+
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
209
|
+
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
210
|
+
|
211
|
+
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
|
212
|
+
};
|
213
|
+
|
214
|
+
// Walker to implement Simplify.
class SimplifyWalker;

// Compiled form; see prog.h
class Prog;
|
219
|
+
|
220
|
+
struct RuneRange {
|
221
|
+
RuneRange() : lo(0), hi(0) { }
|
222
|
+
RuneRange(int l, int h) : lo(l), hi(h) { }
|
223
|
+
Rune lo;
|
224
|
+
Rune hi;
|
225
|
+
};
|
226
|
+
|
227
|
+
// Less-than on RuneRanges treats a == b if they overlap at all.
|
228
|
+
// This lets us look in a set to find the range covering a particular Rune.
|
229
|
+
struct RuneRangeLess {
|
230
|
+
bool operator()(const RuneRange& a, const RuneRange& b) const {
|
231
|
+
return a.hi < b.lo;
|
232
|
+
}
|
233
|
+
};
|
234
|
+
|
235
|
+
class CharClassBuilder;
|
236
|
+
|
237
|
+
class CharClass {
|
238
|
+
public:
|
239
|
+
void Delete();
|
240
|
+
|
241
|
+
typedef RuneRange* iterator;
|
242
|
+
iterator begin() { return ranges_; }
|
243
|
+
iterator end() { return ranges_ + nranges_; }
|
244
|
+
|
245
|
+
int size() { return nrunes_; }
|
246
|
+
bool empty() { return nrunes_ == 0; }
|
247
|
+
bool full() { return nrunes_ == Runemax+1; }
|
248
|
+
bool FoldsASCII() { return folds_ascii_; }
|
249
|
+
|
250
|
+
bool Contains(Rune r);
|
251
|
+
CharClass* Negate();
|
252
|
+
|
253
|
+
private:
|
254
|
+
CharClass(); // not implemented
|
255
|
+
~CharClass(); // not implemented
|
256
|
+
static CharClass* New(int maxranges);
|
257
|
+
|
258
|
+
friend class CharClassBuilder;
|
259
|
+
|
260
|
+
bool folds_ascii_;
|
261
|
+
int nrunes_;
|
262
|
+
RuneRange *ranges_;
|
263
|
+
int nranges_;
|
264
|
+
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
|
265
|
+
};
|
266
|
+
|
267
|
+
class Regexp {
|
268
|
+
public:
|
269
|
+
|
270
|
+
// Flags for parsing. Can be ORed together.
|
271
|
+
enum ParseFlags {
|
272
|
+
NoParseFlags = 0,
|
273
|
+
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
274
|
+
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
275
|
+
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
276
|
+
// and [[:space:]] to match newline.
|
277
|
+
DotNL = 1<<3, // Allow . to match newline.
|
278
|
+
MatchNL = ClassNL | DotNL,
|
279
|
+
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
280
|
+
// end of text, not around embedded newlines.
|
281
|
+
// (Perl's default)
|
282
|
+
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
283
|
+
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
284
|
+
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
285
|
+
PerlB = 1<<8, // Allow Perl's \b and \B.
|
286
|
+
PerlX = 1<<9, // Perl extensions:
|
287
|
+
// non-capturing parens - (?: )
|
288
|
+
// non-greedy operators - *? +? ?? {}?
|
289
|
+
// flag edits - (?i) (?-i) (?i: )
|
290
|
+
// i - FoldCase
|
291
|
+
// m - !OneLine
|
292
|
+
// s - DotNL
|
293
|
+
// U - NonGreedy
|
294
|
+
// line ends: \A \z
|
295
|
+
// \Q and \E to disable/enable metacharacters
|
296
|
+
// (?P<name>expr) for named captures
|
297
|
+
// \C to match any single byte
|
298
|
+
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
299
|
+
// and \P{Han} for its negation.
|
300
|
+
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
301
|
+
// it explicitly.
|
302
|
+
|
303
|
+
// As close to Perl as we can get.
|
304
|
+
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
305
|
+
UnicodeGroups,
|
306
|
+
|
307
|
+
// Internal use only.
|
308
|
+
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
|
309
|
+
};
|
310
|
+
|
311
|
+
// Get. No set, Regexps are logically immutable once created.
|
312
|
+
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
313
|
+
int nsub() { return nsub_; }
|
314
|
+
bool simple() { return simple_; }
|
315
|
+
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
316
|
+
int Ref(); // For testing.
|
317
|
+
|
318
|
+
Regexp** sub() {
|
319
|
+
if(nsub_ <= 1)
|
320
|
+
return &subone_;
|
321
|
+
else
|
322
|
+
return submany_;
|
323
|
+
}
|
324
|
+
|
325
|
+
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
|
326
|
+
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
|
327
|
+
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
|
328
|
+
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
|
329
|
+
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
|
330
|
+
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
|
331
|
+
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
|
332
|
+
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
|
333
|
+
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
|
334
|
+
|
335
|
+
// Increments reference count, returns object as convenience.
|
336
|
+
Regexp* Incref();
|
337
|
+
|
338
|
+
// Decrements reference count and deletes this object if count reaches 0.
|
339
|
+
void Decref();
|
340
|
+
|
341
|
+
// Parses string s to produce regular expression, returned.
|
342
|
+
// Caller must release return value with re->Decref().
|
343
|
+
// On failure, sets *status (if status != NULL) and returns NULL.
|
344
|
+
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
|
345
|
+
RegexpStatus* status);
|
346
|
+
|
347
|
+
// Returns a _new_ simplified version of the current regexp.
|
348
|
+
// Does not edit the current regexp.
|
349
|
+
// Caller must release return value with re->Decref().
|
350
|
+
// Simplified means that counted repetition has been rewritten
|
351
|
+
// into simpler terms and all Perl/POSIX features have been
|
352
|
+
// removed. The result will capture exactly the same
|
353
|
+
// subexpressions the original did, unless formatted with ToString.
|
354
|
+
Regexp* Simplify();
|
355
|
+
friend class SimplifyWalker;
|
356
|
+
|
357
|
+
// Parses the regexp src and then simplifies it and sets *dst to the
|
358
|
+
// string representation of the simplified form. Returns true on success.
|
359
|
+
// Returns false and sets *status (if status != NULL) on parse error.
|
360
|
+
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
361
|
+
string* dst,
|
362
|
+
RegexpStatus* status);
|
363
|
+
|
364
|
+
// Returns the number of capturing groups in the regexp.
|
365
|
+
int NumCaptures();
|
366
|
+
friend class NumCapturesWalker;
|
367
|
+
|
368
|
+
// Returns a map from names to capturing group indices,
|
369
|
+
// or NULL if the regexp contains no named capture groups.
|
370
|
+
// The caller is responsible for deleting the map.
|
371
|
+
map<string, int>* NamedCaptures();
|
372
|
+
|
373
|
+
// Returns a map from capturing group indices to capturing group
|
374
|
+
// names or NULL if the regexp contains no named capture groups. The
|
375
|
+
// caller is responsible for deleting the map.
|
376
|
+
map<int, string>* CaptureNames();
|
377
|
+
|
378
|
+
// Returns a string representation of the current regexp,
|
379
|
+
// using as few parentheses as possible.
|
380
|
+
string ToString();
|
381
|
+
|
382
|
+
// Convenience functions. They consume the passed reference,
|
383
|
+
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
|
384
|
+
// They do not consume allocated arrays like subs or runes.
|
385
|
+
static Regexp* Plus(Regexp* sub, ParseFlags flags);
|
386
|
+
static Regexp* Star(Regexp* sub, ParseFlags flags);
|
387
|
+
static Regexp* Quest(Regexp* sub, ParseFlags flags);
|
388
|
+
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
|
389
|
+
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
|
390
|
+
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
|
391
|
+
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
|
392
|
+
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
|
393
|
+
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
|
394
|
+
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
|
395
|
+
static Regexp* HaveMatch(int match_id, ParseFlags flags);
|
396
|
+
|
397
|
+
// Like Alternate but does not factor out common prefixes.
|
398
|
+
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
|
399
|
+
|
400
|
+
// Debugging function. Returns string format for regexp
|
401
|
+
// that makes structure clear. Does NOT use regexp syntax.
|
402
|
+
string Dump();
|
403
|
+
|
404
|
+
// Helper traversal class, defined fully in walker-inl.h.
|
405
|
+
template<typename T> class Walker;
|
406
|
+
|
407
|
+
// Compile to Prog. See prog.h
|
408
|
+
// Reverse prog expects to be run over text backward.
|
409
|
+
// Construction and execution of prog will
|
410
|
+
// stay within approximately max_mem bytes of memory.
|
411
|
+
// If max_mem <= 0, a reasonable default is used.
|
412
|
+
Prog* CompileToProg(int64 max_mem);
|
413
|
+
Prog* CompileToReverseProg(int64 max_mem);
|
414
|
+
|
415
|
+
// Whether to expect this library to find exactly the same answer as PCRE
|
416
|
+
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
417
|
+
// obscure cases behave differently. Technically this is more a property
|
418
|
+
// of the Prog than the Regexp, but the computation is much easier to do
|
419
|
+
// on the Regexp. See mimics_pcre.cc for the exact conditions.
|
420
|
+
bool MimicsPCRE();
|
421
|
+
|
422
|
+
// Benchmarking function.
|
423
|
+
void NullWalk();
|
424
|
+
|
425
|
+
// Whether every match of this regexp must be anchored and
|
426
|
+
// begin with a non-empty fixed string (perhaps after ASCII
|
427
|
+
// case-folding). If so, returns the prefix and the sub-regexp that
|
428
|
+
// follows it.
|
429
|
+
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
|
430
|
+
|
431
|
+
private:
|
432
|
+
// Constructor allocates vectors as appropriate for operator.
|
433
|
+
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
|
434
|
+
|
435
|
+
// Use Decref() instead of delete to release Regexps.
|
436
|
+
// This is private to catch deletes at compile time.
|
437
|
+
~Regexp();
|
438
|
+
void Destroy();
|
439
|
+
bool QuickDestroy();
|
440
|
+
|
441
|
+
// Helpers for Parse. Listed here so they can edit Regexps.
|
442
|
+
class ParseState;
|
443
|
+
friend class ParseState;
|
444
|
+
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
445
|
+
RegexpStatus* status);
|
446
|
+
|
447
|
+
// Helper for testing [sic].
|
448
|
+
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
|
449
|
+
|
450
|
+
// Computes whether Regexp is already simple.
|
451
|
+
bool ComputeSimple();
|
452
|
+
|
453
|
+
// Constructor that generates a concatenation or alternation,
|
454
|
+
// enforcing the limit on the number of subexpressions for
|
455
|
+
// a particular Regexp.
|
456
|
+
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
|
457
|
+
ParseFlags flags, bool can_factor);
|
458
|
+
|
459
|
+
// Returns the leading string that re starts with.
|
460
|
+
// The returned Rune* points into a piece of re,
|
461
|
+
// so it must not be used after the caller calls re->Decref().
|
462
|
+
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
|
463
|
+
|
464
|
+
// Removes the first n leading runes from the beginning of re.
|
465
|
+
// Edits re in place.
|
466
|
+
static void RemoveLeadingString(Regexp* re, int n);
|
467
|
+
|
468
|
+
// Returns the leading regexp in re's top-level concatenation.
|
469
|
+
// The returned Regexp* points at re or a sub-expression of re,
|
470
|
+
// so it must not be used after the caller calls re->Decref().
|
471
|
+
static Regexp* LeadingRegexp(Regexp* re);
|
472
|
+
|
473
|
+
// Removes LeadingRegexp(re) from re and returns the remainder.
|
474
|
+
// Might edit re in place.
|
475
|
+
static Regexp* RemoveLeadingRegexp(Regexp* re);
|
476
|
+
|
477
|
+
// Simplifies an alternation of literal strings by factoring out
|
478
|
+
// common prefixes.
|
479
|
+
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
480
|
+
static int FactorAlternationRecursive(Regexp** sub, int nsub,
|
481
|
+
ParseFlags flags, int maxdepth);
|
482
|
+
|
483
|
+
// Is a == b? Only efficient on regexps that have not been through
|
484
|
+
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
485
|
+
// take a long time. Do not call on such regexps, hence private.
|
486
|
+
static bool Equal(Regexp* a, Regexp* b);
|
487
|
+
|
488
|
+
// Allocate space for n sub-regexps.
|
489
|
+
void AllocSub(int n) {
|
490
|
+
if (n < 0 || static_cast<uint16>(n) != n)
|
491
|
+
LOG(FATAL) << "Cannot AllocSub " << n;
|
492
|
+
if (n > 1)
|
493
|
+
submany_ = new Regexp*[n];
|
494
|
+
nsub_ = n;
|
495
|
+
}
|
496
|
+
|
497
|
+
// Add Rune to LiteralString
|
498
|
+
void AddRuneToString(Rune r);
|
499
|
+
|
500
|
+
// Swaps this with that, in place.
|
501
|
+
void Swap(Regexp *that);
|
502
|
+
|
503
|
+
// Operator. See description of operators above.
|
504
|
+
// uint8 instead of RegexpOp to control space usage.
|
505
|
+
uint8 op_;
|
506
|
+
|
507
|
+
// Is this regexp structure already simple
|
508
|
+
// (has it been returned by Simplify)?
|
509
|
+
// uint8 instead of bool to control space usage.
|
510
|
+
uint8 simple_;
|
511
|
+
|
512
|
+
// Flags saved from parsing and used during execution.
|
513
|
+
// (Only FoldCase is used.)
|
514
|
+
// uint16 instead of ParseFlags to control space usage.
|
515
|
+
uint16 parse_flags_;
|
516
|
+
|
517
|
+
// Reference count. Exists so that SimplifyRegexp can build
|
518
|
+
// regexp structures that are dags rather than trees to avoid
|
519
|
+
// exponential blowup in space requirements.
|
520
|
+
// uint16 to control space usage.
|
521
|
+
// The standard regexp routines will never generate a
|
522
|
+
// ref greater than the maximum repeat count (100),
|
523
|
+
// but even so, Incref and Decref consult an overflow map
|
524
|
+
// when ref_ reaches kMaxRef.
|
525
|
+
uint16 ref_;
|
526
|
+
static const uint16 kMaxRef = 0xffff;
|
527
|
+
|
528
|
+
// Subexpressions.
|
529
|
+
// uint16 to control space usage.
|
530
|
+
// Concat and Alternate handle larger numbers of subexpressions
|
531
|
+
// by building concatenation or alternation trees.
|
532
|
+
// Other routines should call Concat or Alternate instead of
|
533
|
+
// filling in sub() by hand.
|
534
|
+
uint16 nsub_;
|
535
|
+
static const uint16 kMaxNsub = 0xffff;
|
536
|
+
union {
|
537
|
+
Regexp** submany_; // if nsub_ > 1
|
538
|
+
Regexp* subone_; // if nsub_ == 1
|
539
|
+
};
|
540
|
+
|
541
|
+
// Extra space for parse and teardown stacks.
|
542
|
+
Regexp* down_;
|
543
|
+
|
544
|
+
// Arguments to operator. See description of operators above.
|
545
|
+
union {
|
546
|
+
struct { // Repeat
|
547
|
+
int max_;
|
548
|
+
int min_;
|
549
|
+
};
|
550
|
+
struct { // Capture
|
551
|
+
int cap_;
|
552
|
+
string* name_;
|
553
|
+
};
|
554
|
+
struct { // LiteralString
|
555
|
+
int nrunes_;
|
556
|
+
Rune* runes_;
|
557
|
+
};
|
558
|
+
struct { // CharClass
|
559
|
+
// These two could be in separate union members,
|
560
|
+
// but it wouldn't save any space (there are other two-word structs)
|
561
|
+
// and keeping them separate avoids confusion during parsing.
|
562
|
+
CharClass* cc_;
|
563
|
+
CharClassBuilder* ccb_;
|
564
|
+
};
|
565
|
+
Rune rune_; // Literal
|
566
|
+
int match_id_; // HaveMatch
|
567
|
+
void *the_union_[2]; // as big as any other element, for memset
|
568
|
+
};
|
569
|
+
|
570
|
+
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
|
571
|
+
};
|
572
|
+
|
573
|
+
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
574
|
+
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
|
575
|
+
|
576
|
+
class CharClassBuilder {
|
577
|
+
public:
|
578
|
+
CharClassBuilder();
|
579
|
+
|
580
|
+
typedef RuneRangeSet::iterator iterator;
|
581
|
+
iterator begin() { return ranges_.begin(); }
|
582
|
+
iterator end() { return ranges_.end(); }
|
583
|
+
|
584
|
+
int size() { return nrunes_; }
|
585
|
+
bool empty() { return nrunes_ == 0; }
|
586
|
+
bool full() { return nrunes_ == Runemax+1; }
|
587
|
+
|
588
|
+
bool Contains(Rune r);
|
589
|
+
bool FoldsASCII();
|
590
|
+
bool AddRange(Rune lo, Rune hi); // returns whether class changed
|
591
|
+
CharClassBuilder* Copy();
|
592
|
+
void AddCharClass(CharClassBuilder* cc);
|
593
|
+
void Negate();
|
594
|
+
void RemoveAbove(Rune r);
|
595
|
+
CharClass* GetCharClass();
|
596
|
+
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
597
|
+
|
598
|
+
private:
|
599
|
+
static const uint32 AlphaMask = (1<<26) - 1;
|
600
|
+
uint32 upper_; // bitmap of A-Z
|
601
|
+
uint32 lower_; // bitmap of a-z
|
602
|
+
int nrunes_;
|
603
|
+
RuneRangeSet ranges_;
|
604
|
+
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
|
605
|
+
};
|
606
|
+
|
607
|
+
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
|
608
|
+
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
609
|
+
{
|
610
|
+
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
|
611
|
+
}
|
612
|
+
|
613
|
+
// Bitwise XOR of two ParseFlags values, computed on the int values and
// cast back so the result stays a ParseFlags.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
}
|
617
|
+
|
618
|
+
// Bitwise AND of two ParseFlags values, computed on the int values and
// cast back so the result stays a ParseFlags.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
}
|
622
|
+
|
623
|
+
// Bitwise complement of a ParseFlags value, computed on the int value and
// cast back so the result stays a ParseFlags.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
  return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
}
|
627
|
+
|
628
|
+
|
629
|
+
|
630
|
+
} // namespace re2
|
631
|
+
|
632
|
+
#endif // RE2_REGEXP_H__
|