chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/prog.h
ADDED
@@ -0,0 +1,376 @@
|
|
1
|
+
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Compiled representation of regular expressions.
|
6
|
+
// See regexp.h for the Regexp class, which represents a regular
|
7
|
+
// expression symbolically.
|
8
|
+
|
9
|
+
#ifndef RE2_PROG_H__
|
10
|
+
#define RE2_PROG_H__
|
11
|
+
|
12
|
+
#include "util/util.h"
|
13
|
+
#include "re2/re2.h"
|
14
|
+
|
15
|
+
namespace re2 {
|
16
|
+
|
17
|
+
// Simple fixed-size bitmap.
|
18
|
+
template<int Bits>
|
19
|
+
class Bitmap {
|
20
|
+
public:
|
21
|
+
Bitmap() { Reset(); }
|
22
|
+
int Size() { return Bits; }
|
23
|
+
|
24
|
+
void Reset() {
|
25
|
+
for (int i = 0; i < Words; i++)
|
26
|
+
w_[i] = 0;
|
27
|
+
}
|
28
|
+
bool Get(int k) const {
|
29
|
+
return w_[k >> WordLog] & (1<<(k & 31));
|
30
|
+
}
|
31
|
+
void Set(int k) {
|
32
|
+
w_[k >> WordLog] |= 1<<(k & 31);
|
33
|
+
}
|
34
|
+
void Clear(int k) {
|
35
|
+
w_[k >> WordLog] &= ~(1<<(k & 31));
|
36
|
+
}
|
37
|
+
uint32 Word(int i) const {
|
38
|
+
return w_[i];
|
39
|
+
}
|
40
|
+
|
41
|
+
private:
|
42
|
+
static const int WordLog = 5;
|
43
|
+
static const int Words = (Bits+31)/32;
|
44
|
+
uint32 w_[Words];
|
45
|
+
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
|
46
|
+
};
|
47
|
+
|
48
|
+
|
49
|
+
// Opcodes for Inst
|
50
|
+
enum InstOp {
|
51
|
+
kInstAlt = 0, // choose between out_ and out1_
|
52
|
+
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
|
53
|
+
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
|
54
|
+
kInstCapture, // capturing parenthesis number cap_
|
55
|
+
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
|
56
|
+
kInstMatch, // found a match!
|
57
|
+
kInstNop, // no-op; occasionally unavoidable
|
58
|
+
kInstFail, // never match; occasionally unavoidable
|
59
|
+
};
|
60
|
+
|
61
|
+
// Bit flags for empty-width specials
|
62
|
+
enum EmptyOp {
|
63
|
+
kEmptyBeginLine = 1<<0, // ^ - beginning of line
|
64
|
+
kEmptyEndLine = 1<<1, // $ - end of line
|
65
|
+
kEmptyBeginText = 1<<2, // \A - beginning of text
|
66
|
+
kEmptyEndText = 1<<3, // \z - end of text
|
67
|
+
kEmptyWordBoundary = 1<<4, // \b - word boundary
|
68
|
+
kEmptyNonWordBoundary = 1<<5, // \B - not \b
|
69
|
+
kEmptyAllFlags = (1<<6)-1,
|
70
|
+
};
|
71
|
+
|
72
|
+
class Regexp;
|
73
|
+
|
74
|
+
class DFA;
|
75
|
+
struct OneState;
|
76
|
+
|
77
|
+
// Compiled form of regexp program.
|
78
|
+
class Prog {
|
79
|
+
public:
|
80
|
+
Prog();
|
81
|
+
~Prog();
|
82
|
+
|
83
|
+
// Single instruction in regexp program.
|
84
|
+
class Inst {
|
85
|
+
public:
|
86
|
+
Inst() : out_opcode_(0), out1_(0) { }
|
87
|
+
|
88
|
+
// Constructors per opcode
|
89
|
+
void InitAlt(uint32 out, uint32 out1);
|
90
|
+
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
91
|
+
void InitCapture(int cap, uint32 out);
|
92
|
+
void InitEmptyWidth(EmptyOp empty, uint32 out);
|
93
|
+
void InitMatch(int id);
|
94
|
+
void InitNop(uint32 out);
|
95
|
+
void InitFail();
|
96
|
+
|
97
|
+
// Getters
|
98
|
+
int id(Prog* p) { return this - p->inst_; }
|
99
|
+
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
100
|
+
int out() { return out_opcode_>>3; }
|
101
|
+
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
102
|
+
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
103
|
+
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
104
|
+
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
105
|
+
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
106
|
+
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
107
|
+
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
108
|
+
bool greedy(Prog *p) {
|
109
|
+
DCHECK_EQ(opcode(), kInstAltMatch);
|
110
|
+
return p->inst(out())->opcode() == kInstByteRange;
|
111
|
+
}
|
112
|
+
|
113
|
+
// Does this inst (an kInstByteRange) match c?
|
114
|
+
inline bool Matches(int c) {
|
115
|
+
DCHECK_EQ(opcode(), kInstByteRange);
|
116
|
+
if (foldcase_ && 'A' <= c && c <= 'Z')
|
117
|
+
c += 'a' - 'A';
|
118
|
+
return lo_ <= c && c <= hi_;
|
119
|
+
}
|
120
|
+
|
121
|
+
// Returns string representation for debugging.
|
122
|
+
string Dump();
|
123
|
+
|
124
|
+
// Maximum instruction id.
|
125
|
+
// (Must fit in out_opcode_, and PatchList steals another bit.)
|
126
|
+
static const int kMaxInst = (1<<28) - 1;
|
127
|
+
|
128
|
+
private:
|
129
|
+
void set_opcode(InstOp opcode) {
|
130
|
+
out_opcode_ = (out()<<3) | opcode;
|
131
|
+
}
|
132
|
+
|
133
|
+
void set_out(int out) {
|
134
|
+
out_opcode_ = (out<<3) | opcode();
|
135
|
+
}
|
136
|
+
|
137
|
+
void set_out_opcode(int out, InstOp opcode) {
|
138
|
+
out_opcode_ = (out<<3) | opcode;
|
139
|
+
}
|
140
|
+
|
141
|
+
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
142
|
+
union { // additional instruction arguments:
|
143
|
+
uint32 out1_; // opcode == kInstAlt
|
144
|
+
// alternate next instruction
|
145
|
+
|
146
|
+
int32 cap_; // opcode == kInstCapture
|
147
|
+
// Index of capture register (holds text
|
148
|
+
// position recorded by capturing parentheses).
|
149
|
+
// For \n (the submatch for the nth parentheses),
|
150
|
+
// the left parenthesis captures into register 2*n
|
151
|
+
// and the right one captures into register 2*n+1.
|
152
|
+
|
153
|
+
int32 match_id_; // opcode == kInstMatch
|
154
|
+
// Match ID to identify this match (for re2::Set).
|
155
|
+
|
156
|
+
struct { // opcode == kInstByteRange
|
157
|
+
uint8 lo_; // byte range is lo_-hi_ inclusive
|
158
|
+
uint8 hi_; //
|
159
|
+
uint8 foldcase_; // convert A-Z to a-z before checking range.
|
160
|
+
};
|
161
|
+
|
162
|
+
EmptyOp empty_; // opcode == kInstEmptyWidth
|
163
|
+
// empty_ is bitwise OR of kEmpty* flags above.
|
164
|
+
};
|
165
|
+
|
166
|
+
friend class Compiler;
|
167
|
+
friend struct PatchList;
|
168
|
+
friend class Prog;
|
169
|
+
|
170
|
+
DISALLOW_EVIL_CONSTRUCTORS(Inst);
|
171
|
+
};
|
172
|
+
|
173
|
+
// Whether to anchor the search.
|
174
|
+
enum Anchor {
|
175
|
+
kUnanchored, // match anywhere
|
176
|
+
kAnchored, // match only starting at beginning of text
|
177
|
+
};
|
178
|
+
|
179
|
+
// Kind of match to look for (for anchor != kFullMatch)
|
180
|
+
//
|
181
|
+
// kLongestMatch mode finds the overall longest
|
182
|
+
// match but still makes its submatch choices the way
|
183
|
+
// Perl would, not in the way prescribed by POSIX.
|
184
|
+
// The POSIX rules are much more expensive to implement,
|
185
|
+
// and no one has needed them.
|
186
|
+
//
|
187
|
+
// kFullMatch is not strictly necessary -- we could use
|
188
|
+
// kLongestMatch and then check the length of the match -- but
|
189
|
+
// the matching code can run faster if it knows to consider only
|
190
|
+
// full matches.
|
191
|
+
enum MatchKind {
|
192
|
+
kFirstMatch, // like Perl, PCRE
|
193
|
+
kLongestMatch, // like egrep or POSIX
|
194
|
+
kFullMatch, // match only entire text; implies anchor==kAnchored
|
195
|
+
kManyMatch // for SearchDFA, records set of matches
|
196
|
+
};
|
197
|
+
|
198
|
+
Inst *inst(int id) { return &inst_[id]; }
|
199
|
+
int start() { return start_; }
|
200
|
+
int start_unanchored() { return start_unanchored_; }
|
201
|
+
void set_start(int start) { start_ = start; }
|
202
|
+
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
203
|
+
int64 size() { return size_; }
|
204
|
+
bool reversed() { return reversed_; }
|
205
|
+
void set_reversed(bool reversed) { reversed_ = reversed; }
|
206
|
+
int64 byte_inst_count() { return byte_inst_count_; }
|
207
|
+
const Bitmap<256>& byterange() { return byterange_; }
|
208
|
+
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
209
|
+
int64 dfa_mem() { return dfa_mem_; }
|
210
|
+
int flags() { return flags_; }
|
211
|
+
void set_flags(int flags) { flags_ = flags; }
|
212
|
+
bool anchor_start() { return anchor_start_; }
|
213
|
+
void set_anchor_start(bool b) { anchor_start_ = b; }
|
214
|
+
bool anchor_end() { return anchor_end_; }
|
215
|
+
void set_anchor_end(bool b) { anchor_end_ = b; }
|
216
|
+
int bytemap_range() { return bytemap_range_; }
|
217
|
+
const uint8* bytemap() { return bytemap_; }
|
218
|
+
|
219
|
+
// Returns string representation of program for debugging.
|
220
|
+
string Dump();
|
221
|
+
string DumpUnanchored();
|
222
|
+
|
223
|
+
// Record that at some point in the prog, the bytes in the range
|
224
|
+
// lo-hi (inclusive) are treated as different from bytes outside the range.
|
225
|
+
// Tracking this lets the DFA collapse commonly-treated byte ranges
|
226
|
+
// when recording state pointers, greatly reducing its memory footprint.
|
227
|
+
void MarkByteRange(int lo, int hi);
|
228
|
+
|
229
|
+
// Returns the set of kEmpty flags that are in effect at
|
230
|
+
// position p within context.
|
231
|
+
static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
232
|
+
|
233
|
+
// Returns whether byte c is a word character: ASCII only.
|
234
|
+
// Used by the implementation of \b and \B.
|
235
|
+
// This is not right for Unicode, but:
|
236
|
+
// - it's hard to get right in a byte-at-a-time matching world
|
237
|
+
// (the DFA has only one-byte lookahead).
|
238
|
+
// - even if the lookahead were possible, the Progs would be huge.
|
239
|
+
// This crude approximation is the same one PCRE uses.
|
240
|
+
static bool IsWordChar(uint8 c) {
|
241
|
+
return ('A' <= c && c <= 'Z') ||
|
242
|
+
('a' <= c && c <= 'z') ||
|
243
|
+
('0' <= c && c <= '9') ||
|
244
|
+
c == '_';
|
245
|
+
}
|
246
|
+
|
247
|
+
// Execution engines. They all search for the regexp (run the prog)
|
248
|
+
// in text, which is in the larger context (used for ^ $ \b etc).
|
249
|
+
// Anchor and kind control the kind of search.
|
250
|
+
// Returns true if match found, false if not.
|
251
|
+
// If match found, fills match[0..nmatch-1] with submatch info.
|
252
|
+
// match[0] is overall match, match[1] is first set of parens, etc.
|
253
|
+
// If a particular submatch is not matched during the regexp match,
|
254
|
+
// it is set to NULL.
|
255
|
+
//
|
256
|
+
// Matching text == StringPiece(NULL, 0) is treated as any other empty
|
257
|
+
// string, but note that on return, it will not be possible to distinguish
|
258
|
+
// submatches that matched that empty string from submatches that didn't
|
259
|
+
// match anything. Either way, match[i] == NULL.
|
260
|
+
|
261
|
+
// Search using NFA: can find submatches but kind of slow.
|
262
|
+
bool SearchNFA(const StringPiece& text, const StringPiece& context,
|
263
|
+
Anchor anchor, MatchKind kind,
|
264
|
+
StringPiece* match, int nmatch);
|
265
|
+
|
266
|
+
// Search using DFA: much faster than NFA but only finds
|
267
|
+
// end of match and can use a lot more memory.
|
268
|
+
// Returns whether a match was found.
|
269
|
+
// If the DFA runs out of memory, sets *failed to true and returns false.
|
270
|
+
// If matches != NULL and kind == kManyMatch and there is a match,
|
271
|
+
// SearchDFA fills matches with the match IDs of the final matching state.
|
272
|
+
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
273
|
+
Anchor anchor, MatchKind kind,
|
274
|
+
StringPiece* match0, bool* failed,
|
275
|
+
vector<int>* matches);
|
276
|
+
|
277
|
+
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
278
|
+
// Usually the DFA is built out incrementally, as needed, which
|
279
|
+
// avoids lots of unnecessary work. This function is useful only
|
280
|
+
// for testing purposes. Returns number of states.
|
281
|
+
int BuildEntireDFA(MatchKind kind);
|
282
|
+
|
283
|
+
// Compute byte map.
|
284
|
+
void ComputeByteMap();
|
285
|
+
|
286
|
+
// Run peep-hole optimizer on program.
|
287
|
+
void Optimize();
|
288
|
+
|
289
|
+
// One-pass NFA: only correct if IsOnePass() is true,
|
290
|
+
// but much faster than NFA (competitive with PCRE)
|
291
|
+
// for those expressions.
|
292
|
+
bool IsOnePass();
|
293
|
+
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
|
294
|
+
Anchor anchor, MatchKind kind,
|
295
|
+
StringPiece* match, int nmatch);
|
296
|
+
|
297
|
+
// Bit-state backtracking. Fast on small cases but uses memory
|
298
|
+
// proportional to the product of the program size and the text size.
|
299
|
+
bool SearchBitState(const StringPiece& text, const StringPiece& context,
|
300
|
+
Anchor anchor, MatchKind kind,
|
301
|
+
StringPiece* match, int nmatch);
|
302
|
+
|
303
|
+
static const int kMaxOnePassCapture = 5; // $0 through $4
|
304
|
+
|
305
|
+
// Backtracking search: the gold standard against which the other
|
306
|
+
// implementations are checked. FOR TESTING ONLY.
|
307
|
+
// It allocates a ton of memory to avoid running forever.
|
308
|
+
// It is also recursive, so can't use in production (will overflow stacks).
|
309
|
+
// The name "Unsafe" here is supposed to be a flag that
|
310
|
+
// you should not be using this function.
|
311
|
+
bool UnsafeSearchBacktrack(const StringPiece& text,
|
312
|
+
const StringPiece& context,
|
313
|
+
Anchor anchor, MatchKind kind,
|
314
|
+
StringPiece* match, int nmatch);
|
315
|
+
|
316
|
+
// Computes range for any strings matching regexp. The min and max can in
|
317
|
+
// some cases be arbitrarily precise, so the caller gets to specify the
|
318
|
+
// maximum desired length of string returned.
|
319
|
+
//
|
320
|
+
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
321
|
+
// string s that is an anchored match for this regexp satisfies
|
322
|
+
// min <= s && s <= max.
|
323
|
+
//
|
324
|
+
// Note that PossibleMatchRange() will only consider the first copy of an
|
325
|
+
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
326
|
+
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
327
|
+
// do not compile down to infinite repetitions.
|
328
|
+
//
|
329
|
+
// Returns true on success, false on error.
|
330
|
+
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
331
|
+
|
332
|
+
// Compiles a collection of regexps to Prog. Each regexp will have
|
333
|
+
// its own Match instruction recording the index in the vector.
|
334
|
+
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
335
|
+
Regexp* re);
|
336
|
+
|
337
|
+
private:
|
338
|
+
friend class Compiler;
|
339
|
+
|
340
|
+
DFA* GetDFA(MatchKind kind);
|
341
|
+
|
342
|
+
bool anchor_start_; // regexp has explicit start anchor
|
343
|
+
bool anchor_end_; // regexp has explicit end anchor
|
344
|
+
bool reversed_; // whether program runs backward over input
|
345
|
+
bool did_onepass_; // has IsOnePass been called?
|
346
|
+
|
347
|
+
int start_; // entry point for program
|
348
|
+
int start_unanchored_; // unanchored entry point for program
|
349
|
+
int size_; // number of instructions
|
350
|
+
int byte_inst_count_; // number of kInstByteRange instructions
|
351
|
+
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
352
|
+
int flags_; // regexp parse flags
|
353
|
+
int onepass_statesize_; // byte size of each OneState* node
|
354
|
+
|
355
|
+
Inst* inst_; // pointer to instruction array
|
356
|
+
|
357
|
+
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
358
|
+
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
359
|
+
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
360
|
+
int64 dfa_mem_; // Maximum memory for DFAs.
|
361
|
+
void (*delete_dfa_)(DFA* dfa);
|
362
|
+
|
363
|
+
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
364
|
+
// commonly-treated byte range.
|
365
|
+
uint8 bytemap_[256]; // map from input bytes to byte classes
|
366
|
+
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
367
|
+
|
368
|
+
uint8* onepass_nodes_; // data for OnePass nodes
|
369
|
+
OneState* onepass_start_; // start node for OnePass program
|
370
|
+
|
371
|
+
DISALLOW_EVIL_CONSTRUCTORS(Prog);
|
372
|
+
};
|
373
|
+
|
374
|
+
} // namespace re2
|
375
|
+
|
376
|
+
#endif // RE2_PROG_H__
|