chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/onepass.cc
ADDED
@@ -0,0 +1,614 @@
|
|
1
|
+
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Tested by search_test.cc.
|
6
|
+
//
|
7
|
+
// Prog::SearchOnePass is an efficient implementation of
|
8
|
+
// regular expression search with submatch tracking for
|
9
|
+
// what I call "one-pass regular expressions". (An alternate
|
10
|
+
// name might be "backtracking-free regular expressions".)
|
11
|
+
//
|
12
|
+
// One-pass regular expressions have the property that
|
13
|
+
// at each input byte during an anchored match, there may be
|
14
|
+
// multiple alternatives but only one can proceed for any
|
15
|
+
// given input byte.
|
16
|
+
//
|
17
|
+
// For example, the regexp /x*yx*/ is one-pass: you read
|
18
|
+
// x's until a y, then you read the y, then you keep reading x's.
|
19
|
+
// At no point do you have to guess what to do or back up
|
20
|
+
// and try a different guess.
|
21
|
+
//
|
22
|
+
// On the other hand, /x*x/ is not one-pass: when you're
|
23
|
+
// looking at an input "x", it's not clear whether you should
|
24
|
+
// use it to extend the x* or as the final x.
|
25
|
+
//
|
26
|
+
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
|
27
|
+
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
|
28
|
+
//
|
29
|
+
// A simple intuition for identifying one-pass regular expressions
|
30
|
+
// is that it's always immediately obvious when a repetition ends.
|
31
|
+
// It must also be immediately obvious which branch of an | to take:
|
32
|
+
//
|
33
|
+
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
|
34
|
+
//
|
35
|
+
// The NFA-based search in nfa.cc does some bookkeeping to
|
36
|
+
// avoid the need for backtracking and its associated exponential blowup.
|
37
|
+
// But if we have a one-pass regular expression, there is no
|
38
|
+
// possibility of backtracking, so there is no need for the
|
39
|
+
// extra bookkeeping. Hence, this code.
|
40
|
+
//
|
41
|
+
// On a one-pass regular expression, the NFA code in nfa.cc
|
42
|
+
// runs at about 1/20 of the backtracking-based PCRE speed.
|
43
|
+
// In contrast, the code in this file runs at about the same
|
44
|
+
// speed as PCRE.
|
45
|
+
//
|
46
|
+
// One-pass regular expressions get used a lot when RE is
|
47
|
+
// used for parsing simple strings, so it pays off to
|
48
|
+
// notice them and handle them efficiently.
|
49
|
+
//
|
50
|
+
// See also Anne Brüggemann-Klein and Derick Wood,
|
51
|
+
// "One-unambiguous regular languages", Information and Computation 142(2).
|
52
|
+
|
53
|
+
#include <string.h>
|
54
|
+
#include <map>
|
55
|
+
#include "util/util.h"
|
56
|
+
#include "util/arena.h"
|
57
|
+
#include "util/sparse_set.h"
|
58
|
+
#include "re2/prog.h"
|
59
|
+
#include "re2/stringpiece.h"
|
60
|
+
|
61
|
+
namespace re2 {
|
62
|
+
|
63
|
+
static const int Debug = 0;
|
64
|
+
|
65
|
+
// The key insight behind this implementation is that the
|
66
|
+
// non-determinism in an NFA for a one-pass regular expression
|
67
|
+
// is contained. To explain what that means, first a
|
68
|
+
// refresher about what regular expression programs look like
|
69
|
+
// and how the usual NFA execution runs.
|
70
|
+
//
|
71
|
+
// In a regular expression program, only the kInstByteRange
|
72
|
+
// instruction processes an input byte c and moves on to the
|
73
|
+
// next byte in the string (it does so if c is in the given range).
|
74
|
+
// The kInstByteRange instructions correspond to literal characters
|
75
|
+
// and character classes in the regular expression.
|
76
|
+
//
|
77
|
+
// The kInstAlt instructions are used as wiring to connect the
|
78
|
+
// kInstByteRange instructions together in interesting ways when
|
79
|
+
// implementing | + and *.
|
80
|
+
// The kInstAlt instruction forks execution, like a goto that
|
81
|
+
// jumps to ip->out() and ip->out1() in parallel. Each of the
|
82
|
+
// resulting computation paths is called a thread.
|
83
|
+
//
|
84
|
+
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
|
85
|
+
// are interesting in their own right but like kInstAlt they don't
|
86
|
+
// advance the input pointer. Only kInstByteRange does.
|
87
|
+
//
|
88
|
+
// The automaton execution in nfa.cc runs all the possible
|
89
|
+
// threads of execution in lock-step over the input. To process
|
90
|
+
// a particular byte, each thread gets run until it either dies
|
91
|
+
// or finds a kInstByteRange instruction matching the byte.
|
92
|
+
// If the latter happens, the thread stops just past the
|
93
|
+
// kInstByteRange instruction (at ip->out()) and waits for
|
94
|
+
// the other threads to finish processing the input byte.
|
95
|
+
// Then, once all the threads have processed that input byte,
|
96
|
+
// the whole process repeats. The kInstAlt state instruction
|
97
|
+
// might create new threads during input processing, but no
|
98
|
+
// matter what, all the threads stop after a kInstByteRange
|
99
|
+
// and wait for the other threads to "catch up".
|
100
|
+
// Running in lock step like this ensures that the NFA reads
|
101
|
+
// the input string only once.
|
102
|
+
//
|
103
|
+
// Each thread maintains its own set of capture registers
|
104
|
+
// (the string positions at which it executed the kInstCapture
|
105
|
+
// instructions corresponding to capturing parentheses in the
|
106
|
+
// regular expression). Repeated copying of the capture registers
|
107
|
+
// is the main performance bottleneck in the NFA implementation.
|
108
|
+
//
|
109
|
+
// A regular expression program is "one-pass" if, no matter what
|
110
|
+
// the input string, there is only one thread that makes it
|
111
|
+
// past a kInstByteRange instruction at each input byte. This means
|
112
|
+
// that there is in some sense only one active thread throughout
|
113
|
+
// the execution. Other threads might be created during the
|
114
|
+
// processing of an input byte, but they are ephemeral: only one
|
115
|
+
// thread is left to start processing the next input byte.
|
116
|
+
// This is what I meant above when I said the non-determinism
|
117
|
+
// was "contained".
|
118
|
+
//
|
119
|
+
// To execute a one-pass regular expression program, we can build
|
120
|
+
// a DFA (no non-determinism) that has at most as many states as
|
121
|
+
// the NFA (compare this to the possibly exponential number of states
|
122
|
+
// in the general case). Each state records, for each possible
|
123
|
+
// input byte, the next state along with the conditions required
|
124
|
+
// before entering that state -- empty-width flags that must be true
|
125
|
+
// and capture operations that must be performed. It also records
|
126
|
+
// whether a set of conditions required to finish a match at that
|
127
|
+
// point in the input rather than process the next byte.
|
128
|
+
|
129
|
+
// A state in the one-pass NFA (aka DFA) - just an array of actions.
|
130
|
+
struct OneState;
|
131
|
+
|
132
|
+
// A state in the one-pass NFA - just an array of actions indexed
|
133
|
+
// by the bytemap_[] of the next input byte. (The bytemap
|
134
|
+
// maps next input bytes into equivalence classes, to reduce
|
135
|
+
// the memory footprint.)
|
136
|
+
struct OneState {
|
137
|
+
uint32 matchcond; // conditions to match right now.
|
138
|
+
uint32 action[1];
|
139
|
+
};
|
140
|
+
|
141
|
+
// The uint32 conditions in the action are a combination of
|
142
|
+
// condition and capture bits and the next state. The bottom 16 bits
|
143
|
+
// are the condition and capture bits, and the top 16 are the index of
|
144
|
+
// the next state.
|
145
|
+
//
|
146
|
+
// Bits 0-5 are the empty-width flags from prog.h.
|
147
|
+
// Bit 6 is kMatchWins, which means the match takes
|
148
|
+
// priority over moving to next in a first-match search.
|
149
|
+
// The remaining bits mark capture registers that should
|
150
|
+
// be set to the current input position. The capture bits
|
151
|
+
// start at index 2, since the search loop can take care of
|
152
|
+
// cap[0], cap[1] (the overall match position).
|
153
|
+
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
|
154
|
+
// No input position can satisfy both kEmptyWordBoundary
|
155
|
+
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
156
|
+
// instead of needing an extra bit.
|
157
|
+
|
158
|
+
static const int kIndexShift = 16; // number of bits below index
|
159
|
+
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
160
|
+
static const int kRealCapShift = kEmptyShift + 1;
|
161
|
+
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
162
|
+
|
163
|
+
// Parameters used to skip over cap[0], cap[1].
|
164
|
+
static const int kCapShift = kRealCapShift - 2;
|
165
|
+
static const int kMaxCap = kRealMaxCap + 2;
|
166
|
+
|
167
|
+
static const uint32 kMatchWins = 1 << kEmptyShift;
|
168
|
+
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
169
|
+
|
170
|
+
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
171
|
+
|
172
|
+
// Check, at compile time, that prog.h agrees with math above.
|
173
|
+
// This function is never called.
|
174
|
+
void OnePass_Checks() {
|
175
|
+
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
176
|
+
kEmptyShift_disagrees_with_kEmptyAllFlags);
|
177
|
+
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
178
|
+
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
|
179
|
+
kMaxCap_disagrees_with_kMaxOnePassCapture);
|
180
|
+
}
|
181
|
+
|
182
|
+
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
|
183
|
+
uint32 satisfied = Prog::EmptyFlags(context, p);
|
184
|
+
if (cond & kEmptyAllFlags & ~satisfied)
|
185
|
+
return false;
|
186
|
+
return true;
|
187
|
+
}
|
188
|
+
|
189
|
+
// Apply the capture bits in cond, saving p to the appropriate
|
190
|
+
// locations in cap[].
|
191
|
+
static void ApplyCaptures(uint32 cond, const char* p,
|
192
|
+
const char** cap, int ncap) {
|
193
|
+
for (int i = 2; i < ncap; i++)
|
194
|
+
if (cond & (1 << kCapShift << i))
|
195
|
+
cap[i] = p;
|
196
|
+
}
|
197
|
+
|
198
|
+
// Compute a node pointer.
|
199
|
+
// Basically (OneState*)(nodes + statesize*nodeindex)
|
200
|
+
// but the version with the C++ casts overflows 80 characters (and is ugly).
|
201
|
+
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
|
202
|
+
int nodeindex) {
|
203
|
+
return reinterpret_cast<OneState*>(
|
204
|
+
const_cast<uint8*>(nodes + statesize*nodeindex));
|
205
|
+
}
|
206
|
+
|
207
|
+
bool Prog::SearchOnePass(const StringPiece& text,
|
208
|
+
const StringPiece& const_context,
|
209
|
+
Anchor anchor, MatchKind kind,
|
210
|
+
StringPiece* match, int nmatch) {
|
211
|
+
if (anchor != kAnchored && kind != kFullMatch) {
|
212
|
+
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
|
213
|
+
return false;
|
214
|
+
}
|
215
|
+
|
216
|
+
// Make sure we have at least cap[1],
|
217
|
+
// because we use it to tell if we matched.
|
218
|
+
int ncap = 2*nmatch;
|
219
|
+
if (ncap < 2)
|
220
|
+
ncap = 2;
|
221
|
+
|
222
|
+
const char* cap[kMaxCap];
|
223
|
+
for (int i = 0; i < ncap; i++)
|
224
|
+
cap[i] = NULL;
|
225
|
+
|
226
|
+
const char* matchcap[kMaxCap];
|
227
|
+
for (int i = 0; i < ncap; i++)
|
228
|
+
matchcap[i] = NULL;
|
229
|
+
|
230
|
+
StringPiece context = const_context;
|
231
|
+
if (context.begin() == NULL)
|
232
|
+
context = text;
|
233
|
+
if (anchor_start() && context.begin() != text.begin())
|
234
|
+
return false;
|
235
|
+
if (anchor_end() && context.end() != text.end())
|
236
|
+
return false;
|
237
|
+
if (anchor_end())
|
238
|
+
kind = kFullMatch;
|
239
|
+
|
240
|
+
// State and act are marked volatile to
|
241
|
+
// keep the compiler from re-ordering the
|
242
|
+
// memory accesses walking over the NFA.
|
243
|
+
// This is worth about 5%.
|
244
|
+
volatile OneState* state = onepass_start_;
|
245
|
+
volatile uint8* nodes = onepass_nodes_;
|
246
|
+
volatile uint32 statesize = onepass_statesize_;
|
247
|
+
uint8* bytemap = bytemap_;
|
248
|
+
const char* bp = text.begin();
|
249
|
+
const char* ep = text.end();
|
250
|
+
const char* p;
|
251
|
+
bool matched = false;
|
252
|
+
matchcap[0] = bp;
|
253
|
+
cap[0] = bp;
|
254
|
+
uint32 nextmatchcond = state->matchcond;
|
255
|
+
for (p = bp; p < ep; p++) {
|
256
|
+
int c = bytemap[*p & 0xFF];
|
257
|
+
uint32 matchcond = nextmatchcond;
|
258
|
+
uint32 cond = state->action[c];
|
259
|
+
|
260
|
+
// Determine whether we can reach act->next.
|
261
|
+
// If so, advance state and nextmatchcond.
|
262
|
+
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
263
|
+
uint32 nextindex = cond >> kIndexShift;
|
264
|
+
state = IndexToNode(nodes, statesize, nextindex);
|
265
|
+
nextmatchcond = state->matchcond;
|
266
|
+
} else {
|
267
|
+
state = NULL;
|
268
|
+
nextmatchcond = kImpossible;
|
269
|
+
}
|
270
|
+
|
271
|
+
// This code section is carefully tuned.
|
272
|
+
// The goto sequence is about 10% faster than the
|
273
|
+
// obvious rewrite as a large if statement in the
|
274
|
+
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
|
275
|
+
|
276
|
+
// Saving the match capture registers is expensive.
|
277
|
+
// Is this intermediate match worth thinking about?
|
278
|
+
|
279
|
+
// Not if we want a full match.
|
280
|
+
if (kind == kFullMatch)
|
281
|
+
goto skipmatch;
|
282
|
+
|
283
|
+
// Not if it's impossible.
|
284
|
+
if (matchcond == kImpossible)
|
285
|
+
goto skipmatch;
|
286
|
+
|
287
|
+
// Not if the possible match is beaten by the certain
|
288
|
+
// match at the next byte. When this test is useless
|
289
|
+
// (e.g., HTTPPartialMatchRE2) it slows the loop by
|
290
|
+
// about 10%, but when it avoids work (e.g., DotMatchRE2),
|
291
|
+
// it cuts the loop execution by about 45%.
|
292
|
+
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
|
293
|
+
goto skipmatch;
|
294
|
+
|
295
|
+
// Finally, the match conditions must be satisfied.
|
296
|
+
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
|
297
|
+
for (int i = 2; i < 2*nmatch; i++)
|
298
|
+
matchcap[i] = cap[i];
|
299
|
+
if (nmatch > 1 && (matchcond & kCapMask))
|
300
|
+
ApplyCaptures(matchcond, p, matchcap, ncap);
|
301
|
+
matchcap[1] = p;
|
302
|
+
matched = true;
|
303
|
+
|
304
|
+
// If we're in longest match mode, we have to keep
|
305
|
+
// going and see if we find a longer match.
|
306
|
+
// In first match mode, we can stop if the match
|
307
|
+
// takes priority over the next state for this input byte.
|
308
|
+
// That bit is per-input byte and thus in cond, not matchcond.
|
309
|
+
if (kind == kFirstMatch && (cond & kMatchWins))
|
310
|
+
goto done;
|
311
|
+
}
|
312
|
+
|
313
|
+
skipmatch:
|
314
|
+
if (state == NULL)
|
315
|
+
goto done;
|
316
|
+
if ((cond & kCapMask) && nmatch > 1)
|
317
|
+
ApplyCaptures(cond, p, cap, ncap);
|
318
|
+
}
|
319
|
+
|
320
|
+
// Look for match at end of input.
|
321
|
+
{
|
322
|
+
uint32 matchcond = state->matchcond;
|
323
|
+
if (matchcond != kImpossible &&
|
324
|
+
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
325
|
+
if (nmatch > 1 && (matchcond & kCapMask))
|
326
|
+
ApplyCaptures(matchcond, p, cap, ncap);
|
327
|
+
for (int i = 2; i < ncap; i++)
|
328
|
+
matchcap[i] = cap[i];
|
329
|
+
matchcap[1] = p;
|
330
|
+
matched = true;
|
331
|
+
}
|
332
|
+
}
|
333
|
+
|
334
|
+
done:
|
335
|
+
if (!matched)
|
336
|
+
return false;
|
337
|
+
for (int i = 0; i < nmatch; i++)
|
338
|
+
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
|
339
|
+
return true;
|
340
|
+
}
|
341
|
+
|
342
|
+
|
343
|
+
// Analysis to determine whether a given regexp program is one-pass.
|
344
|
+
|
345
|
+
// If ip is not on workq, adds ip to work queue and returns true.
|
346
|
+
// If ip is already on work queue, does nothing and returns false.
|
347
|
+
// If ip is NULL, does nothing and returns true (pretends to add it).
|
348
|
+
typedef SparseSet Instq;
|
349
|
+
static bool AddQ(Instq *q, int id) {
|
350
|
+
if (id == 0)
|
351
|
+
return true;
|
352
|
+
if (q->contains(id))
|
353
|
+
return false;
|
354
|
+
q->insert(id);
|
355
|
+
return true;
|
356
|
+
}
|
357
|
+
|
358
|
+
struct InstCond {
|
359
|
+
int id;
|
360
|
+
uint32 cond;
|
361
|
+
};
|
362
|
+
|
363
|
+
// Returns whether this is a one-pass program; that is,
|
364
|
+
// returns whether it is safe to use SearchOnePass on this program.
|
365
|
+
// These conditions must be true for any instruction ip:
|
366
|
+
//
|
367
|
+
// (1) for any other Inst nip, there is at most one input-free
|
368
|
+
// path from ip to nip.
|
369
|
+
// (2) there is at most one kInstByte instruction reachable from
|
370
|
+
// ip that matches any particular byte c.
|
371
|
+
// (3) there is at most one input-free path from ip to a kInstMatch
|
372
|
+
// instruction.
|
373
|
+
//
|
374
|
+
// This is actually just a conservative approximation: it might
|
375
|
+
// return false when the answer is true, when kInstEmptyWidth
|
376
|
+
// instructions are involved.
|
377
|
+
// Constructs and saves corresponding one-pass NFA on success.
|
378
|
+
bool Prog::IsOnePass() {
|
379
|
+
if (did_onepass_)
|
380
|
+
return onepass_start_ != NULL;
|
381
|
+
did_onepass_ = true;
|
382
|
+
|
383
|
+
if (start() == 0) // no match
|
384
|
+
return false;
|
385
|
+
|
386
|
+
// Steal memory for the one-pass NFA from the overall DFA budget.
|
387
|
+
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
388
|
+
// Limit max node count to 65000 as a conservative estimate to
|
389
|
+
// avoid overflowing 16-bit node index in encoding.
|
390
|
+
int maxnodes = 2 + byte_inst_count_;
|
391
|
+
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
|
392
|
+
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
393
|
+
return false;
|
394
|
+
|
395
|
+
// Flood the graph starting at the start state, and check
|
396
|
+
// that in each reachable state, each possible byte leads
|
397
|
+
// to a unique next state.
|
398
|
+
int size = this->size();
|
399
|
+
InstCond *stack = new InstCond[size];
|
400
|
+
|
401
|
+
int* nodebyid = new int[size]; // indexed by ip
|
402
|
+
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
403
|
+
|
404
|
+
uint8* nodes = new uint8[maxnodes*statesize];
|
405
|
+
uint8* nodep = nodes;
|
406
|
+
|
407
|
+
Instq tovisit(size), workq(size);
|
408
|
+
AddQ(&tovisit, start());
|
409
|
+
nodebyid[start()] = 0;
|
410
|
+
nodep += statesize;
|
411
|
+
int nalloc = 1;
|
412
|
+
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
413
|
+
int id = *it;
|
414
|
+
int nodeindex = nodebyid[id];
|
415
|
+
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
416
|
+
|
417
|
+
// Flood graph using manual stack, filling in actions as found.
|
418
|
+
// Default is none.
|
419
|
+
for (int b = 0; b < bytemap_range_; b++)
|
420
|
+
node->action[b] = kImpossible;
|
421
|
+
node->matchcond = kImpossible;
|
422
|
+
|
423
|
+
workq.clear();
|
424
|
+
bool matched = false;
|
425
|
+
int nstack = 0;
|
426
|
+
stack[nstack].id = id;
|
427
|
+
stack[nstack++].cond = 0;
|
428
|
+
while (nstack > 0) {
|
429
|
+
int id = stack[--nstack].id;
|
430
|
+
Prog::Inst* ip = inst(id);
|
431
|
+
uint32 cond = stack[nstack].cond;
|
432
|
+
switch (ip->opcode()) {
|
433
|
+
case kInstAltMatch:
|
434
|
+
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
435
|
+
// Should implement it in this engine, but it's subtle.
|
436
|
+
// Fall through.
|
437
|
+
case kInstAlt:
|
438
|
+
// If already on work queue, (1) is violated: bail out.
|
439
|
+
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
|
440
|
+
goto fail;
|
441
|
+
stack[nstack].id = ip->out1();
|
442
|
+
stack[nstack++].cond = cond;
|
443
|
+
stack[nstack].id = ip->out();
|
444
|
+
stack[nstack++].cond = cond;
|
445
|
+
break;
|
446
|
+
|
447
|
+
case kInstByteRange: {
|
448
|
+
int nextindex = nodebyid[ip->out()];
|
449
|
+
if (nextindex == -1) {
|
450
|
+
if (nalloc >= maxnodes) {
|
451
|
+
if (Debug)
|
452
|
+
LOG(ERROR)
|
453
|
+
<< StringPrintf("Not OnePass: hit node limit %d > %d",
|
454
|
+
nalloc, maxnodes);
|
455
|
+
goto fail;
|
456
|
+
}
|
457
|
+
nextindex = nalloc;
|
458
|
+
nodep += statesize;
|
459
|
+
nodebyid[ip->out()] = nextindex;
|
460
|
+
nalloc++;
|
461
|
+
AddQ(&tovisit, ip->out());
|
462
|
+
}
|
463
|
+
if (matched)
|
464
|
+
cond |= kMatchWins;
|
465
|
+
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
466
|
+
int b = bytemap_[c];
|
467
|
+
c = unbytemap_[b]; // last c in byte class
|
468
|
+
uint32 act = node->action[b];
|
469
|
+
uint32 newact = (nextindex << kIndexShift) | cond;
|
470
|
+
if ((act & kImpossible) == kImpossible) {
|
471
|
+
node->action[b] = newact;
|
472
|
+
} else if (act != newact) {
|
473
|
+
if (Debug) {
|
474
|
+
LOG(ERROR)
|
475
|
+
<< StringPrintf("Not OnePass: conflict on byte "
|
476
|
+
"%#x at state %d",
|
477
|
+
c, *it);
|
478
|
+
}
|
479
|
+
goto fail;
|
480
|
+
}
|
481
|
+
}
|
482
|
+
if (ip->foldcase()) {
|
483
|
+
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
484
|
+
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
485
|
+
for (int c = lo; c <= hi; c++) {
|
486
|
+
int b = bytemap_[c];
|
487
|
+
c = unbytemap_[b]; // last c in class
|
488
|
+
uint32 act = node->action[b];
|
489
|
+
uint32 newact = (nextindex << kIndexShift) | cond;
|
490
|
+
if ((act & kImpossible) == kImpossible) {
|
491
|
+
node->action[b] = newact;
|
492
|
+
} else if (act != newact) {
|
493
|
+
if (Debug) {
|
494
|
+
LOG(ERROR)
|
495
|
+
<< StringPrintf("Not OnePass: conflict on byte "
|
496
|
+
"%#x at state %d",
|
497
|
+
c, *it);
|
498
|
+
}
|
499
|
+
goto fail;
|
500
|
+
}
|
501
|
+
}
|
502
|
+
}
|
503
|
+
break;
|
504
|
+
}
|
505
|
+
|
506
|
+
case kInstCapture:
|
507
|
+
if (ip->cap() < kMaxCap)
|
508
|
+
cond |= (1 << kCapShift) << ip->cap();
|
509
|
+
goto QueueEmpty;
|
510
|
+
|
511
|
+
case kInstEmptyWidth:
|
512
|
+
cond |= ip->empty();
|
513
|
+
goto QueueEmpty;
|
514
|
+
|
515
|
+
case kInstNop:
|
516
|
+
QueueEmpty:
|
517
|
+
// kInstCapture and kInstNop always proceed to ip->out().
|
518
|
+
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
519
|
+
// but as a conservative approximation we assume it always does.
|
520
|
+
// We could be a little more precise by looking at what c
|
521
|
+
// is, but that seems like overkill.
|
522
|
+
|
523
|
+
// If already on work queue, (1) is violated: bail out.
|
524
|
+
if (!AddQ(&workq, ip->out())) {
|
525
|
+
if (Debug) {
|
526
|
+
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
|
527
|
+
" %d -> %d\n",
|
528
|
+
*it, ip->out());
|
529
|
+
}
|
530
|
+
goto fail;
|
531
|
+
}
|
532
|
+
stack[nstack].id = ip->out();
|
533
|
+
stack[nstack++].cond = cond;
|
534
|
+
break;
|
535
|
+
|
536
|
+
case kInstMatch:
|
537
|
+
if (matched) {
|
538
|
+
// (3) is violated
|
539
|
+
if (Debug) {
|
540
|
+
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
|
541
|
+
" from %d\n", *it);
|
542
|
+
}
|
543
|
+
goto fail;
|
544
|
+
}
|
545
|
+
matched = true;
|
546
|
+
node->matchcond = cond;
|
547
|
+
break;
|
548
|
+
|
549
|
+
case kInstFail:
|
550
|
+
break;
|
551
|
+
}
|
552
|
+
}
|
553
|
+
}
|
554
|
+
|
555
|
+
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
556
|
+
string dump = "prog dump:\n" + Dump() + "node dump\n";
|
557
|
+
map<int, int> idmap;
|
558
|
+
for (int i = 0; i < size; i++)
|
559
|
+
if (nodebyid[i] != -1)
|
560
|
+
idmap[nodebyid[i]] = i;
|
561
|
+
|
562
|
+
StringAppendF(&dump, "byte ranges:\n");
|
563
|
+
int i = 0;
|
564
|
+
for (int b = 0; b < bytemap_range_; b++) {
|
565
|
+
int lo = i;
|
566
|
+
while (bytemap_[i] == b)
|
567
|
+
i++;
|
568
|
+
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
|
569
|
+
}
|
570
|
+
|
571
|
+
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
572
|
+
int id = *it;
|
573
|
+
int nodeindex = nodebyid[id];
|
574
|
+
if (nodeindex == -1)
|
575
|
+
continue;
|
576
|
+
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
577
|
+
string s;
|
578
|
+
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
579
|
+
nodeindex, id, node->matchcond);
|
580
|
+
for (int i = 0; i < bytemap_range_; i++) {
|
581
|
+
if ((node->action[i] & kImpossible) == kImpossible)
|
582
|
+
continue;
|
583
|
+
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
|
584
|
+
i, node->action[i] & 0xFFFF,
|
585
|
+
node->action[i] >> kIndexShift,
|
586
|
+
idmap[node->action[i] >> kIndexShift]);
|
587
|
+
}
|
588
|
+
}
|
589
|
+
LOG(ERROR) << dump;
|
590
|
+
}
|
591
|
+
|
592
|
+
// Overallocated earlier; cut down to actual size.
|
593
|
+
nodep = new uint8[nalloc*statesize];
|
594
|
+
memmove(nodep, nodes, nalloc*statesize);
|
595
|
+
delete[] nodes;
|
596
|
+
nodes = nodep;
|
597
|
+
|
598
|
+
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
|
599
|
+
onepass_nodes_ = nodes;
|
600
|
+
onepass_statesize_ = statesize;
|
601
|
+
dfa_mem_ -= nalloc*statesize;
|
602
|
+
|
603
|
+
delete[] stack;
|
604
|
+
delete[] nodebyid;
|
605
|
+
return true;
|
606
|
+
|
607
|
+
fail:
|
608
|
+
delete[] stack;
|
609
|
+
delete[] nodebyid;
|
610
|
+
delete[] nodes;
|
611
|
+
return false;
|
612
|
+
}
|
613
|
+
|
614
|
+
} // namespace re2
|