chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/re2/rune.cc
ADDED
@@ -0,0 +1,258 @@
|
|
1
|
+
/*
|
2
|
+
* The authors of this software are Rob Pike and Ken Thompson.
|
3
|
+
* Copyright (c) 2002 by Lucent Technologies.
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose without fee is hereby granted, provided that this entire notice
|
6
|
+
* is included in all copies of any software which is or includes a copy
|
7
|
+
* or modification of this software and in all copies of the supporting
|
8
|
+
* documentation for such software.
|
9
|
+
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
10
|
+
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
11
|
+
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
12
|
+
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
13
|
+
*/
|
14
|
+
#include <stdarg.h>
|
15
|
+
#include <string.h>
|
16
|
+
#include "util/utf.h"
|
17
|
+
|
18
|
+
namespace re2 {
|
19
|
+
|
20
|
+
enum
|
21
|
+
{
|
22
|
+
Bit1 = 7,
|
23
|
+
Bitx = 6,
|
24
|
+
Bit2 = 5,
|
25
|
+
Bit3 = 4,
|
26
|
+
Bit4 = 3,
|
27
|
+
Bit5 = 2,
|
28
|
+
|
29
|
+
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
30
|
+
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
31
|
+
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
32
|
+
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
33
|
+
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
34
|
+
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
35
|
+
|
36
|
+
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
37
|
+
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
38
|
+
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
39
|
+
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
40
|
+
/* 0001 1111 1111 1111 1111 1111 */
|
41
|
+
|
42
|
+
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
43
|
+
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
44
|
+
|
45
|
+
Bad = Runeerror,
|
46
|
+
};
|
47
|
+
|
48
|
+
int
|
49
|
+
chartorune(Rune *rune, const char *str)
|
50
|
+
{
|
51
|
+
int c, c1, c2, c3;
|
52
|
+
long l;
|
53
|
+
|
54
|
+
/*
|
55
|
+
* one character sequence
|
56
|
+
* 00000-0007F => T1
|
57
|
+
*/
|
58
|
+
c = *(unsigned char*)str;
|
59
|
+
if(c < Tx) {
|
60
|
+
*rune = c;
|
61
|
+
return 1;
|
62
|
+
}
|
63
|
+
|
64
|
+
/*
|
65
|
+
* two character sequence
|
66
|
+
* 0080-07FF => T2 Tx
|
67
|
+
*/
|
68
|
+
c1 = *(unsigned char*)(str+1) ^ Tx;
|
69
|
+
if(c1 & Testx)
|
70
|
+
goto bad;
|
71
|
+
if(c < T3) {
|
72
|
+
if(c < T2)
|
73
|
+
goto bad;
|
74
|
+
l = ((c << Bitx) | c1) & Rune2;
|
75
|
+
if(l <= Rune1)
|
76
|
+
goto bad;
|
77
|
+
*rune = l;
|
78
|
+
return 2;
|
79
|
+
}
|
80
|
+
|
81
|
+
/*
|
82
|
+
* three character sequence
|
83
|
+
* 0800-FFFF => T3 Tx Tx
|
84
|
+
*/
|
85
|
+
c2 = *(unsigned char*)(str+2) ^ Tx;
|
86
|
+
if(c2 & Testx)
|
87
|
+
goto bad;
|
88
|
+
if(c < T4) {
|
89
|
+
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
90
|
+
if(l <= Rune2)
|
91
|
+
goto bad;
|
92
|
+
*rune = l;
|
93
|
+
return 3;
|
94
|
+
}
|
95
|
+
|
96
|
+
/*
|
97
|
+
* four character sequence (21-bit value)
|
98
|
+
* 10000-1FFFFF => T4 Tx Tx Tx
|
99
|
+
*/
|
100
|
+
c3 = *(unsigned char*)(str+3) ^ Tx;
|
101
|
+
if (c3 & Testx)
|
102
|
+
goto bad;
|
103
|
+
if (c < T5) {
|
104
|
+
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
105
|
+
if (l <= Rune3)
|
106
|
+
goto bad;
|
107
|
+
*rune = l;
|
108
|
+
return 4;
|
109
|
+
}
|
110
|
+
|
111
|
+
/*
|
112
|
+
* Support for 5-byte or longer UTF-8 would go here, but
|
113
|
+
* since we don't have that, we'll just fall through to bad.
|
114
|
+
*/
|
115
|
+
|
116
|
+
/*
|
117
|
+
* bad decoding
|
118
|
+
*/
|
119
|
+
bad:
|
120
|
+
*rune = Bad;
|
121
|
+
return 1;
|
122
|
+
}
|
123
|
+
|
124
|
+
int
|
125
|
+
runetochar(char *str, const Rune *rune)
|
126
|
+
{
|
127
|
+
/* Runes are signed, so convert to unsigned for range check. */
|
128
|
+
unsigned long c;
|
129
|
+
|
130
|
+
/*
|
131
|
+
* one character sequence
|
132
|
+
* 00000-0007F => 00-7F
|
133
|
+
*/
|
134
|
+
c = *rune;
|
135
|
+
if(c <= Rune1) {
|
136
|
+
str[0] = c;
|
137
|
+
return 1;
|
138
|
+
}
|
139
|
+
|
140
|
+
/*
|
141
|
+
* two character sequence
|
142
|
+
* 0080-07FF => T2 Tx
|
143
|
+
*/
|
144
|
+
if(c <= Rune2) {
|
145
|
+
str[0] = T2 | (c >> 1*Bitx);
|
146
|
+
str[1] = Tx | (c & Maskx);
|
147
|
+
return 2;
|
148
|
+
}
|
149
|
+
|
150
|
+
/*
|
151
|
+
* If the Rune is out of range, convert it to the error rune.
|
152
|
+
* Do this test here because the error rune encodes to three bytes.
|
153
|
+
* Doing it earlier would duplicate work, since an out of range
|
154
|
+
* Rune wouldn't have fit in one or two bytes.
|
155
|
+
*/
|
156
|
+
if (c > Runemax)
|
157
|
+
c = Runeerror;
|
158
|
+
|
159
|
+
/*
|
160
|
+
* three character sequence
|
161
|
+
* 0800-FFFF => T3 Tx Tx
|
162
|
+
*/
|
163
|
+
if (c <= Rune3) {
|
164
|
+
str[0] = T3 | (c >> 2*Bitx);
|
165
|
+
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
166
|
+
str[2] = Tx | (c & Maskx);
|
167
|
+
return 3;
|
168
|
+
}
|
169
|
+
|
170
|
+
/*
|
171
|
+
* four character sequence (21-bit value)
|
172
|
+
* 10000-1FFFFF => T4 Tx Tx Tx
|
173
|
+
*/
|
174
|
+
str[0] = T4 | (c >> 3*Bitx);
|
175
|
+
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
176
|
+
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
177
|
+
str[3] = Tx | (c & Maskx);
|
178
|
+
return 4;
|
179
|
+
}
|
180
|
+
|
181
|
+
int
|
182
|
+
runelen(Rune rune)
|
183
|
+
{
|
184
|
+
char str[10];
|
185
|
+
|
186
|
+
return runetochar(str, &rune);
|
187
|
+
}
|
188
|
+
|
189
|
+
int
|
190
|
+
fullrune(const char *str, int n)
|
191
|
+
{
|
192
|
+
if (n > 0) {
|
193
|
+
int c = *(unsigned char*)str;
|
194
|
+
if (c < Tx)
|
195
|
+
return 1;
|
196
|
+
if (n > 1) {
|
197
|
+
if (c < T3)
|
198
|
+
return 1;
|
199
|
+
if (n > 2) {
|
200
|
+
if (c < T4 || n > 3)
|
201
|
+
return 1;
|
202
|
+
}
|
203
|
+
}
|
204
|
+
}
|
205
|
+
return 0;
|
206
|
+
}
|
207
|
+
|
208
|
+
|
209
|
+
int
|
210
|
+
utflen(const char *s)
|
211
|
+
{
|
212
|
+
int c;
|
213
|
+
long n;
|
214
|
+
Rune rune;
|
215
|
+
|
216
|
+
n = 0;
|
217
|
+
for(;;) {
|
218
|
+
c = *(unsigned char*)s;
|
219
|
+
if(c < Runeself) {
|
220
|
+
if(c == 0)
|
221
|
+
return n;
|
222
|
+
s++;
|
223
|
+
} else
|
224
|
+
s += chartorune(&rune, s);
|
225
|
+
n++;
|
226
|
+
}
|
227
|
+
return 0;
|
228
|
+
}
|
229
|
+
|
230
|
+
char*
|
231
|
+
utfrune(const char *s, Rune c)
|
232
|
+
{
|
233
|
+
long c1;
|
234
|
+
Rune r;
|
235
|
+
int n;
|
236
|
+
|
237
|
+
if(c < Runesync) /* not part of utf sequence */
|
238
|
+
return strchr((char*)s, c);
|
239
|
+
|
240
|
+
for(;;) {
|
241
|
+
c1 = *(unsigned char*)s;
|
242
|
+
if(c1 < Runeself) { /* one byte rune */
|
243
|
+
if(c1 == 0)
|
244
|
+
return 0;
|
245
|
+
if(c1 == c)
|
246
|
+
return (char*)s;
|
247
|
+
s++;
|
248
|
+
continue;
|
249
|
+
}
|
250
|
+
n = chartorune(&r, s);
|
251
|
+
if(r == c)
|
252
|
+
return (char*)s;
|
253
|
+
s += n;
|
254
|
+
}
|
255
|
+
return 0;
|
256
|
+
}
|
257
|
+
|
258
|
+
} // namespace re2
|
data/ext/re2/set.cc
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "re2/set.h"
|
6
|
+
|
7
|
+
#include "util/util.h"
|
8
|
+
#include "re2/stringpiece.h"
|
9
|
+
#include "re2/prog.h"
|
10
|
+
#include "re2/re2.h"
|
11
|
+
#include "re2/regexp.h"
|
12
|
+
|
13
|
+
using namespace re2;
|
14
|
+
|
15
|
+
// Builds an empty, not-yet-compiled set.  The options are copied into the
// set; anchor is remembered and handed to Prog::CompileSet by Compile().
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false) {
  options_.Copy(options);
}
|
21
|
+
|
22
|
+
// Drops the reference held on every regexp still queued in re_ (Compile()
// empties that list) and destroys the compiled program, if any.
RE2::Set::~Set() {
  for (vector<re2::Regexp*>::iterator it = re_.begin(); it != re_.end(); ++it)
    (*it)->Decref();
  delete prog_;
}
|
27
|
+
|
28
|
+
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
29
|
+
if (compiled_) {
|
30
|
+
LOG(DFATAL) << "RE2::Set::Add after Compile";
|
31
|
+
return -1;
|
32
|
+
}
|
33
|
+
|
34
|
+
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
35
|
+
options_.ParseFlags());
|
36
|
+
|
37
|
+
RegexpStatus status;
|
38
|
+
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
39
|
+
if (re == NULL) {
|
40
|
+
if (error != NULL)
|
41
|
+
*error = status.Text();
|
42
|
+
if (options_.log_errors())
|
43
|
+
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
|
44
|
+
return -1;
|
45
|
+
}
|
46
|
+
|
47
|
+
// Concatenate with match index and push on vector.
|
48
|
+
int n = re_.size();
|
49
|
+
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
50
|
+
if (re->op() == kRegexpConcat) {
|
51
|
+
int nsub = re->nsub();
|
52
|
+
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
|
53
|
+
for (int i = 0; i < nsub; i++)
|
54
|
+
sub[i] = re->sub()[i]->Incref();
|
55
|
+
sub[nsub] = m;
|
56
|
+
re->Decref();
|
57
|
+
re = re2::Regexp::Concat(sub, nsub + 1, pf);
|
58
|
+
delete[] sub;
|
59
|
+
} else {
|
60
|
+
re2::Regexp* sub[2];
|
61
|
+
sub[0] = re;
|
62
|
+
sub[1] = m;
|
63
|
+
re = re2::Regexp::Concat(sub, 2, pf);
|
64
|
+
}
|
65
|
+
re_.push_back(re);
|
66
|
+
return n;
|
67
|
+
}
|
68
|
+
|
69
|
+
bool RE2::Set::Compile() {
|
70
|
+
if (compiled_) {
|
71
|
+
LOG(DFATAL) << "RE2::Set::Compile multiple times";
|
72
|
+
return false;
|
73
|
+
}
|
74
|
+
compiled_ = true;
|
75
|
+
|
76
|
+
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
77
|
+
options_.ParseFlags());
|
78
|
+
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
|
79
|
+
re_.size(), pf);
|
80
|
+
re_.clear();
|
81
|
+
re2::Regexp* sre = re->Simplify();
|
82
|
+
re->Decref();
|
83
|
+
re = sre;
|
84
|
+
if (re == NULL) {
|
85
|
+
if (options_.log_errors())
|
86
|
+
LOG(ERROR) << "Error simplifying during Compile.";
|
87
|
+
return false;
|
88
|
+
}
|
89
|
+
|
90
|
+
prog_ = Prog::CompileSet(options_, anchor_, re);
|
91
|
+
return prog_ != NULL;
|
92
|
+
}
|
93
|
+
|
94
|
+
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
|
95
|
+
if (!compiled_) {
|
96
|
+
LOG(DFATAL) << "RE2::Set::Match without Compile";
|
97
|
+
return false;
|
98
|
+
}
|
99
|
+
v->clear();
|
100
|
+
bool failed;
|
101
|
+
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
|
102
|
+
Prog::kManyMatch, NULL, &failed, v);
|
103
|
+
if (failed)
|
104
|
+
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
|
105
|
+
|
106
|
+
if (ret == false)
|
107
|
+
return false;
|
108
|
+
if (v->size() == 0) {
|
109
|
+
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
|
110
|
+
return false;
|
111
|
+
}
|
112
|
+
return true;
|
113
|
+
}
|
data/ext/re2/set.h
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef RE2_SET_H
|
6
|
+
#define RE2_SET_H
|
7
|
+
|
8
|
+
#include <utility>
|
9
|
+
#include <vector>
|
10
|
+
|
11
|
+
#include "re2/re2.h"
|
12
|
+
|
13
|
+
namespace re2 {
|
14
|
+
using std::vector;
|
15
|
+
|
16
|
+
// An RE2::Set represents a collection of regexps that can
|
17
|
+
// be searched for simultaneously.
|
18
|
+
class RE2::Set {
|
19
|
+
public:
|
20
|
+
Set(const RE2::Options& options, RE2::Anchor anchor);
|
21
|
+
~Set();
|
22
|
+
|
23
|
+
// Add adds regexp pattern to the set, interpreted using the RE2 options.
|
24
|
+
// (The RE2 constructor's default options parameter is RE2::UTF8.)
|
25
|
+
// Add returns the regexp index that will be used to identify
|
26
|
+
// it in the result of Match, or -1 if the regexp cannot be parsed.
|
27
|
+
// Indices are assigned in sequential order starting from 0.
|
28
|
+
// Error returns do not increment the index.
|
29
|
+
// If an error occurs and error != NULL, *error will hold an error message.
|
30
|
+
int Add(const StringPiece& pattern, string* error);
|
31
|
+
|
32
|
+
// Compile prepares the Set for matching.
|
33
|
+
// Add must not be called again after Compile.
|
34
|
+
// Compile must be called before FullMatch or PartialMatch.
|
35
|
+
// Compile may return false if it runs out of memory.
|
36
|
+
bool Compile();
|
37
|
+
|
38
|
+
// Match returns true if text matches any of the regexps in the set.
|
39
|
+
// If so, it fills v with the indices of the matching regexps.
|
40
|
+
bool Match(const StringPiece& text, vector<int>* v) const;
|
41
|
+
|
42
|
+
private:
|
43
|
+
RE2::Options options_;
|
44
|
+
RE2::Anchor anchor_;
|
45
|
+
vector<re2::Regexp*> re_;
|
46
|
+
re2::Prog* prog_;
|
47
|
+
bool compiled_;
|
48
|
+
//DISALLOW_EVIL_CONSTRUCTORS(Set);
|
49
|
+
Set(const Set&);
|
50
|
+
void operator=(const Set&);
|
51
|
+
};
|
52
|
+
|
53
|
+
} // namespace re2
|
54
|
+
|
55
|
+
#endif // RE2_SET_H
|