chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/rune.cc ADDED
@@ -0,0 +1,258 @@
1
+ /*
2
+ * The authors of this software are Rob Pike and Ken Thompson.
3
+ * Copyright (c) 2002 by Lucent Technologies.
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose without fee is hereby granted, provided that this entire notice
6
+ * is included in all copies of any software which is or includes a copy
7
+ * or modification of this software and in all copies of the supporting
8
+ * documentation for such software.
9
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
+ */
14
+ #include <stdarg.h>
15
+ #include <string.h>
16
+ #include "util/utf.h"
17
+
18
+ namespace re2 {
19
+
20
+ enum
21
+ {
22
+ Bit1 = 7,
23
+ Bitx = 6,
24
+ Bit2 = 5,
25
+ Bit3 = 4,
26
+ Bit4 = 3,
27
+ Bit5 = 2,
28
+
29
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
30
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
31
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
32
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
33
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
34
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
35
+
36
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
37
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
38
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
39
+ Rune4 = (1<<(Bit4+3*Bitx))-1,
40
+ /* 0001 1111 1111 1111 1111 1111 */
41
+
42
+ Maskx = (1<<Bitx)-1, /* 0011 1111 */
43
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
44
+
45
+ Bad = Runeerror,
46
+ };
47
+
48
+ int
49
+ chartorune(Rune *rune, const char *str)
50
+ {
51
+ int c, c1, c2, c3;
52
+ long l;
53
+
54
+ /*
55
+ * one character sequence
56
+ * 00000-0007F => T1
57
+ */
58
+ c = *(unsigned char*)str;
59
+ if(c < Tx) {
60
+ *rune = c;
61
+ return 1;
62
+ }
63
+
64
+ /*
65
+ * two character sequence
66
+ * 0080-07FF => T2 Tx
67
+ */
68
+ c1 = *(unsigned char*)(str+1) ^ Tx;
69
+ if(c1 & Testx)
70
+ goto bad;
71
+ if(c < T3) {
72
+ if(c < T2)
73
+ goto bad;
74
+ l = ((c << Bitx) | c1) & Rune2;
75
+ if(l <= Rune1)
76
+ goto bad;
77
+ *rune = l;
78
+ return 2;
79
+ }
80
+
81
+ /*
82
+ * three character sequence
83
+ * 0800-FFFF => T3 Tx Tx
84
+ */
85
+ c2 = *(unsigned char*)(str+2) ^ Tx;
86
+ if(c2 & Testx)
87
+ goto bad;
88
+ if(c < T4) {
89
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
90
+ if(l <= Rune2)
91
+ goto bad;
92
+ *rune = l;
93
+ return 3;
94
+ }
95
+
96
+ /*
97
+ * four character sequence (21-bit value)
98
+ * 10000-1FFFFF => T4 Tx Tx Tx
99
+ */
100
+ c3 = *(unsigned char*)(str+3) ^ Tx;
101
+ if (c3 & Testx)
102
+ goto bad;
103
+ if (c < T5) {
104
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
105
+ if (l <= Rune3)
106
+ goto bad;
107
+ *rune = l;
108
+ return 4;
109
+ }
110
+
111
+ /*
112
+ * Support for 5-byte or longer UTF-8 would go here, but
113
+ * since we don't have that, we'll just fall through to bad.
114
+ */
115
+
116
+ /*
117
+ * bad decoding
118
+ */
119
+ bad:
120
+ *rune = Bad;
121
+ return 1;
122
+ }
123
+
124
+ int
125
+ runetochar(char *str, const Rune *rune)
126
+ {
127
+ /* Runes are signed, so convert to unsigned for range check. */
128
+ unsigned long c;
129
+
130
+ /*
131
+ * one character sequence
132
+ * 00000-0007F => 00-7F
133
+ */
134
+ c = *rune;
135
+ if(c <= Rune1) {
136
+ str[0] = c;
137
+ return 1;
138
+ }
139
+
140
+ /*
141
+ * two character sequence
142
+ * 0080-07FF => T2 Tx
143
+ */
144
+ if(c <= Rune2) {
145
+ str[0] = T2 | (c >> 1*Bitx);
146
+ str[1] = Tx | (c & Maskx);
147
+ return 2;
148
+ }
149
+
150
+ /*
151
+ * If the Rune is out of range, convert it to the error rune.
152
+ * Do this test here because the error rune encodes to three bytes.
153
+ * Doing it earlier would duplicate work, since an out of range
154
+ * Rune wouldn't have fit in one or two bytes.
155
+ */
156
+ if (c > Runemax)
157
+ c = Runeerror;
158
+
159
+ /*
160
+ * three character sequence
161
+ * 0800-FFFF => T3 Tx Tx
162
+ */
163
+ if (c <= Rune3) {
164
+ str[0] = T3 | (c >> 2*Bitx);
165
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
166
+ str[2] = Tx | (c & Maskx);
167
+ return 3;
168
+ }
169
+
170
+ /*
171
+ * four character sequence (21-bit value)
172
+ * 10000-1FFFFF => T4 Tx Tx Tx
173
+ */
174
+ str[0] = T4 | (c >> 3*Bitx);
175
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
176
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
177
+ str[3] = Tx | (c & Maskx);
178
+ return 4;
179
+ }
180
+
181
+ int
182
+ runelen(Rune rune)
183
+ {
184
+ char str[10];
185
+
186
+ return runetochar(str, &rune);
187
+ }
188
+
189
+ int
190
+ fullrune(const char *str, int n)
191
+ {
192
+ if (n > 0) {
193
+ int c = *(unsigned char*)str;
194
+ if (c < Tx)
195
+ return 1;
196
+ if (n > 1) {
197
+ if (c < T3)
198
+ return 1;
199
+ if (n > 2) {
200
+ if (c < T4 || n > 3)
201
+ return 1;
202
+ }
203
+ }
204
+ }
205
+ return 0;
206
+ }
207
+
208
+
209
+ int
210
+ utflen(const char *s)
211
+ {
212
+ int c;
213
+ long n;
214
+ Rune rune;
215
+
216
+ n = 0;
217
+ for(;;) {
218
+ c = *(unsigned char*)s;
219
+ if(c < Runeself) {
220
+ if(c == 0)
221
+ return n;
222
+ s++;
223
+ } else
224
+ s += chartorune(&rune, s);
225
+ n++;
226
+ }
227
+ return 0;
228
+ }
229
+
230
+ char*
231
+ utfrune(const char *s, Rune c)
232
+ {
233
+ long c1;
234
+ Rune r;
235
+ int n;
236
+
237
+ if(c < Runesync) /* not part of utf sequence */
238
+ return strchr((char*)s, c);
239
+
240
+ for(;;) {
241
+ c1 = *(unsigned char*)s;
242
+ if(c1 < Runeself) { /* one byte rune */
243
+ if(c1 == 0)
244
+ return 0;
245
+ if(c1 == c)
246
+ return (char*)s;
247
+ s++;
248
+ continue;
249
+ }
250
+ n = chartorune(&r, s);
251
+ if(r == c)
252
+ return (char*)s;
253
+ s += n;
254
+ }
255
+ return 0;
256
+ }
257
+
258
+ } // namespace re2
data/ext/re2/set.cc ADDED
@@ -0,0 +1,113 @@
1
+ // Copyright 2010 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "re2/set.h"
6
+
7
+ #include "util/util.h"
8
+ #include "re2/stringpiece.h"
9
+ #include "re2/prog.h"
10
+ #include "re2/re2.h"
11
+ #include "re2/regexp.h"
12
+
13
+ using namespace re2;
14
+
15
+ RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
16
+ options_.Copy(options);
17
+ anchor_ = anchor;
18
+ prog_ = NULL;
19
+ compiled_ = false;
20
+ }
21
+
22
+ RE2::Set::~Set() {
23
+ for (int i = 0; i < re_.size(); i++)
24
+ re_[i]->Decref();
25
+ delete prog_;
26
+ }
27
+
28
+ int RE2::Set::Add(const StringPiece& pattern, string* error) {
29
+ if (compiled_) {
30
+ LOG(DFATAL) << "RE2::Set::Add after Compile";
31
+ return -1;
32
+ }
33
+
34
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
35
+ options_.ParseFlags());
36
+
37
+ RegexpStatus status;
38
+ re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
39
+ if (re == NULL) {
40
+ if (error != NULL)
41
+ *error = status.Text();
42
+ if (options_.log_errors())
43
+ LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
44
+ return -1;
45
+ }
46
+
47
+ // Concatenate with match index and push on vector.
48
+ int n = re_.size();
49
+ re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
50
+ if (re->op() == kRegexpConcat) {
51
+ int nsub = re->nsub();
52
+ re2::Regexp** sub = new re2::Regexp*[nsub + 1];
53
+ for (int i = 0; i < nsub; i++)
54
+ sub[i] = re->sub()[i]->Incref();
55
+ sub[nsub] = m;
56
+ re->Decref();
57
+ re = re2::Regexp::Concat(sub, nsub + 1, pf);
58
+ delete[] sub;
59
+ } else {
60
+ re2::Regexp* sub[2];
61
+ sub[0] = re;
62
+ sub[1] = m;
63
+ re = re2::Regexp::Concat(sub, 2, pf);
64
+ }
65
+ re_.push_back(re);
66
+ return n;
67
+ }
68
+
69
+ bool RE2::Set::Compile() {
70
+ if (compiled_) {
71
+ LOG(DFATAL) << "RE2::Set::Compile multiple times";
72
+ return false;
73
+ }
74
+ compiled_ = true;
75
+
76
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
77
+ options_.ParseFlags());
78
+ re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
79
+ re_.size(), pf);
80
+ re_.clear();
81
+ re2::Regexp* sre = re->Simplify();
82
+ re->Decref();
83
+ re = sre;
84
+ if (re == NULL) {
85
+ if (options_.log_errors())
86
+ LOG(ERROR) << "Error simplifying during Compile.";
87
+ return false;
88
+ }
89
+
90
+ prog_ = Prog::CompileSet(options_, anchor_, re);
91
+ return prog_ != NULL;
92
+ }
93
+
94
+ bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
95
+ if (!compiled_) {
96
+ LOG(DFATAL) << "RE2::Set::Match without Compile";
97
+ return false;
98
+ }
99
+ v->clear();
100
+ bool failed;
101
+ bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
102
+ Prog::kManyMatch, NULL, &failed, v);
103
+ if (failed)
104
+ LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
105
+
106
+ if (ret == false)
107
+ return false;
108
+ if (v->size() == 0) {
109
+ LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
110
+ return false;
111
+ }
112
+ return true;
113
+ }
data/ext/re2/set.h ADDED
@@ -0,0 +1,55 @@
1
+ // Copyright 2010 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_SET_H
6
+ #define RE2_SET_H
7
+
8
+ #include <utility>
9
+ #include <vector>
10
+
11
+ #include "re2/re2.h"
12
+
13
+ namespace re2 {
14
+ using std::vector;
15
+
16
+ // An RE2::Set represents a collection of regexps that can
17
+ // be searched for simultaneously.
18
+ class RE2::Set {
19
+ public:
20
+ Set(const RE2::Options& options, RE2::Anchor anchor);
21
+ ~Set();
22
+
23
+ // Add adds regexp pattern to the set, interpreted using the RE2 options.
24
+ // (The RE2 constructor's default options parameter is RE2::UTF8.)
25
+ // Add returns the regexp index that will be used to identify
26
+ // it in the result of Match, or -1 if the regexp cannot be parsed.
27
+ // Indices are assigned in sequential order starting from 0.
28
+ // Error returns do not increment the index.
29
+ // If an error occurs and error != NULL, *error will hold an error message.
30
+ int Add(const StringPiece& pattern, string* error);
31
+
32
+ // Compile prepares the Set for matching.
33
+ // Add must not be called again after Compile.
34
+ // Compile must be called before FullMatch or PartialMatch.
35
+ // Compile may return false if it runs out of memory.
36
+ bool Compile();
37
+
38
+ // Match returns true if text matches any of the regexps in the set.
39
+ // If so, it fills v with the indices of the matching regexps.
40
+ bool Match(const StringPiece& text, vector<int>* v) const;
41
+
42
+ private:
43
+ RE2::Options options_;
44
+ RE2::Anchor anchor_;
45
+ vector<re2::Regexp*> re_;
46
+ re2::Prog* prog_;
47
+ bool compiled_;
48
+ //DISALLOW_EVIL_CONSTRUCTORS(Set);
49
+ Set(const Set&);
50
+ void operator=(const Set&);
51
+ };
52
+
53
+ } // namespace re2
54
+
55
+ #endif // RE2_SET_H