chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/rune.cc ADDED
@@ -0,0 +1,258 @@
1
+ /*
2
+ * The authors of this software are Rob Pike and Ken Thompson.
3
+ * Copyright (c) 2002 by Lucent Technologies.
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose without fee is hereby granted, provided that this entire notice
6
+ * is included in all copies of any software which is or includes a copy
7
+ * or modification of this software and in all copies of the supporting
8
+ * documentation for such software.
9
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
+ */
14
+ #include <stdarg.h>
15
+ #include <string.h>
16
+ #include "util/utf.h"
17
+
18
+ namespace re2 {
19
+
20
+ enum
21
+ {
22
+ Bit1 = 7,
23
+ Bitx = 6,
24
+ Bit2 = 5,
25
+ Bit3 = 4,
26
+ Bit4 = 3,
27
+ Bit5 = 2,
28
+
29
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
30
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
31
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
32
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
33
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
34
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
35
+
36
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
37
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
38
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
39
+ Rune4 = (1<<(Bit4+3*Bitx))-1,
40
+ /* 0001 1111 1111 1111 1111 1111 */
41
+
42
+ Maskx = (1<<Bitx)-1, /* 0011 1111 */
43
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
44
+
45
+ Bad = Runeerror,
46
+ };
47
+
48
+ int
49
+ chartorune(Rune *rune, const char *str)
50
+ {
51
+ int c, c1, c2, c3;
52
+ long l;
53
+
54
+ /*
55
+ * one character sequence
56
+ * 00000-0007F => T1
57
+ */
58
+ c = *(unsigned char*)str;
59
+ if(c < Tx) {
60
+ *rune = c;
61
+ return 1;
62
+ }
63
+
64
+ /*
65
+ * two character sequence
66
+ * 0080-07FF => T2 Tx
67
+ */
68
+ c1 = *(unsigned char*)(str+1) ^ Tx;
69
+ if(c1 & Testx)
70
+ goto bad;
71
+ if(c < T3) {
72
+ if(c < T2)
73
+ goto bad;
74
+ l = ((c << Bitx) | c1) & Rune2;
75
+ if(l <= Rune1)
76
+ goto bad;
77
+ *rune = l;
78
+ return 2;
79
+ }
80
+
81
+ /*
82
+ * three character sequence
83
+ * 0800-FFFF => T3 Tx Tx
84
+ */
85
+ c2 = *(unsigned char*)(str+2) ^ Tx;
86
+ if(c2 & Testx)
87
+ goto bad;
88
+ if(c < T4) {
89
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
90
+ if(l <= Rune2)
91
+ goto bad;
92
+ *rune = l;
93
+ return 3;
94
+ }
95
+
96
+ /*
97
+ * four character sequence (21-bit value)
98
+ * 10000-1FFFFF => T4 Tx Tx Tx
99
+ */
100
+ c3 = *(unsigned char*)(str+3) ^ Tx;
101
+ if (c3 & Testx)
102
+ goto bad;
103
+ if (c < T5) {
104
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
105
+ if (l <= Rune3)
106
+ goto bad;
107
+ *rune = l;
108
+ return 4;
109
+ }
110
+
111
+ /*
112
+ * Support for 5-byte or longer UTF-8 would go here, but
113
+ * since we don't have that, we'll just fall through to bad.
114
+ */
115
+
116
+ /*
117
+ * bad decoding
118
+ */
119
+ bad:
120
+ *rune = Bad;
121
+ return 1;
122
+ }
123
+
124
+ int
125
+ runetochar(char *str, const Rune *rune)
126
+ {
127
+ /* Runes are signed, so convert to unsigned for range check. */
128
+ unsigned long c;
129
+
130
+ /*
131
+ * one character sequence
132
+ * 00000-0007F => 00-7F
133
+ */
134
+ c = *rune;
135
+ if(c <= Rune1) {
136
+ str[0] = c;
137
+ return 1;
138
+ }
139
+
140
+ /*
141
+ * two character sequence
142
+ * 0080-07FF => T2 Tx
143
+ */
144
+ if(c <= Rune2) {
145
+ str[0] = T2 | (c >> 1*Bitx);
146
+ str[1] = Tx | (c & Maskx);
147
+ return 2;
148
+ }
149
+
150
+ /*
151
+ * If the Rune is out of range, convert it to the error rune.
152
+ * Do this test here because the error rune encodes to three bytes.
153
+ * Doing it earlier would duplicate work, since an out of range
154
+ * Rune wouldn't have fit in one or two bytes.
155
+ */
156
+ if (c > Runemax)
157
+ c = Runeerror;
158
+
159
+ /*
160
+ * three character sequence
161
+ * 0800-FFFF => T3 Tx Tx
162
+ */
163
+ if (c <= Rune3) {
164
+ str[0] = T3 | (c >> 2*Bitx);
165
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
166
+ str[2] = Tx | (c & Maskx);
167
+ return 3;
168
+ }
169
+
170
+ /*
171
+ * four character sequence (21-bit value)
172
+ * 10000-1FFFFF => T4 Tx Tx Tx
173
+ */
174
+ str[0] = T4 | (c >> 3*Bitx);
175
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
176
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
177
+ str[3] = Tx | (c & Maskx);
178
+ return 4;
179
+ }
180
+
181
+ int
182
+ runelen(Rune rune)
183
+ {
184
+ char str[10];
185
+
186
+ return runetochar(str, &rune);
187
+ }
188
+
189
+ int
190
+ fullrune(const char *str, int n)
191
+ {
192
+ if (n > 0) {
193
+ int c = *(unsigned char*)str;
194
+ if (c < Tx)
195
+ return 1;
196
+ if (n > 1) {
197
+ if (c < T3)
198
+ return 1;
199
+ if (n > 2) {
200
+ if (c < T4 || n > 3)
201
+ return 1;
202
+ }
203
+ }
204
+ }
205
+ return 0;
206
+ }
207
+
208
+
209
+ int
210
+ utflen(const char *s)
211
+ {
212
+ int c;
213
+ long n;
214
+ Rune rune;
215
+
216
+ n = 0;
217
+ for(;;) {
218
+ c = *(unsigned char*)s;
219
+ if(c < Runeself) {
220
+ if(c == 0)
221
+ return n;
222
+ s++;
223
+ } else
224
+ s += chartorune(&rune, s);
225
+ n++;
226
+ }
227
+ return 0;
228
+ }
229
+
230
+ char*
231
+ utfrune(const char *s, Rune c)
232
+ {
233
+ long c1;
234
+ Rune r;
235
+ int n;
236
+
237
+ if(c < Runesync) /* not part of utf sequence */
238
+ return strchr((char*)s, c);
239
+
240
+ for(;;) {
241
+ c1 = *(unsigned char*)s;
242
+ if(c1 < Runeself) { /* one byte rune */
243
+ if(c1 == 0)
244
+ return 0;
245
+ if(c1 == c)
246
+ return (char*)s;
247
+ s++;
248
+ continue;
249
+ }
250
+ n = chartorune(&r, s);
251
+ if(r == c)
252
+ return (char*)s;
253
+ s += n;
254
+ }
255
+ return 0;
256
+ }
257
+
258
+ } // namespace re2
data/ext/re2/set.cc ADDED
@@ -0,0 +1,113 @@
1
+ // Copyright 2010 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "re2/set.h"
6
+
7
+ #include "util/util.h"
8
+ #include "re2/stringpiece.h"
9
+ #include "re2/prog.h"
10
+ #include "re2/re2.h"
11
+ #include "re2/regexp.h"
12
+
13
+ using namespace re2;
14
+
15
+ RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
16
+ options_.Copy(options);
17
+ anchor_ = anchor;
18
+ prog_ = NULL;
19
+ compiled_ = false;
20
+ }
21
+
22
+ RE2::Set::~Set() {
23
+ for (int i = 0; i < re_.size(); i++)
24
+ re_[i]->Decref();
25
+ delete prog_;
26
+ }
27
+
28
+ int RE2::Set::Add(const StringPiece& pattern, string* error) {
29
+ if (compiled_) {
30
+ LOG(DFATAL) << "RE2::Set::Add after Compile";
31
+ return -1;
32
+ }
33
+
34
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
35
+ options_.ParseFlags());
36
+
37
+ RegexpStatus status;
38
+ re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
39
+ if (re == NULL) {
40
+ if (error != NULL)
41
+ *error = status.Text();
42
+ if (options_.log_errors())
43
+ LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
44
+ return -1;
45
+ }
46
+
47
+ // Concatenate with match index and push on vector.
48
+ int n = re_.size();
49
+ re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
50
+ if (re->op() == kRegexpConcat) {
51
+ int nsub = re->nsub();
52
+ re2::Regexp** sub = new re2::Regexp*[nsub + 1];
53
+ for (int i = 0; i < nsub; i++)
54
+ sub[i] = re->sub()[i]->Incref();
55
+ sub[nsub] = m;
56
+ re->Decref();
57
+ re = re2::Regexp::Concat(sub, nsub + 1, pf);
58
+ delete[] sub;
59
+ } else {
60
+ re2::Regexp* sub[2];
61
+ sub[0] = re;
62
+ sub[1] = m;
63
+ re = re2::Regexp::Concat(sub, 2, pf);
64
+ }
65
+ re_.push_back(re);
66
+ return n;
67
+ }
68
+
69
+ bool RE2::Set::Compile() {
70
+ if (compiled_) {
71
+ LOG(DFATAL) << "RE2::Set::Compile multiple times";
72
+ return false;
73
+ }
74
+ compiled_ = true;
75
+
76
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
77
+ options_.ParseFlags());
78
+ re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
79
+ re_.size(), pf);
80
+ re_.clear();
81
+ re2::Regexp* sre = re->Simplify();
82
+ re->Decref();
83
+ re = sre;
84
+ if (re == NULL) {
85
+ if (options_.log_errors())
86
+ LOG(ERROR) << "Error simplifying during Compile.";
87
+ return false;
88
+ }
89
+
90
+ prog_ = Prog::CompileSet(options_, anchor_, re);
91
+ return prog_ != NULL;
92
+ }
93
+
94
+ bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
95
+ if (!compiled_) {
96
+ LOG(DFATAL) << "RE2::Set::Match without Compile";
97
+ return false;
98
+ }
99
+ v->clear();
100
+ bool failed;
101
+ bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
102
+ Prog::kManyMatch, NULL, &failed, v);
103
+ if (failed)
104
+ LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
105
+
106
+ if (ret == false)
107
+ return false;
108
+ if (v->size() == 0) {
109
+ LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
110
+ return false;
111
+ }
112
+ return true;
113
+ }
data/ext/re2/set.h ADDED
@@ -0,0 +1,55 @@
1
+ // Copyright 2010 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_SET_H
6
+ #define RE2_SET_H
7
+
8
+ #include <utility>
9
+ #include <vector>
10
+
11
+ #include "re2/re2.h"
12
+
13
+ namespace re2 {
14
+ using std::vector;
15
+
16
+ // An RE2::Set represents a collection of regexps that can
17
+ // be searched for simultaneously.
18
+ class RE2::Set {
19
+ public:
20
+ Set(const RE2::Options& options, RE2::Anchor anchor);
21
+ ~Set();
22
+
23
+ // Add adds regexp pattern to the set, interpreted using the RE2 options.
24
+ // (The RE2 constructor's default options parameter is RE2::UTF8.)
25
+ // Add returns the regexp index that will be used to identify
26
+ // it in the result of Match, or -1 if the regexp cannot be parsed.
27
+ // Indices are assigned in sequential order starting from 0.
28
+ // Error returns do not increment the index.
29
+ // If an error occurs and error != NULL, *error will hold an error message.
30
+ int Add(const StringPiece& pattern, string* error);
31
+
32
+ // Compile prepares the Set for matching.
33
+ // Add must not be called again after Compile.
34
+ // Compile must be called before FullMatch or PartialMatch.
35
+ // Compile may return false if it runs out of memory.
36
+ bool Compile();
37
+
38
+ // Match returns true if text matches any of the regexps in the set.
39
+ // If so, it fills v with the indices of the matching regexps.
40
+ bool Match(const StringPiece& text, vector<int>* v) const;
41
+
42
+ private:
43
+ RE2::Options options_;
44
+ RE2::Anchor anchor_;
45
+ vector<re2::Regexp*> re_;
46
+ re2::Prog* prog_;
47
+ bool compiled_;
48
+ //DISALLOW_EVIL_CONSTRUCTORS(Set);
49
+ Set(const Set&);
50
+ void operator=(const Set&);
51
+ };
52
+
53
+ } // namespace re2
54
+
55
+ #endif // RE2_SET_H