chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/re2.h ADDED
@@ -0,0 +1,837 @@
1
+ // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_RE2_H
6
+ #define RE2_RE2_H
7
+
8
+ // C++ interface to the re2 regular-expression library.
9
+ // RE2 supports Perl-style regular expressions (with extensions like
10
+ // \d, \w, \s, ...).
11
+ //
12
+ // -----------------------------------------------------------------------
13
+ // REGEXP SYNTAX:
14
+ //
15
+ // This module uses the re2 library and hence supports
16
+ // its syntax for regular expressions, which is similar to Perl's with
17
+ // some of the more complicated things thrown away. In particular,
18
+ // backreferences and generalized assertions are not available, nor is \Z.
19
+ //
20
+ // See http://code.google.com/p/re2/wiki/Syntax for the syntax
21
+ // supported by RE2, and a comparison with PCRE and PERL regexps.
22
+ //
23
+ // For those not familiar with Perl's regular expressions,
24
+ // here are some examples of the most commonly used extensions:
25
+ //
26
+ // "hello (\\w+) world" -- \w matches a "word" character
27
+ // "version (\\d+)" -- \d matches a digit
28
+ // "hello\\s+world" -- \s matches any whitespace character
29
+ // "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
30
+ // "(?i)hello" -- (?i) turns on case-insensitive matching
31
+ // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
32
+ //
33
+ // -----------------------------------------------------------------------
34
+ // MATCHING INTERFACE:
35
+ //
36
+ // The "FullMatch" operation checks that supplied text matches a
37
+ // supplied pattern exactly.
38
+ //
39
+ // Example: successful match
40
+ // CHECK(RE2::FullMatch("hello", "h.*o"));
41
+ //
42
+ // Example: unsuccessful match (requires full match):
43
+ // CHECK(!RE2::FullMatch("hello", "e"));
44
+ //
45
+ // -----------------------------------------------------------------------
46
+ // UTF-8 AND THE MATCHING INTERFACE:
47
+ //
48
+ // By default, the pattern and input text are interpreted as UTF-8.
49
+ // The RE2::Latin1 option causes them to be interpreted as Latin-1.
50
+ //
51
+ // Example:
52
+ // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
53
+ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
54
+ //
55
+ // -----------------------------------------------------------------------
56
+ // MATCHING WITH SUB-STRING EXTRACTION:
57
+ //
58
+ // You can supply extra pointer arguments to extract matched subpieces.
59
+ //
60
+ // Example: extracts "ruby" into "s" and 1234 into "i"
61
+ // int i;
62
+ // string s;
63
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
64
+ //
65
+ // Example: fails because string cannot be stored in integer
66
+ // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
67
+ //
68
+ // Example: fails because there aren't enough sub-patterns:
69
+ // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
70
+ //
71
+ // Example: does not try to extract any extra sub-patterns
72
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
73
+ //
74
+ // Example: does not try to extract into NULL
75
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
76
+ //
77
+ // Example: integer overflow causes failure
78
+ // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
79
+ //
80
+ // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
81
+ // This may get a little faster in the future, but right now is slower
82
+ // than PCRE. On the other hand, failed matches run *very* fast (faster
83
+ // than PCRE), as do matches without substring extraction.
84
+ //
85
+ // -----------------------------------------------------------------------
86
+ // PARTIAL MATCHES
87
+ //
88
+ // You can use the "PartialMatch" operation when you want the pattern
89
+ // to match any substring of the text.
90
+ //
91
+ // Example: simple search for a string:
92
+ // CHECK(RE2::PartialMatch("hello", "ell"));
93
+ //
94
+ // Example: find first number in a string
95
+ // int number;
96
+ // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
97
+ // CHECK_EQ(number, 100);
98
+ //
99
+ // -----------------------------------------------------------------------
100
+ // PRE-COMPILED REGULAR EXPRESSIONS
101
+ //
102
+ // RE2 makes it easy to use any string as a regular expression, without
103
+ // requiring a separate compilation step.
104
+ //
105
+ // If speed is of the essence, you can create a pre-compiled "RE2"
106
+ // object from the pattern and use it multiple times. If you do so,
107
+ // you can typically parse text faster than with sscanf.
108
+ //
109
+ // Example: precompile pattern for faster matching:
110
+ // RE2 pattern("h.*o");
111
+ // while (ReadLine(&str)) {
112
+ // if (RE2::FullMatch(str, pattern)) ...;
113
+ // }
114
+ //
115
+ // -----------------------------------------------------------------------
116
+ // SCANNING TEXT INCREMENTALLY
117
+ //
118
+ // The "Consume" operation may be useful if you want to repeatedly
119
+ // match regular expressions at the front of a string and skip over
120
+ // them as they match. This requires use of the "StringPiece" type,
121
+ // which represents a sub-range of a real string.
122
+ //
123
+ // Example: read lines of the form "var = value" from a string.
124
+ // string contents = ...; // Fill string somehow
125
+ // StringPiece input(contents); // Wrap a StringPiece around it
126
+ //
127
+ // string var;
128
+ // int value;
129
+ // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
130
+ // ...;
131
+ // }
132
+ //
133
+ // Each successful call to "Consume" will set "var/value", and also
134
+ // advance "input" so it points past the matched text. Note that if the
135
+ // regular expression matches an empty string, input will advance
136
+ // by 0 bytes. If the regular expression being used might match
137
+ // an empty string, the loop body must check for this case and either
138
+ // advance the string or break out of the loop.
139
+ //
140
+ // The "FindAndConsume" operation is similar to "Consume" but does not
141
+ // anchor your match at the beginning of the string. For example, you
142
+ // could extract all words from a string by repeatedly calling
143
+ // RE2::FindAndConsume(&input, "(\\w+)", &word)
144
+ //
145
+ // -----------------------------------------------------------------------
146
+ // USING VARIABLE NUMBER OF ARGUMENTS
147
+ //
148
+ // The above operations require you to know the number of arguments
149
+ // when you write the code. This is not always possible or easy (for
150
+ // example, the regular expression may be calculated at run time).
151
+ // You can use the "N" version of the operations when the number of
152
+ // match arguments is determined at run time.
153
+ //
154
+ // Example:
155
+ // const RE2::Arg* args[10];
156
+ // int n;
157
+ // // ... populate args with pointers to RE2::Arg values ...
158
+ // // ... set n to the number of RE2::Arg objects ...
159
+ // bool match = RE2::FullMatchN(input, pattern, args, n);
160
+ //
161
+ // The last statement is equivalent to
162
+ //
163
+ // bool match = RE2::FullMatch(input, pattern,
164
+ // *args[0], *args[1], ..., *args[n - 1]);
165
+ //
166
+ // -----------------------------------------------------------------------
167
+ // PARSING HEX/OCTAL/C-RADIX NUMBERS
168
+ //
169
+ // By default, if you pass a pointer to a numeric value, the
170
+ // corresponding text is interpreted as a base-10 number. You can
171
+ // instead wrap the pointer with a call to one of the operators Hex(),
172
+ // Octal(), or CRadix() to interpret the text in another base. The
173
+ // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
174
+ // prefixes, but defaults to base-10.
175
+ //
176
+ // Example:
177
+ // int a, b, c, d;
178
+ // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
179
+ // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
180
+ // will leave 64 in a, b, c, and d.
181
+
182
+
183
+ #include <stdint.h>
184
+ #include <map>
185
+ #include <string>
186
+ #include "re2/stringpiece.h"
187
+ #include "re2/variadic_function.h"
188
+
189
+ namespace re2 {
190
+ using std::string;
191
+ using std::map;
192
+ class Mutex;
193
+ class Prog;
194
+ class Regexp;
195
+
196
+ // Interface for regular expression matching. Also corresponds to a
197
+ // pre-compiled regular expression. An "RE2" object is safe for
198
+ // concurrent use by multiple threads.
199
+ class RE2 {
200
+ public:
201
+ // We convert user-passed pointers into special Arg objects
202
+ class Arg;
203
+ class Options;
204
+
205
+ // Defined in set.h.
206
+ class Set;
207
+
208
+ enum ErrorCode {
209
+ NoError = 0,
210
+
211
+ // Unexpected error
212
+ ErrorInternal,
213
+
214
+ // Parse errors
215
+ ErrorBadEscape, // bad escape sequence
216
+ ErrorBadCharClass, // bad character class
217
+ ErrorBadCharRange, // bad character class range
218
+ ErrorMissingBracket, // missing closing ]
219
+ ErrorMissingParen, // missing closing )
220
+ ErrorTrailingBackslash, // trailing \ at end of regexp
221
+ ErrorRepeatArgument, // repeat argument missing, e.g. "*"
222
+ ErrorRepeatSize, // bad repetition argument
223
+ ErrorRepeatOp, // bad repetition operator
224
+ ErrorBadPerlOp, // bad perl operator
225
+ ErrorBadUTF8, // invalid UTF-8 in regexp
226
+ ErrorBadNamedCapture, // bad named capture group
227
+ ErrorPatternTooLarge, // pattern too large (compile failed)
228
+ };
229
+
230
+ // Predefined common options.
231
+ // If you need more complicated things, instantiate
232
+ // an Options object, change the settings, and pass it to the
233
+ // RE2 constructor.
234
+ static const Options DefaultOptions;
235
+ static const Options Latin1; // treat input as Latin-1 (default UTF-8)
236
+ static const Options POSIX; // POSIX syntax, leftmost-longest match
237
+ static const Options Quiet; // do not log about regexp parse errors
238
+
239
+ // Need to have the const char* and const string& forms for implicit
240
+ // conversions when passing string literals to FullMatch and PartialMatch.
241
+ // Otherwise the StringPiece form would be sufficient.
242
+ #ifndef SWIG
243
+ RE2(const char* pattern);
244
+ RE2(const string& pattern);
245
+ #endif
246
+ RE2(const StringPiece& pattern);
247
+ RE2(const StringPiece& pattern, const Options& option);
248
+ ~RE2();
249
+
250
+ // Returns whether RE2 was created properly.
251
+ bool ok() const { return error_code() == NoError; }
252
+
253
+ // The string specification for this RE2. E.g.
254
+ // RE2 re("ab*c?d+");
255
+ // re.pattern(); // "ab*c?d+"
256
+ const string& pattern() const { return pattern_; }
257
+
258
+ // If RE2 could not be created properly, returns an error string.
259
+ // Else returns the empty string.
260
+ const string& error() const { return *error_; }
261
+
262
+ // If RE2 could not be created properly, returns an error code.
263
+ // Else returns RE2::NoError (== 0).
264
+ ErrorCode error_code() const { return error_code_; }
265
+
266
+ // If RE2 could not be created properly, returns the offending
267
+ // portion of the regexp.
268
+ const string& error_arg() const { return error_arg_; }
269
+
270
+ // Returns the program size, a very approximate measure of a regexp's "cost".
271
+ // Larger numbers are more expensive than smaller numbers.
272
+ int ProgramSize() const;
273
+
274
+ // Returns the underlying Regexp; not for general use.
275
+ // Returns entire_regexp_ so that callers don't need
276
+ // to know about prefix_ and prefix_foldcase_.
277
+ re2::Regexp* Regexp() const { return entire_regexp_; }
278
+
279
+ /***** The useful part: the matching interface *****/
280
+
281
+ // Matches "text" against "pattern". If pointer arguments are
282
+ // supplied, copies matched sub-patterns into them.
283
+ //
284
+ // You can pass in a "const char*" or a "string" for "text".
285
+ // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
286
+ //
287
+ // The provided pointer arguments can be pointers to any scalar numeric
288
+ // type, or one of:
289
+ // string (matched piece is copied to string)
290
+ // StringPiece (StringPiece is mutated to point to matched piece)
291
+ // T (where "bool T::ParseFrom(const char*, int)" exists)
292
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
293
+ //
294
+ // Returns true iff all of the following conditions are satisfied:
295
+ // a. "text" matches "pattern" exactly
296
+ // b. The number of matched sub-patterns is >= number of supplied pointers
297
+ // c. The "i"th argument has a suitable type for holding the
298
+ // string captured as the "i"th sub-pattern. If you pass in
299
+ // NULL for the "i"th argument, or pass fewer arguments than
300
+ // number of sub-patterns, "i"th captured sub-pattern is
301
+ // ignored.
302
+ //
303
+ // CAVEAT: An optional sub-pattern that does not exist in the
304
+ // matched string is assigned the empty string. Therefore, the
305
+ // following will return false (because the empty string is not a
306
+ // valid number):
307
+ // int number;
308
+ // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
309
+ static bool FullMatchN(const StringPiece& text, const RE2& re,
310
+ const Arg* const args[], int argc);
311
+ static const VariadicFunction2<
312
+ bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
313
+
314
+ // Exactly like FullMatch(), except that "pattern" is allowed to match
315
+ // a substring of "text".
316
+ static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
317
+ const Arg* const args[], int argc);
318
+ static const VariadicFunction2<
319
+ bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
320
+
321
+ // Like FullMatch() and PartialMatch(), except that pattern has to
322
+ // match a prefix of "text", and "input" is advanced past the matched
323
+ // text. Note: "input" is modified iff this routine returns true.
324
+ static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
325
+ const Arg* const args[], int argc);
326
+ static const VariadicFunction2<
327
+ bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
328
+
329
+ // Like Consume(..), but does not anchor the match at the beginning of the
330
+ // string. That is, "pattern" need not start its match at the beginning of
331
+ // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
332
+ // word in "s" and stores it in "word".
333
+ static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
334
+ const Arg* const args[], int argc);
335
+ static const VariadicFunction2<
336
+ bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
337
+
338
+ // Replace the first match of "pattern" in "str" with "rewrite".
339
+ // Within "rewrite", backslash-escaped digits (\1 to \9) can be
340
+ // used to insert text matching corresponding parenthesized group
341
+ // from the pattern. \0 in "rewrite" refers to the entire matching
342
+ // text. E.g.,
343
+ //
344
+ // string s = "yabba dabba doo";
345
+ // CHECK(RE2::Replace(&s, "b+", "d"));
346
+ //
347
+ // will leave "s" containing "yada dabba doo"
348
+ //
349
+ // Returns true if the pattern matches and a replacement occurs,
350
+ // false otherwise.
351
+ static bool Replace(string *str,
352
+ const RE2& pattern,
353
+ const StringPiece& rewrite);
354
+
355
+ // Like Replace(), except replaces successive non-overlapping occurrences
356
+ // of the pattern in the string with the rewrite. E.g.
357
+ //
358
+ // string s = "yabba dabba doo";
359
+ // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
360
+ //
361
+ // will leave "s" containing "yada dada doo"
362
+ // Replacements are not subject to re-matching.
363
+ //
364
+ // Because GlobalReplace only replaces non-overlapping matches,
365
+ // replacing "ana" within "banana" makes only one replacement, not two.
366
+ //
367
+ // Returns the number of replacements made.
368
+ static int GlobalReplace(string *str,
369
+ const RE2& pattern,
370
+ const StringPiece& rewrite);
371
+
372
+ // Like Replace, except that if the pattern matches, "rewrite"
373
+ // is copied into "out" with substitutions. The non-matching
374
+ // portions of "text" are ignored.
375
+ //
376
+ // Returns true iff a match occurred and the extraction happened
377
+ // successfully; if no match occurs, the string is left unaffected.
378
+ static bool Extract(const StringPiece &text,
379
+ const RE2& pattern,
380
+ const StringPiece &rewrite,
381
+ string *out);
382
+
383
+ // Escapes all potentially meaningful regexp characters in
384
+ // 'unquoted'. The returned string, used as a regular expression,
385
+ // will exactly match the original string. For example,
386
+ // 1.5-2.0?
387
+ // may become:
388
+ // 1\.5\-2\.0\?
389
+ static string QuoteMeta(const StringPiece& unquoted);
390
+
391
+ // Computes range for any strings matching regexp. The min and max can in
392
+ // some cases be arbitrarily precise, so the caller gets to specify the
393
+ // maximum desired length of string returned.
394
+ //
395
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
396
+ // string s that is an anchored match for this regexp satisfies
397
+ // min <= s && s <= max.
398
+ //
399
+ // Note that PossibleMatchRange() will only consider the first copy of an
400
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
401
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
402
+ // do not compile down to infinite repetitions.
403
+ //
404
+ // Returns true on success, false on error.
405
+ bool PossibleMatchRange(string* min, string* max, int maxlen) const;
406
+
407
+ // Generic matching interface
408
+
409
+ // Type of match.
410
+ enum Anchor {
411
+ UNANCHORED, // No anchoring
412
+ ANCHOR_START, // Anchor at start only
413
+ ANCHOR_BOTH, // Anchor at start and end
414
+ };
415
+
416
+ // Return the number of capturing subpatterns, or -1 if the
417
+ // regexp wasn't valid on construction. The overall match ($0)
418
+ // does not count: if the regexp is "(a)(b)", returns 2.
419
+ int NumberOfCapturingGroups() const;
420
+
421
+
422
+ // Return a map from names to capturing indices.
423
+ // The map records the index of the leftmost group
424
+ // with the given name.
425
+ // Only valid until the re is deleted.
426
+ const map<string, int>& NamedCapturingGroups() const;
427
+
428
+ // Return a map from capturing indices to names.
429
+ // The map has no entries for unnamed groups.
430
+ // Only valid until the re is deleted.
431
+ const map<int, string>& CapturingGroupNames() const;
432
+
433
+ // General matching routine.
434
+ // Match against text starting at offset startpos
435
+ // and stopping the search at offset endpos.
436
+ // Returns true if match found, false if not.
437
+ // On a successful match, fills in match[] (up to nmatch entries)
438
+ // with information about submatches.
439
+ // E.g., matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
440
+ // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
441
+ // match[3] = NULL, ..., up to match[nmatch-1] = NULL.
442
+ //
443
+ // Don't ask for more match information than you will use:
444
+ // runs much faster with nmatch == 1 than nmatch > 1, and
445
+ // runs even faster if nmatch == 0.
446
+ // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
447
+ // but will be handled correctly.
448
+ //
449
+ // Passing text == StringPiece(NULL, 0) will be handled like any other
450
+ // empty string, but note that on return, it will not be possible to tell
451
+ // whether submatch i matched the empty string or did not match:
452
+ // either way, match[i] == NULL.
453
+ bool Match(const StringPiece& text,
454
+ int startpos,
455
+ int endpos,
456
+ Anchor anchor,
457
+ StringPiece *match,
458
+ int nmatch) const;
459
+
460
+ // Check that the given rewrite string is suitable for use with this
461
+ // regular expression. It checks that:
462
+ // * The regular expression has enough parenthesized subexpressions
463
+ // to satisfy all of the \N tokens in rewrite
464
+ // * The rewrite string doesn't have any syntax errors. E.g.,
465
+ // '\' followed by anything other than a digit or '\'.
466
+ // A true return value guarantees that Replace() and Extract() won't
467
+ // fail because of a bad rewrite string.
468
+ bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
469
+
470
+ // Constructor options
471
+ class Options {
472
+ public:
473
+ // The options are (defaults in parentheses):
474
+ //
475
+ // utf8 (true) text and pattern are UTF-8; otherwise Latin-1
476
+ // posix_syntax (false) restrict regexps to POSIX egrep syntax
477
+ // longest_match (false) search for longest match, not first match
478
+ // log_errors (true) log syntax and execution errors to ERROR
479
+ // max_mem (see below) approx. max memory footprint of RE2
480
+ // literal (false) interpret string as literal, not regexp
481
+ // never_nl (false) never match \n, even if it is in regexp
482
+ // case_sensitive (true) match is case-sensitive (regexp can override
483
+ // with (?i) unless in posix_syntax mode)
484
+ //
485
+ // The following options are only consulted when posix_syntax == true.
486
+ // (When posix_syntax == false these features are always enabled and
487
+ // cannot be turned off.)
488
+ // perl_classes (false) allow Perl's \d \s \w \D \S \W
489
+ // word_boundary (false) allow Perl's \b \B (word boundary and not)
490
+ // one_line (false) ^ and $ only match beginning and end of text
491
+ //
492
+ // The max_mem option controls how much memory can be used
493
+ // to hold the compiled form of the regexp (the Prog) and
494
+ // its cached DFA graphs. Code Search placed limits on the number
495
+ // of Prog instructions and DFA states: 10,000 for both.
496
+ // In RE2, those limits would translate to about 240 KB per Prog
497
+ // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
498
+ // better job of keeping them small than Code Search did).
499
+ // Each RE2 has two Progs (one forward, one reverse), and each Prog
500
+ // can have two DFAs (one first match, one longest match).
501
+ // That makes 4 DFAs:
502
+ //
503
+ // forward, first-match - used for UNANCHORED or ANCHOR_START searches
504
+ // if opt.longest_match() == false
505
+ // forward, longest-match - used for all ANCHOR_BOTH searches,
506
+ // and the other two kinds if
507
+ // opt.longest_match() == true
508
+ // reverse, first-match - never used
509
+ // reverse, longest-match - used as second phase for unanchored searches
510
+ //
511
+ // The RE2 memory budget is statically divided between the two
512
+ // Progs and then the DFAs: two thirds to the forward Prog
513
+ // and one third to the reverse Prog. The forward Prog gives half
514
+ // of what it has left over to each of its DFAs. The reverse Prog
515
+ // gives it all to its longest-match DFA.
516
+ //
517
+ // Once a DFA fills its budget, it flushes its cache and starts over.
518
+ // If this happens too often, RE2 falls back on the NFA implementation.
519
+
520
+ // For now, make the default budget something close to Code Search.
521
+ static const int kDefaultMaxMem = 8<<20;
522
+
523
+ enum Encoding {
524
+ EncodingUTF8 = 1,
525
+ EncodingLatin1
526
+ };
527
+
528
+ Options() :
529
+ encoding_(EncodingUTF8),
530
+ posix_syntax_(false),
531
+ longest_match_(false),
532
+ log_errors_(true),
533
+ max_mem_(kDefaultMaxMem),
534
+ literal_(false),
535
+ never_nl_(false),
536
+ case_sensitive_(true),
537
+ perl_classes_(false),
538
+ word_boundary_(false),
539
+ one_line_(false) {
540
+ }
541
+
542
+ Encoding encoding() const { return encoding_; }
543
+ void set_encoding(Encoding encoding) { encoding_ = encoding; }
544
+
545
+ // Legacy interface to encoding.
546
+ // TODO(rsc): Remove once clients have been converted.
547
+ bool utf8() const { return encoding_ == EncodingUTF8; }
548
+ void set_utf8(bool b) {
549
+ if (b) {
550
+ encoding_ = EncodingUTF8;
551
+ } else {
552
+ encoding_ = EncodingLatin1;
553
+ }
554
+ }
555
+
556
+ bool posix_syntax() const { return posix_syntax_; }
557
+ void set_posix_syntax(bool b) { posix_syntax_ = b; }
558
+
559
+ bool longest_match() const { return longest_match_; }
560
+ void set_longest_match(bool b) { longest_match_ = b; }
561
+
562
+ bool log_errors() const { return log_errors_; }
563
+ void set_log_errors(bool b) { log_errors_ = b; }
564
+
565
+ int max_mem() const { return max_mem_; }
566
+ void set_max_mem(int m) { max_mem_ = m; }
567
+
568
+ bool literal() const { return literal_; }  // Getter for literal_ (false after Options()).
569
+ void set_literal(bool b) { literal_ = b; }  // Setter for literal_.
570
+
571
+ bool never_nl() const { return never_nl_; }  // Getter for never_nl_ (false after Options()).
572
+ void set_never_nl(bool b) { never_nl_ = b; }  // Setter for never_nl_.
573
+
574
+ bool case_sensitive() const { return case_sensitive_; }  // Getter for case_sensitive_ (true after Options()).
575
+ void set_case_sensitive(bool b) { case_sensitive_ = b; }  // Setter for case_sensitive_.
576
+
577
+ bool perl_classes() const { return perl_classes_; }  // Getter for perl_classes_ (false after Options()).
578
+ void set_perl_classes(bool b) { perl_classes_ = b; }  // Setter for perl_classes_.
579
+
580
+ bool word_boundary() const { return word_boundary_; }  // Getter for word_boundary_ (false after Options()).
581
+ void set_word_boundary(bool b) { word_boundary_ = b; }  // Setter for word_boundary_.
582
+
583
+ bool one_line() const { return one_line_; }  // Getter for one_line_ (false after Options()).
584
+ void set_one_line(bool b) { one_line_ = b; }  // Setter for one_line_.
585
+
586
+ void Copy(const Options& src) {  // Overwrites every option of *this with the value held by src, field by field.
587
+ encoding_ = src.encoding_;
588
+ posix_syntax_ = src.posix_syntax_;
589
+ longest_match_ = src.longest_match_;
590
+ log_errors_ = src.log_errors_;
591
+ max_mem_ = src.max_mem_;
592
+ literal_ = src.literal_;
593
+ never_nl_ = src.never_nl_;
594
+ case_sensitive_ = src.case_sensitive_;
595
+ perl_classes_ = src.perl_classes_;
596
+ word_boundary_ = src.word_boundary_;
597
+ one_line_ = src.one_line_;  // Last of the eleven fields; the private operator= declared below is not used for this.
598
+ }
599
+
600
+ int ParseFlags() const;  // Declared only in this part of the header. NOTE(review): presumably folds these options into parser flag bits — confirm against the definition.
601
+
602
+ private:
603
+ // Private constructor for defining constants like RE2::Latin1. Only the first four options are caller-supplied.
604
+ friend class RE2;  // Gives RE2 access to this private constructor.
605
+ Options(Encoding encoding,
606
+ bool posix_syntax,
607
+ bool longest_match,
608
+ bool log_errors) :
609
+ encoding_(encoding),
610
+ posix_syntax_(posix_syntax),
611
+ longest_match_(longest_match),
612
+ log_errors_(log_errors),
613
+ max_mem_(kDefaultMaxMem),  // From here on the values match the public Options() constructor.
614
+ literal_(false),
615
+ never_nl_(false),
616
+ case_sensitive_(true),
617
+ perl_classes_(false),
618
+ word_boundary_(false),
619
+ one_line_(false) {
620
+ }
621
+
622
+ Encoding encoding_;  // Backing field for encoding()/utf8().
623
+ bool posix_syntax_;  // Backing field for posix_syntax().
624
+ bool longest_match_;  // Backing field for longest_match().
625
+ bool log_errors_;  // Backing field for log_errors().
626
+ int64_t max_mem_;  // Memory budget; both constructors start it at kDefaultMaxMem.
627
+ bool literal_;  // Backing field for literal().
628
+ bool never_nl_;  // Backing field for never_nl().
629
+ bool case_sensitive_;  // Backing field for case_sensitive().
630
+ bool perl_classes_;  // Backing field for perl_classes().
631
+ bool word_boundary_;  // Backing field for word_boundary().
632
+ bool one_line_;  // Backing field for one_line().
633
+
634
+ //DISALLOW_EVIL_CONSTRUCTORS(Options);
635
+ Options(const Options&);  // Private copy constructor, declared without a body here; Copy() is the public way to duplicate options.
636
+ void operator=(const Options&);  // Private assignment, likewise declared without a body here.
637
+ };
638
+
639
+ // Returns the options set in the constructor (a const reference to options_, not a copy).
640
+ const Options& options() const { return options_; }
641
+
642
+ // Argument converters; see below. Each wraps the pointer in an Arg paired with a radix-specific parser (bodies come from MAKE_INTEGER_PARSER near the end of this header).
643
+ static inline Arg CRadix(short* x);  // CRadix overloads select the parse_<name>_cradix parsers.
644
+ static inline Arg CRadix(unsigned short* x);
645
+ static inline Arg CRadix(int* x);
646
+ static inline Arg CRadix(unsigned int* x);
647
+ static inline Arg CRadix(long* x);
648
+ static inline Arg CRadix(unsigned long* x);
649
+ static inline Arg CRadix(long long* x);
650
+ static inline Arg CRadix(unsigned long long* x);
651
+
652
+ static inline Arg Hex(short* x);  // Hex overloads select the parse_<name>_hex parsers.
653
+ static inline Arg Hex(unsigned short* x);
654
+ static inline Arg Hex(int* x);
655
+ static inline Arg Hex(unsigned int* x);
656
+ static inline Arg Hex(long* x);
657
+ static inline Arg Hex(unsigned long* x);
658
+ static inline Arg Hex(long long* x);
659
+ static inline Arg Hex(unsigned long long* x);
660
+
661
+ static inline Arg Octal(short* x);  // Octal overloads select the parse_<name>_octal parsers.
662
+ static inline Arg Octal(unsigned short* x);
663
+ static inline Arg Octal(int* x);
664
+ static inline Arg Octal(unsigned int* x);
665
+ static inline Arg Octal(long* x);
666
+ static inline Arg Octal(unsigned long* x);
667
+ static inline Arg Octal(long long* x);
668
+ static inline Arg Octal(unsigned long long* x);
669
+
670
+ private:
671
+ void Init(const StringPiece& pattern, const Options& options);  // Declared only here. NOTE(review): presumably the shared construction routine — confirm against the definition.
672
+
673
+ bool Rewrite(string *out,  // Declared only here. NOTE(review): presumably expands `rewrite` into *out using the veclen pieces in vec — confirm.
674
+ const StringPiece &rewrite,
675
+ const StringPiece* vec,
676
+ int veclen) const;
677
+
678
+ bool DoMatch(const StringPiece& text,  // Declared only here. NOTE(review): presumably matches text and feeds captures to the n entries of args — confirm.
679
+ Anchor anchor,
680
+ int* consumed,
681
+ const Arg* const args[],
682
+ int n) const;
683
+
684
+ re2::Prog* ReverseProg() const;  // Declared only here; const, yet rprog_ below is mutable. NOTE(review): presumably fills rprog_ on demand — confirm.
685
+
686
+ mutable Mutex* mutex_;  // NOTE(review): presumably guards the mutable members below — confirm against the implementation.
687
+ string pattern_; // string regular expression
688
+ Options options_; // option flags; exposed read-only through options()
689
+ string prefix_; // required prefix (before regexp_)
690
+ bool prefix_foldcase_; // prefix is ASCII case-insensitive
691
+ re2::Regexp* entire_regexp_; // parsed regular expression
692
+ re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
693
+ re2::Prog* prog_; // compiled program for regexp
694
+ mutable re2::Prog* rprog_; // reverse program for regexp; mutable so const members can assign it
695
+ bool is_one_pass_; // can use prog_->SearchOnePass?
696
+ mutable const string* error_; // Error indicator
697
+ // (or points to empty string)
698
+ mutable ErrorCode error_code_; // Error code
699
+ mutable string error_arg_; // Fragment of regexp showing error
700
+ mutable int num_captures_; // Number of capturing groups
701
+
702
+ // Map from capture names to indices (held by pointer; mutable so const members can assign it)
703
+ mutable const map<string, int>* named_groups_;
704
+
705
+ // Map from capture indices to names (held by pointer; mutable so const members can assign it)
706
+ mutable const map<int, string>* group_names_;
707
+
708
+ //DISALLOW_EVIL_CONSTRUCTORS(RE2);
709
+ RE2(const RE2&);  // Private copy constructor, declared without a body here, so code outside the class cannot copy an RE2.
710
+ void operator=(const RE2&);  // Private assignment, likewise declared without a body here.
711
+ };
712
+
713
+ /***** Implementation details *****/
714
+
715
+ // Hex/Octal/Binary?
716
+
717
+ // Special class for parsing into objects that define a ParseFrom() method; Arg's generic T* constructor uses Parse below as its parser.
718
+ template <class T>
719
+ class _RE2_MatchObject {
720
+ public:
721
+ static inline bool Parse(const char* str, int n, void* dest) {  // Same signature as RE2::Arg::Parser.
722
+ if (dest == NULL) return true;  // No destination: report success without parsing.
723
+ T* object = reinterpret_cast<T*>(dest);  // dest is type-erased; recover the T* it is expected to hold.
724
+ return object->ParseFrom(str, n);  // T decides whether the text is acceptable.
725
+ }
726
+ };
727
+
728
+ class RE2::Arg {  // Pairs a type-erased destination pointer with the function that parses text into it.
729
+ public:
730
+ // Empty constructor so we can declare arrays of RE2::Arg; stores a NULL destination and parse_null (see the inline definition after the class).
731
+ Arg();
732
+
733
+ // Constructor specially designed for NULL arguments: keeps the pointer it is given but installs parse_null as the parser.
734
+ Arg(void*);
735
+
736
+ typedef bool (*Parser)(const char* str, int n, void* dest);  // Signature shared by every parser; dest receives the stored arg_ pointer.
737
+
738
+ // Type-specific parsers: for each listed type the macro emits one constructor using the named default parser and one taking a caller-supplied parser.
739
+ #define MAKE_PARSER(type,name) \
740
+ Arg(type* p) : arg_(p), parser_(name) { } \
741
+ Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
742
+
743
+
744
+ MAKE_PARSER(char, parse_char);
745
+ MAKE_PARSER(signed char, parse_char);  // Shares parse_char with plain char.
746
+ MAKE_PARSER(unsigned char, parse_uchar);
747
+ MAKE_PARSER(short, parse_short);
748
+ MAKE_PARSER(unsigned short, parse_ushort);
749
+ MAKE_PARSER(int, parse_int);
750
+ MAKE_PARSER(unsigned int, parse_uint);
751
+ MAKE_PARSER(long, parse_long);
752
+ MAKE_PARSER(unsigned long, parse_ulong);
753
+ MAKE_PARSER(long long, parse_longlong);
754
+ MAKE_PARSER(unsigned long long, parse_ulonglong);
755
+ MAKE_PARSER(float, parse_float);
756
+ MAKE_PARSER(double, parse_double);
757
+ MAKE_PARSER(string, parse_string);
758
+ MAKE_PARSER(StringPiece, parse_stringpiece);
759
+
760
+ #undef MAKE_PARSER
761
+
762
+ // Generic constructor taking an explicit parser; only declared in this part of the header.
763
+ template <class T> Arg(T*, Parser parser);
764
+ // Generic constructor template: any other T* is parsed through T::ParseFrom() by way of _RE2_MatchObject<T>::Parse.
765
+ template <class T> Arg(T* p)
766
+ : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
767
+ }
768
+
769
+ // Parse the data: hands str and n to the stored parser together with the stored destination pointer.
770
+ bool Parse(const char* str, int n) const;
771
+
772
+ private:
773
+ void* arg_;  // Destination passed to parser_ as dest.
774
+ Parser parser_;  // Function invoked by Parse().
775
+
776
+ static bool parse_null (const char* str, int n, void* dest);  // Parser installed by Arg() and Arg(void*).
777
+ static bool parse_char (const char* str, int n, void* dest);
778
+ static bool parse_uchar (const char* str, int n, void* dest);
779
+ static bool parse_float (const char* str, int n, void* dest);
780
+ static bool parse_double (const char* str, int n, void* dest);
781
+ static bool parse_string (const char* str, int n, void* dest);
782
+ static bool parse_stringpiece (const char* str, int n, void* dest);
783
+ // Per integer name, the macro below declares private parse_<name> and parse_<name>_radix plus public parse_<name>_hex, _octal and _cradix.
784
+ #define DECLARE_INTEGER_PARSER(name) \
785
+ private: \
786
+ static bool parse_ ## name(const char* str, int n, void* dest); \
787
+ static bool parse_ ## name ## _radix( \
788
+ const char* str, int n, void* dest, int radix); \
789
+ public: \
790
+ static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
791
+ static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
792
+ static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
793
+
794
+ DECLARE_INTEGER_PARSER(short);
795
+ DECLARE_INTEGER_PARSER(ushort);
796
+ DECLARE_INTEGER_PARSER(int);
797
+ DECLARE_INTEGER_PARSER(uint);
798
+ DECLARE_INTEGER_PARSER(long);
799
+ DECLARE_INTEGER_PARSER(ulong);
800
+ DECLARE_INTEGER_PARSER(longlong);
801
+ DECLARE_INTEGER_PARSER(ulonglong);
802
+
803
+ #undef DECLARE_INTEGER_PARSER
804
+ };
805
+
806
+ inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }  // Default Arg: no destination, parse_null as parser.
807
+ inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }  // Untyped pointer: stored as given, still parsed by parse_null.
808
+
809
+ inline bool RE2::Arg::Parse(const char* str, int n) const {  // Runs the stored parser on (str, n) and returns its verdict.
810
+ return (*parser_)(str, n, arg_);  // arg_ is supplied as the parser's dest argument.
811
+ }
812
+
813
+ // This part of the parser, appropriate only for ints, deals with bases: for each (type, name) pair it defines RE2::Hex, RE2::Octal and RE2::CRadix, each returning an Arg bound to the matching parse_<name>_hex/_octal/_cradix function.
814
+ #define MAKE_INTEGER_PARSER(type, name) \
815
+ inline RE2::Arg RE2::Hex(type* ptr) { \
816
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
817
+ inline RE2::Arg RE2::Octal(type* ptr) { \
818
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
819
+ inline RE2::Arg RE2::CRadix(type* ptr) { \
820
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
821
+
822
+ MAKE_INTEGER_PARSER(short, short);
823
+ MAKE_INTEGER_PARSER(unsigned short, ushort);
824
+ MAKE_INTEGER_PARSER(int, int);
825
+ MAKE_INTEGER_PARSER(unsigned int, uint);
826
+ MAKE_INTEGER_PARSER(long, long);
827
+ MAKE_INTEGER_PARSER(unsigned long, ulong);
828
+ MAKE_INTEGER_PARSER(long long, longlong);
829
+ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
830
+
831
+ #undef MAKE_INTEGER_PARSER
832
+
833
+ } // namespace re2
834
+
835
+ using re2::RE2;
836
+
837
+ #endif /* RE2_RE2_H */