chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/re2.h ADDED
@@ -0,0 +1,837 @@
1
+ // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #ifndef RE2_RE2_H
6
+ #define RE2_RE2_H
7
+
8
+ // C++ interface to the re2 regular-expression library.
9
+ // RE2 supports Perl-style regular expressions (with extensions like
10
+ // \d, \w, \s, ...).
11
+ //
12
+ // -----------------------------------------------------------------------
13
+ // REGEXP SYNTAX:
14
+ //
15
+ // This module uses the re2 library and hence supports
16
+ // its syntax for regular expressions, which is similar to Perl's with
17
+ // some of the more complicated things thrown away. In particular,
18
+ // backreferences and generalized assertions are not available, nor is \Z.
19
+ //
20
+ // See http://code.google.com/p/re2/wiki/Syntax for the syntax
21
+ // supported by RE2, and a comparison with PCRE and PERL regexps.
22
+ //
23
+ // For those not familiar with Perl's regular expressions,
24
+ // here are some examples of the most commonly used extensions:
25
+ //
26
+ // "hello (\\w+) world" -- \w matches a "word" character
27
+ // "version (\\d+)" -- \d matches a digit
28
+ // "hello\\s+world" -- \s matches any whitespace character
29
+ // "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
30
+ // "(?i)hello" -- (?i) turns on case-insensitive matching
31
+ // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
32
+ //
33
+ // -----------------------------------------------------------------------
34
+ // MATCHING INTERFACE:
35
+ //
36
+ // The "FullMatch" operation checks that supplied text matches a
37
+ // supplied pattern exactly.
38
+ //
39
+ // Example: successful match
40
+ // CHECK(RE2::FullMatch("hello", "h.*o"));
41
+ //
42
+ // Example: unsuccessful match (requires full match):
43
+ // CHECK(!RE2::FullMatch("hello", "e"));
44
+ //
45
+ // -----------------------------------------------------------------------
46
+ // UTF-8 AND THE MATCHING INTERFACE:
47
+ //
48
+ // By default, the pattern and input text are interpreted as UTF-8.
49
+ // The RE2::Latin1 option causes them to be interpreted as Latin-1.
50
+ //
51
+ // Example:
52
+ // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
53
+ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
54
+ //
55
+ // -----------------------------------------------------------------------
56
+ // MATCHING WITH SUB-STRING EXTRACTION:
57
+ //
58
+ // You can supply extra pointer arguments to extract matched subpieces.
59
+ //
60
+ // Example: extracts "ruby" into "s" and 1234 into "i"
61
+ // int i;
62
+ // string s;
63
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
64
+ //
65
+ // Example: fails because string cannot be stored in integer
66
+ // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
67
+ //
68
+ // Example: fails because there aren't enough sub-patterns:
69
+ // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
70
+ //
71
+ // Example: does not try to extract any extra sub-patterns
72
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
73
+ //
74
+ // Example: does not try to extract into NULL
75
+ // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
76
+ //
77
+ // Example: integer overflow causes failure
78
+ // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
79
+ //
80
+ // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
81
+ // This may get a little faster in the future, but right now is slower
82
+ // than PCRE. On the other hand, failed matches run *very* fast (faster
83
+ // than PCRE), as do matches without substring extraction.
84
+ //
85
+ // -----------------------------------------------------------------------
86
+ // PARTIAL MATCHES
87
+ //
88
+ // You can use the "PartialMatch" operation when you want the pattern
89
+ // to match any substring of the text.
90
+ //
91
+ // Example: simple search for a string:
92
+ // CHECK(RE2::PartialMatch("hello", "ell"));
93
+ //
94
+ // Example: find first number in a string
95
+ // int number;
96
+ // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
97
+ // CHECK_EQ(number, 100);
98
+ //
99
+ // -----------------------------------------------------------------------
100
+ // PRE-COMPILED REGULAR EXPRESSIONS
101
+ //
102
+ // RE2 makes it easy to use any string as a regular expression, without
103
+ // requiring a separate compilation step.
104
+ //
105
+ // If speed is of the essence, you can create a pre-compiled "RE2"
106
+ // object from the pattern and use it multiple times. If you do so,
107
+ // you can typically parse text faster than with sscanf.
108
+ //
109
+ // Example: precompile pattern for faster matching:
110
+ // RE2 pattern("h.*o");
111
+ // while (ReadLine(&str)) {
112
+ // if (RE2::FullMatch(str, pattern)) ...;
113
+ // }
114
+ //
115
+ // -----------------------------------------------------------------------
116
+ // SCANNING TEXT INCREMENTALLY
117
+ //
118
+ // The "Consume" operation may be useful if you want to repeatedly
119
+ // match regular expressions at the front of a string and skip over
120
+ // them as they match. This requires use of the "StringPiece" type,
121
+ // which represents a sub-range of a real string.
122
+ //
123
+ // Example: read lines of the form "var = value" from a string.
124
+ // string contents = ...; // Fill string somehow
125
+ // StringPiece input(contents); // Wrap a StringPiece around it
126
+ //
127
+ // string var;
128
+ // int value;
129
+ // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
130
+ // ...;
131
+ // }
132
+ //
133
+ // Each successful call to "Consume" will set "var/value", and also
134
+ // advance "input" so it points past the matched text. Note that if the
135
+ // regular expression matches an empty string, input will advance
136
+ // by 0 bytes. If the regular expression being used might match
137
+ // an empty string, the loop body must check for this case and either
138
+ // advance the string or break out of the loop.
139
+ //
140
+ // The "FindAndConsume" operation is similar to "Consume" but does not
141
+ // anchor your match at the beginning of the string. For example, you
142
+ // could extract all words from a string by repeatedly calling
143
+ // RE2::FindAndConsume(&input, "(\\w+)", &word)
144
+ //
145
+ // -----------------------------------------------------------------------
146
+ // USING VARIABLE NUMBER OF ARGUMENTS
147
+ //
148
+ // The above operations require you to know the number of arguments
149
+ // when you write the code. This is not always possible or easy (for
150
+ // example, the regular expression may be calculated at run time).
151
+ // You can use the "N" version of the operations when the number of
152
+ // match arguments is determined at run time.
153
+ //
154
+ // Example:
155
+ // const RE2::Arg* args[10];
156
+ // int n;
157
+ // // ... populate args with pointers to RE2::Arg values ...
158
+ // // ... set n to the number of RE2::Arg objects ...
159
+ // bool match = RE2::FullMatchN(input, pattern, args, n);
160
+ //
161
+ // The last statement is equivalent to
162
+ //
163
+ // bool match = RE2::FullMatch(input, pattern,
164
+ // *args[0], *args[1], ..., *args[n - 1]);
165
+ //
166
+ // -----------------------------------------------------------------------
167
+ // PARSING HEX/OCTAL/C-RADIX NUMBERS
168
+ //
169
+ // By default, if you pass a pointer to a numeric value, the
170
+ // corresponding text is interpreted as a base-10 number. You can
171
+ // instead wrap the pointer with a call to one of the operators Hex(),
172
+ // Octal(), or CRadix() to interpret the text in another base. The
173
+ // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
174
+ // prefixes, but defaults to base-10.
175
+ //
176
+ // Example:
177
+ // int a, b, c, d;
178
+ // CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
179
+ // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
180
+ // will leave 64 in a, b, c, and d.
181
+
182
+
183
+ #include <stdint.h>
184
+ #include <map>
185
+ #include <string>
186
+ #include "re2/stringpiece.h"
187
+ #include "re2/variadic_function.h"
188
+
189
+ namespace re2 {
190
+ using std::string;
191
+ using std::map;
192
+ class Mutex;
193
+ class Prog;
194
+ class Regexp;
195
+
196
+ // Interface for regular expression matching. Also corresponds to a
197
+ // pre-compiled regular expression. An "RE2" object is safe for
198
+ // concurrent use by multiple threads.
199
+ class RE2 {
200
+ public:
201
+ // We convert user-passed pointers into special Arg objects
202
+ class Arg;
203
+ class Options;
204
+
205
+ // Defined in set.h.
206
+ class Set;
207
+
208
+ enum ErrorCode { // status codes reported by error_code()
209
+ NoError = 0, // the RE2 was created properly; ok() compares error_code() against this
210
+
211
+ // Unexpected error
212
+ ErrorInternal,
213
+
214
+ // Parse errors
215
+ ErrorBadEscape, // bad escape sequence
216
+ ErrorBadCharClass, // bad character class
217
+ ErrorBadCharRange, // bad character class range
218
+ ErrorMissingBracket, // missing closing ]
219
+ ErrorMissingParen, // missing closing )
220
+ ErrorTrailingBackslash, // trailing \ at end of regexp
221
+ ErrorRepeatArgument, // repeat argument missing, e.g. "*"
222
+ ErrorRepeatSize, // bad repetition argument
223
+ ErrorRepeatOp, // bad repetition operator
224
+ ErrorBadPerlOp, // bad perl operator
225
+ ErrorBadUTF8, // invalid UTF-8 in regexp
226
+ ErrorBadNamedCapture, // bad named capture group
227
+ ErrorPatternTooLarge, // pattern too large (compile failed)
228
+ };
229
+
230
+ // Predefined common options.
231
+ // If you need more complicated things, instantiate
232
+ // an Options object, change the settings, and pass it to the
233
+ // RE2 constructor.
234
+ static const Options DefaultOptions;
235
+ static const Options Latin1; // treat input as Latin-1 (default UTF-8)
236
+ static const Options POSIX; // POSIX syntax, leftmost-longest match
237
+ static const Options Quiet; // do not log about regexp parse errors
238
+
239
+ // Need to have the const char* and const string& forms for implicit
240
+ // conversions when passing string literals to FullMatch and PartialMatch.
241
+ // Otherwise the StringPiece form would be sufficient.
242
+ #ifndef SWIG
243
+ RE2(const char* pattern);
244
+ RE2(const string& pattern);
245
+ #endif
246
+ RE2(const StringPiece& pattern);
247
+ RE2(const StringPiece& pattern, const Options& option);
248
+ ~RE2();
249
+
250
+ // Returns whether RE2 was created properly.
251
+ bool ok() const { return error_code() == NoError; }
252
+
253
+ // The string specification for this RE2. E.g.
254
+ // RE2 re("ab*c?d+");
255
+ // re.pattern(); // "ab*c?d+"
256
+ const string& pattern() const { return pattern_; }
257
+
258
+ // If RE2 could not be created properly, returns an error string.
259
+ // Else returns the empty string.
260
+ const string& error() const { return *error_; }
261
+
262
+ // If RE2 could not be created properly, returns an error code.
263
+ // Else returns RE2::NoError (== 0).
264
+ ErrorCode error_code() const { return error_code_; }
265
+
266
+ // If RE2 could not be created properly, returns the offending
267
+ // portion of the regexp.
268
+ const string& error_arg() const { return error_arg_; }
269
+
270
+ // Returns the program size, a very approximate measure of a regexp's "cost".
271
+ // Larger numbers are more expensive than smaller numbers.
272
+ int ProgramSize() const;
273
+
274
+ // Returns the underlying Regexp; not for general use.
275
+ // Returns entire_regexp_ so that callers don't need
276
+ // to know about prefix_ and prefix_foldcase_.
277
+ re2::Regexp* Regexp() const { return entire_regexp_; }
278
+
279
+ /***** The useful part: the matching interface *****/
280
+
281
+ // Matches "text" against "pattern". If pointer arguments are
282
+ // supplied, copies matched sub-patterns into them.
283
+ //
284
+ // You can pass in a "const char*" or a "string" for "text".
285
+ // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
286
+ //
287
+ // The provided pointer arguments can be pointers to any scalar numeric
288
+ // type, or one of:
289
+ // string (matched piece is copied to string)
290
+ // StringPiece (StringPiece is mutated to point to matched piece)
291
+ // T (where "bool T::ParseFrom(const char*, int)" exists)
292
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
293
+ //
294
+ // Returns true iff all of the following conditions are satisfied:
295
+ // a. "text" matches "pattern" exactly
296
+ // b. The number of matched sub-patterns is >= number of supplied pointers
297
+ // c. The "i"th argument has a suitable type for holding the
298
+ // string captured as the "i"th sub-pattern. If you pass in
299
+ // NULL for the "i"th argument, or pass fewer arguments than
300
+ // number of sub-patterns, "i"th captured sub-pattern is
301
+ // ignored.
302
+ //
303
+ // CAVEAT: An optional sub-pattern that does not exist in the
304
+ // matched string is assigned the empty string. Therefore, the
305
+ // following will return false (because the empty string is not a
306
+ // valid number):
307
+ // int number;
308
+ // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
309
+ static bool FullMatchN(const StringPiece& text, const RE2& re,
310
+ const Arg* const args[], int argc);
311
+ static const VariadicFunction2<
312
+ bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
313
+
314
+ // Exactly like FullMatch(), except that "pattern" is allowed to match
315
+ // a substring of "text".
316
+ static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
317
+ const Arg* const args[], int argc);
318
+ static const VariadicFunction2<
319
+ bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
320
+
321
+ // Like FullMatch() and PartialMatch(), except that pattern has to
322
+ // match a prefix of "text", and "input" is advanced past the matched
323
+ // text. Note: "input" is modified iff this routine returns true.
324
+ static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
325
+ const Arg* const args[], int argc);
326
+ static const VariadicFunction2<
327
+ bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
328
+
329
+ // Like Consume(..), but does not anchor the match at the beginning of the
330
+ // string. That is, "pattern" need not start its match at the beginning of
331
+ // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
332
+ // word in "s" and stores it in "word".
333
+ static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
334
+ const Arg* const args[], int argc);
335
+ static const VariadicFunction2<
336
+ bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
337
+
338
+ // Replace the first match of "pattern" in "str" with "rewrite".
339
+ // Within "rewrite", backslash-escaped digits (\1 to \9) can be
340
+ // used to insert text matching corresponding parenthesized group
341
+ // from the pattern. \0 in "rewrite" refers to the entire matching
342
+ // text. E.g.,
343
+ //
344
+ // string s = "yabba dabba doo";
345
+ // CHECK(RE2::Replace(&s, "b+", "d"));
346
+ //
347
+ // will leave "s" containing "yada dabba doo"
348
+ //
349
+ // Returns true if the pattern matches and a replacement occurs,
350
+ // false otherwise.
351
+ static bool Replace(string *str,
352
+ const RE2& pattern,
353
+ const StringPiece& rewrite);
354
+
355
+ // Like Replace(), except replaces successive non-overlapping occurrences
356
+ // of the pattern in the string with the rewrite. E.g.
357
+ //
358
+ // string s = "yabba dabba doo";
359
+ // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
360
+ //
361
+ // will leave "s" containing "yada dada doo"
362
+ // Replacements are not subject to re-matching.
363
+ //
364
+ // Because GlobalReplace only replaces non-overlapping matches,
365
+ // replacing "ana" within "banana" makes only one replacement, not two.
366
+ //
367
+ // Returns the number of replacements made.
368
+ static int GlobalReplace(string *str,
369
+ const RE2& pattern,
370
+ const StringPiece& rewrite);
371
+
372
+ // Like Replace, except that if the pattern matches, "rewrite"
373
+ // is copied into "out" with substitutions. The non-matching
374
+ // portions of "text" are ignored.
375
+ //
376
+ // Returns true iff a match occurred and the extraction happened
377
+ // successfully; if no match occurs, the string is left unaffected.
378
+ static bool Extract(const StringPiece &text,
379
+ const RE2& pattern,
380
+ const StringPiece &rewrite,
381
+ string *out);
382
+
383
+ // Escapes all potentially meaningful regexp characters in
384
+ // 'unquoted'. The returned string, used as a regular expression,
385
+ // will exactly match the original string. For example,
386
+ // 1.5-2.0?
387
+ // may become:
388
+ // 1\.5\-2\.0\?
389
+ static string QuoteMeta(const StringPiece& unquoted);
390
+
391
+ // Computes range for any strings matching regexp. The min and max can in
392
+ // some cases be arbitrarily precise, so the caller gets to specify the
393
+ // maximum desired length of string returned.
394
+ //
395
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
396
+ // string s that is an anchored match for this regexp satisfies
397
+ // min <= s && s <= max.
398
+ //
399
+ // Note that PossibleMatchRange() will only consider the first copy of an
400
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
401
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
402
+ // do not compile down to infinite repetitions.
403
+ //
404
+ // Returns true on success, false on error.
405
+ bool PossibleMatchRange(string* min, string* max, int maxlen) const;
406
+
407
+ // Generic matching interface
408
+
409
+ // Type of match.
410
+ enum Anchor { // anchoring mode; this is the type of the "anchor" parameter of Match()
411
+ UNANCHORED, // No anchoring
412
+ ANCHOR_START, // Anchor at start only
413
+ ANCHOR_BOTH, // Anchor at start and end
414
+ };
415
+
416
+ // Return the number of capturing subpatterns, or -1 if the
417
+ // regexp wasn't valid on construction. The overall match ($0)
418
+ // does not count: if the regexp is "(a)(b)", returns 2.
419
+ int NumberOfCapturingGroups() const;
420
+
421
+
422
+ // Return a map from names to capturing indices.
423
+ // The map records the index of the leftmost group
424
+ // with the given name.
425
+ // Only valid until the re is deleted.
426
+ const map<string, int>& NamedCapturingGroups() const;
427
+
428
+ // Return a map from capturing indices to names.
429
+ // The map has no entries for unnamed groups.
430
+ // Only valid until the re is deleted.
431
+ const map<int, string>& CapturingGroupNames() const;
432
+
433
+ // General matching routine.
434
+ // Match against text starting at offset startpos
435
+ // and stopping the search at offset endpos.
436
+ // Returns true if match found, false if not.
437
+ // On a successful match, fills in match[] (up to nmatch entries)
438
+ // with information about submatches.
439
+ // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
440
+ // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
441
+ // match[3] = NULL, ..., up to match[nmatch-1] = NULL.
442
+ //
443
+ // Don't ask for more match information than you will use:
444
+ // runs much faster with nmatch == 1 than nmatch > 1, and
445
+ // runs even faster if nmatch == 0.
446
+ // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
447
+ // but will be handled correctly.
448
+ //
449
+ // Passing text == StringPiece(NULL, 0) will be handled like any other
450
+ // empty string, but note that on return, it will not be possible to tell
451
+ // whether submatch i matched the empty string or did not match:
452
+ // either way, match[i] == NULL.
453
+ bool Match(const StringPiece& text,
454
+ int startpos,
455
+ int endpos,
456
+ Anchor anchor,
457
+ StringPiece *match,
458
+ int nmatch) const;
459
+
460
+ // Check that the given rewrite string is suitable for use with this
461
+ // regular expression. It checks that:
462
+ // * The regular expression has enough parenthesized subexpressions
463
+ // to satisfy all of the \N tokens in rewrite
464
+ // * The rewrite string doesn't have any syntax errors. E.g.,
465
+ // '\' followed by anything other than a digit or '\'.
466
+ // A true return value guarantees that Replace() and Extract() won't
467
+ // fail because of a bad rewrite string.
468
+ bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
469
+
470
+ // Constructor options
471
+ class Options {
472
+ public:
473
+ // The options are (defaults in parentheses):
474
+ //
475
+ // utf8 (true) text and pattern are UTF-8; otherwise Latin-1
476
+ // posix_syntax (false) restrict regexps to POSIX egrep syntax
477
+ // longest_match (false) search for longest match, not first match
478
+ // log_errors (true) log syntax and execution errors to ERROR
479
+ // max_mem (see below) approx. max memory footprint of RE2
480
+ // literal (false) interpret string as literal, not regexp
481
+ // never_nl (false) never match \n, even if it is in regexp
482
+ // case_sensitive (true) match is case-sensitive (regexp can override
483
+ // with (?i) unless in posix_syntax mode)
484
+ //
485
+ // The following options are only consulted when posix_syntax == true.
486
+ // (When posix_syntax == false these features are always enabled and
487
+ // cannot be turned off.)
488
+ // perl_classes (false) allow Perl's \d \s \w \D \S \W
489
+ // word_boundary (false) allow Perl's \b \B (word boundary and not)
490
+ // one_line (false) ^ and $ only match beginning and end of text
491
+ //
492
+ // The max_mem option controls how much memory can be used
493
+ // to hold the compiled form of the regexp (the Prog) and
494
+ // its cached DFA graphs. Code Search placed limits on the number
495
+ // of Prog instructions and DFA states: 10,000 for both.
496
+ // In RE2, those limits would translate to about 240 KB per Prog
497
+ // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
498
+ // better job of keeping them small than Code Search did).
499
+ // Each RE2 has two Progs (one forward, one reverse), and each Prog
500
+ // can have two DFAs (one first match, one longest match).
501
+ // That makes 4 DFAs:
502
+ //
503
+ // forward, first-match - used for UNANCHORED or ANCHOR_START searches
504
+ // if opt.longest_match() == false
505
+ // forward, longest-match - used for all ANCHOR_BOTH searches,
506
+ // and the other two kinds if
507
+ // opt.longest_match() == true
508
+ // reverse, first-match - never used
509
+ // reverse, longest-match - used as second phase for unanchored searches
510
+ //
511
+ // The RE2 memory budget is statically divided between the two
512
+ // Progs and then the DFAs: two thirds to the forward Prog
513
+ // and one third to the reverse Prog. The forward Prog gives half
514
+ // of what it has left over to each of its DFAs. The reverse Prog
515
+ // gives it all to its longest-match DFA.
516
+ //
517
+ // Once a DFA fills its budget, it flushes its cache and starts over.
518
+ // If this happens too often, RE2 falls back on the NFA implementation.
519
+
520
+ // For now, make the default budget something close to Code Search.
521
+ static const int kDefaultMaxMem = 8<<20; // 8<<20 == 8,388,608; initial value of max_mem_ in both constructors (presumably bytes, i.e. 8 MiB -- TODO confirm unit)
522
+
523
+ enum Encoding {
524
+ EncodingUTF8 = 1,
525
+ EncodingLatin1
526
+ };
527
+
528
+ Options() : // default options: UTF-8 encoding, error logging on, case-sensitive, max_mem = kDefaultMaxMem; every other flag false
529
+ encoding_(EncodingUTF8),
530
+ posix_syntax_(false),
531
+ longest_match_(false),
532
+ log_errors_(true),
533
+ max_mem_(kDefaultMaxMem),
534
+ literal_(false),
535
+ never_nl_(false),
536
+ case_sensitive_(true),
537
+ perl_classes_(false),
538
+ word_boundary_(false),
539
+ one_line_(false) {
540
+ }
541
+
542
+ Encoding encoding() const { return encoding_; }
543
+ void set_encoding(Encoding encoding) { encoding_ = encoding; }
544
+
545
+ // Legacy interface to encoding.
546
+ // TODO(rsc): Remove once clients have been converted.
547
+ bool utf8() const { return encoding_ == EncodingUTF8; }
548
+ void set_utf8(bool b) { // bool-based setter for encoding_: true -> EncodingUTF8, false -> EncodingLatin1
549
+ if (b) {
550
+ encoding_ = EncodingUTF8;
551
+ } else {
552
+ encoding_ = EncodingLatin1;
553
+ }
554
+ }
555
+
556
+ bool posix_syntax() const { return posix_syntax_; }
557
+ void set_posix_syntax(bool b) { posix_syntax_ = b; }
558
+
559
+ bool longest_match() const { return longest_match_; }
560
+ void set_longest_match(bool b) { longest_match_ = b; }
561
+
562
+ bool log_errors() const { return log_errors_; }
563
+ void set_log_errors(bool b) { log_errors_ = b; }
564
+
565
+ int max_mem() const { return max_mem_; }
566
+ void set_max_mem(int m) { max_mem_ = m; }
567
+
568
+ bool literal() const { return literal_; }
569
+ void set_literal(bool b) { literal_ = b; }
570
+
571
+ bool never_nl() const { return never_nl_; }
572
+ void set_never_nl(bool b) { never_nl_ = b; }
573
+
574
+ bool case_sensitive() const { return case_sensitive_; }
575
+ void set_case_sensitive(bool b) { case_sensitive_ = b; }
576
+
577
+ bool perl_classes() const { return perl_classes_; }
578
+ void set_perl_classes(bool b) { perl_classes_ = b; }
579
+
580
+ bool word_boundary() const { return word_boundary_; }
581
+ void set_word_boundary(bool b) { word_boundary_ = b; }
582
+
583
+ bool one_line() const { return one_line_; }
584
+ void set_one_line(bool b) { one_line_ = b; }
585
+
586
+ void Copy(const Options& src) {
587
+ encoding_ = src.encoding_;
588
+ posix_syntax_ = src.posix_syntax_;
589
+ longest_match_ = src.longest_match_;
590
+ log_errors_ = src.log_errors_;
591
+ max_mem_ = src.max_mem_;
592
+ literal_ = src.literal_;
593
+ never_nl_ = src.never_nl_;
594
+ case_sensitive_ = src.case_sensitive_;
595
+ perl_classes_ = src.perl_classes_;
596
+ word_boundary_ = src.word_boundary_;
597
+ one_line_ = src.one_line_;
598
+ }
599
+
600
+ int ParseFlags() const;
601
+
602
+ private:
603
+ // Private constructor for defining constants like RE2::Latin1.
604
+ friend class RE2;
605
+ Options(Encoding encoding,
606
+ bool posix_syntax,
607
+ bool longest_match,
608
+ bool log_errors) :
609
+ encoding_(encoding),
610
+ posix_syntax_(posix_syntax),
611
+ longest_match_(longest_match),
612
+ log_errors_(log_errors),
613
+ max_mem_(kDefaultMaxMem),
614
+ literal_(false),
615
+ never_nl_(false),
616
+ case_sensitive_(true),
617
+ perl_classes_(false),
618
+ word_boundary_(false),
619
+ one_line_(false) {
620
+ }
621
+
622
+ Encoding encoding_;
623
+ bool posix_syntax_;
624
+ bool longest_match_;
625
+ bool log_errors_;
626
+ int64_t max_mem_;
627
+ bool literal_;
628
+ bool never_nl_;
629
+ bool case_sensitive_;
630
+ bool perl_classes_;
631
+ bool word_boundary_;
632
+ bool one_line_;
633
+
634
+ //DISALLOW_EVIL_CONSTRUCTORS(Options);
635
+ Options(const Options&);
636
+ void operator=(const Options&);
637
+ };
638
+
639
+ // Returns the options set in the constructor.
640
+ const Options& options() const { return options_; };
641
+
642
+ // Argument converters; see below.
643
+ static inline Arg CRadix(short* x);
644
+ static inline Arg CRadix(unsigned short* x);
645
+ static inline Arg CRadix(int* x);
646
+ static inline Arg CRadix(unsigned int* x);
647
+ static inline Arg CRadix(long* x);
648
+ static inline Arg CRadix(unsigned long* x);
649
+ static inline Arg CRadix(long long* x);
650
+ static inline Arg CRadix(unsigned long long* x);
651
+
652
+ static inline Arg Hex(short* x);
653
+ static inline Arg Hex(unsigned short* x);
654
+ static inline Arg Hex(int* x);
655
+ static inline Arg Hex(unsigned int* x);
656
+ static inline Arg Hex(long* x);
657
+ static inline Arg Hex(unsigned long* x);
658
+ static inline Arg Hex(long long* x);
659
+ static inline Arg Hex(unsigned long long* x);
660
+
661
+ static inline Arg Octal(short* x);
662
+ static inline Arg Octal(unsigned short* x);
663
+ static inline Arg Octal(int* x);
664
+ static inline Arg Octal(unsigned int* x);
665
+ static inline Arg Octal(long* x);
666
+ static inline Arg Octal(unsigned long* x);
667
+ static inline Arg Octal(long long* x);
668
+ static inline Arg Octal(unsigned long long* x);
669
+
670
+ private:
671
+ void Init(const StringPiece& pattern, const Options& options);
672
+
673
+ bool Rewrite(string *out,
674
+ const StringPiece &rewrite,
675
+ const StringPiece* vec,
676
+ int veclen) const;
677
+
678
+ bool DoMatch(const StringPiece& text,
679
+ Anchor anchor,
680
+ int* consumed,
681
+ const Arg* const args[],
682
+ int n) const;
683
+
684
+ re2::Prog* ReverseProg() const;
685
+
686
+ mutable Mutex* mutex_;
687
+ string pattern_; // string regular expression
688
+ Options options_; // option flags
689
+ string prefix_; // required prefix (before regexp_)
690
+ bool prefix_foldcase_; // prefix is ASCII case-insensitive
691
+ re2::Regexp* entire_regexp_; // parsed regular expression
692
+ re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
693
+ re2::Prog* prog_; // compiled program for regexp
694
+ mutable re2::Prog* rprog_; // reverse program for regexp
695
+ bool is_one_pass_; // can use prog_->SearchOnePass?
696
+ mutable const string* error_; // Error indicator
697
+ // (or points to empty string)
698
+ mutable ErrorCode error_code_; // Error code
699
+ mutable string error_arg_; // Fragment of regexp showing error
700
+ mutable int num_captures_; // Number of capturing groups
701
+
702
+ // Map from capture names to indices
703
+ mutable const map<string, int>* named_groups_;
704
+
705
+ // Map from capture indices to names
706
+ mutable const map<int, string>* group_names_;
707
+
708
+ //DISALLOW_EVIL_CONSTRUCTORS(RE2);
709
+ RE2(const RE2&);
710
+ void operator=(const RE2&);
711
+ };
712
+
713
+ /***** Implementation details *****/
714
+
715
+ // Hex/Octal/Binary?
716
+
717
// Special class for parsing into objects that define a ParseFrom() method.
//
// Parse() matches the generic parser signature (text pointer, length,
// type-erased destination).  A NULL destination is accepted and reported as
// success without calling ParseFrom(); otherwise the destination is treated
// as a T and the result of T::ParseFrom(str, n) is returned.
template <class T>
class _RE2_MatchObject {
 public:
  static inline bool Parse(const char* str, int n, void* dest) {
    if (dest == NULL) return true;
    // dest is a T* that was converted to void*, so static_cast is the
    // matching named cast for the way back; reinterpret_cast is not needed.
    T* object = static_cast<T*>(dest);
    return object->ParseFrom(str, n);
  }
};
727
+
728
+ class RE2::Arg {
729
+ public:
730
+ // Empty constructor so we can declare arrays of RE2::Arg
731
+ Arg();
732
+
733
+ // Constructor specially designed for NULL arguments
734
+ Arg(void*);
735
+
736
+ typedef bool (*Parser)(const char* str, int n, void* dest);
737
+
738
+ // Type-specific parsers
739
+ #define MAKE_PARSER(type,name) \
740
+ Arg(type* p) : arg_(p), parser_(name) { } \
741
+ Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
742
+
743
+
744
+ MAKE_PARSER(char, parse_char);
745
+ MAKE_PARSER(signed char, parse_char);
746
+ MAKE_PARSER(unsigned char, parse_uchar);
747
+ MAKE_PARSER(short, parse_short);
748
+ MAKE_PARSER(unsigned short, parse_ushort);
749
+ MAKE_PARSER(int, parse_int);
750
+ MAKE_PARSER(unsigned int, parse_uint);
751
+ MAKE_PARSER(long, parse_long);
752
+ MAKE_PARSER(unsigned long, parse_ulong);
753
+ MAKE_PARSER(long long, parse_longlong);
754
+ MAKE_PARSER(unsigned long long, parse_ulonglong);
755
+ MAKE_PARSER(float, parse_float);
756
+ MAKE_PARSER(double, parse_double);
757
+ MAKE_PARSER(string, parse_string);
758
+ MAKE_PARSER(StringPiece, parse_stringpiece);
759
+
760
+ #undef MAKE_PARSER
761
+
762
+ // Generic constructor
763
+ template <class T> Arg(T*, Parser parser);
764
+ // Generic constructor template
765
+ template <class T> Arg(T* p)
766
+ : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
767
+ }
768
+
769
+ // Parse the data
770
+ bool Parse(const char* str, int n) const;
771
+
772
+ private:
773
+ void* arg_;
774
+ Parser parser_;
775
+
776
+ static bool parse_null (const char* str, int n, void* dest);
777
+ static bool parse_char (const char* str, int n, void* dest);
778
+ static bool parse_uchar (const char* str, int n, void* dest);
779
+ static bool parse_float (const char* str, int n, void* dest);
780
+ static bool parse_double (const char* str, int n, void* dest);
781
+ static bool parse_string (const char* str, int n, void* dest);
782
+ static bool parse_stringpiece (const char* str, int n, void* dest);
783
+
784
+ #define DECLARE_INTEGER_PARSER(name) \
785
+ private: \
786
+ static bool parse_ ## name(const char* str, int n, void* dest); \
787
+ static bool parse_ ## name ## _radix( \
788
+ const char* str, int n, void* dest, int radix); \
789
+ public: \
790
+ static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
791
+ static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
792
+ static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
793
+
794
+ DECLARE_INTEGER_PARSER(short);
795
+ DECLARE_INTEGER_PARSER(ushort);
796
+ DECLARE_INTEGER_PARSER(int);
797
+ DECLARE_INTEGER_PARSER(uint);
798
+ DECLARE_INTEGER_PARSER(long);
799
+ DECLARE_INTEGER_PARSER(ulong);
800
+ DECLARE_INTEGER_PARSER(longlong);
801
+ DECLARE_INTEGER_PARSER(ulonglong);
802
+
803
+ #undef DECLARE_INTEGER_PARSER
804
+ };
805
+
806
+ inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
807
+ inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
808
+
809
+ inline bool RE2::Arg::Parse(const char* str, int n) const {
810
+ return (*parser_)(str, n, arg_);
811
+ }
812
+
813
+ // This part of the parser, appropriate only for ints, deals with bases
814
+ #define MAKE_INTEGER_PARSER(type, name) \
815
+ inline RE2::Arg RE2::Hex(type* ptr) { \
816
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
817
+ inline RE2::Arg RE2::Octal(type* ptr) { \
818
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
819
+ inline RE2::Arg RE2::CRadix(type* ptr) { \
820
+ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
821
+
822
+ MAKE_INTEGER_PARSER(short, short);
823
+ MAKE_INTEGER_PARSER(unsigned short, ushort);
824
+ MAKE_INTEGER_PARSER(int, int);
825
+ MAKE_INTEGER_PARSER(unsigned int, uint);
826
+ MAKE_INTEGER_PARSER(long, long);
827
+ MAKE_INTEGER_PARSER(unsigned long, ulong);
828
+ MAKE_INTEGER_PARSER(long long, longlong);
829
+ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
830
+
831
+ #undef MAKE_INTEGER_PARSER
832
+
833
+ } // namespace re2
834
+
835
+ using re2::RE2;
836
+
837
+ #endif /* RE2_RE2_H */