chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/regexp.h ADDED
@@ -0,0 +1,632 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // --- SPONSORED LINK --------------------------------------------------
6
+ // If you want to use this library for regular expression matching,
7
+ // you should use re2/re2.h, which provides a class RE2 that
8
+ // mimics the PCRE interface provided by PCRE's C++ wrappers.
9
+ // This header describes the low-level interface used to implement RE2
10
+ // and may change in backwards-incompatible ways from time to time.
11
+ // In contrast, RE2's interface will not.
12
+ // ---------------------------------------------------------------------
13
+
14
+ // Regular expression library: parsing, execution, and manipulation
15
+ // of regular expressions.
16
+ //
17
+ // Any operation that traverses the Regexp structures should be written
18
+ // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
19
+ // regular expressions such as x++++++++++++++++++++... might cause recursive
20
+ // traversals to overflow the stack.
21
+ //
22
+ // It is the caller's responsibility to provide appropriate mutual exclusion
23
+ // around manipulation of the regexps. RE2 does this.
24
+ //
25
+ // PARSING
26
+ //
27
+ // Regexp::Parse parses regular expressions encoded in UTF-8.
28
+ // The default syntax is POSIX extended regular expressions,
29
+ // with the following changes:
30
+ //
31
+ // 1. Backreferences (optional in POSIX EREs) are not supported.
32
+ // (Supporting them precludes the use of DFA-based
33
+ // matching engines.)
34
+ //
35
+ // 2. Collating elements and collation classes are not supported.
36
+ // (No one has needed or wanted them.)
37
+ //
38
+ // The exact syntax accepted can be modified by passing flags to
39
+ // Regexp::Parse. In particular, many of the basic Perl additions
40
+ // are available. The flags are documented below (search for LikePerl).
41
+ //
42
+ // If parsed with the flag Regexp::Latin1, both the regular expression
43
+ // and the input to the matching routines are assumed to be encoded in
44
+ // Latin-1, not UTF-8.
45
+ //
46
+ // EXECUTION
47
+ //
48
+ // Once Regexp has parsed a regular expression, it provides methods
49
+ // to search text using that regular expression. These methods are
50
+ // implemented via calling out to other regular expression libraries.
51
+ // (Let's call them the sublibraries.)
52
+ //
53
+ // To call a sublibrary, Regexp does not simply prepare a
54
+ // string version of the regular expression and hand it to the
55
+ // sublibrary. Instead, Regexp prepares, from its own parsed form, the
56
+ // corresponding internal representation used by the sublibrary.
57
+ // This has the drawback of needing to know the internal representation
58
+ // used by the sublibrary, but it has two important benefits:
59
+ //
60
+ // 1. The syntax and meaning of regular expressions is guaranteed
61
+ // to be that used by Regexp's parser, not the syntax expected
62
+ // by the sublibrary. Regexp might accept a restricted or
63
+ // expanded syntax for regular expressions as compared with
64
+ // the sublibrary. As long as Regexp can translate from its
65
+ // internal form into the sublibrary's, clients need not know
66
+ // exactly which sublibrary they are using.
67
+ //
68
+ // 2. The sublibrary parsers are bypassed. For whatever reason,
69
+ // sublibrary regular expression parsers often have security
70
+ // problems. For example, plan9grep's regular expression parser
71
+ // has a buffer overflow in its handling of large character
72
+ // classes, and PCRE's parser has had buffer overflow problems
73
+ // in the past. Security-team requires sandboxing of sublibrary
74
+ // regular expression parsers. Avoiding the sublibrary parsers
75
+ // avoids the sandbox.
76
+ //
77
+ // The execution methods we use now are provided by the compiled form,
78
+ // Prog, described in prog.h.
79
+ //
80
+ // MANIPULATION
81
+ //
82
+ // Unlike other regular expression libraries, Regexp makes its parsed
83
+ // form accessible to clients, so that client code can analyze the
84
+ // parsed regular expressions.
85
+
86
+ #ifndef RE2_REGEXP_H__
87
+ #define RE2_REGEXP_H__
88
+
89
+ #include "util/util.h"
90
+ #include "re2/stringpiece.h"
91
+
92
+ namespace re2 {
93
+
94
+ // Keep in sync with string list kOpcodeNames[] in testing/dump.cc
95
+ enum RegexpOp {  // Operator (node kind) of a Regexp node; Regexp keeps it in op_.
96
+ // Matches no strings. Note that opcode numbering starts at 1, not 0.
97
+ kRegexpNoMatch = 1,
98
+
99
+ // Matches empty string.
100
+ kRegexpEmptyMatch,
101
+
102
+ // Matches rune_ (a single rune; see Regexp::rune()).
103
+ kRegexpLiteral,
104
+
105
+ // Matches runes_ (nrunes_ runes; see Regexp::runes() and nrunes()).
106
+ kRegexpLiteralString,
107
+
108
+ // Matches concatenation of sub()[0..nsub()-1].
109
+ kRegexpConcat,
110
+ // Matches union of sub()[0..nsub()-1].
111
+ kRegexpAlternate,
112
+
113
+ // Matches sub()[0] zero or more times.
114
+ kRegexpStar,
115
+ // Matches sub()[0] one or more times.
116
+ kRegexpPlus,
117
+ // Matches sub()[0] zero or one times.
118
+ kRegexpQuest,
119
+
120
+ // Matches sub()[0] at least min_ times, at most max_ times.
121
+ // max_ == -1 means no upper limit.
122
+ kRegexpRepeat,
123
+
124
+ // Parenthesized (capturing) subexpression. Index is cap_.
125
+ // Optionally, capturing name is name_.
126
+ kRegexpCapture,
127
+
128
+ // Matches any character.
129
+ kRegexpAnyChar,
130
+
131
+ // Matches any byte [sic].
132
+ kRegexpAnyByte,
133
+
134
+ // Matches empty string at beginning of line.
135
+ kRegexpBeginLine,
136
+ // Matches empty string at end of line.
137
+ kRegexpEndLine,
138
+
139
+ // Matches word boundary "\b".
140
+ kRegexpWordBoundary,
141
+ // Matches not-a-word boundary "\B".
142
+ kRegexpNoWordBoundary,
143
+
144
+ // Matches empty string at beginning of text.
145
+ kRegexpBeginText,
146
+ // Matches empty string at end of text.
147
+ kRegexpEndText,
148
+
149
+ // Matches character class given by cc_.
150
+ kRegexpCharClass,
151
+
152
+ // Forces match of entire expression right now,
153
+ // with match ID match_id_ (used by RE2::Set).
154
+ kRegexpHaveMatch,
155
+
156
+ kMaxRegexpOp = kRegexpHaveMatch,  // Alias for the last opcode; re-point it when appending a new one.
157
+ };
158
+
159
+ // Keep in sync with string list in regexp.cc
160
+ enum RegexpStatusCode {  // Outcome codes carried by RegexpStatus.
161
+ // No error; RegexpStatus::ok() tests for exactly this value.
162
+ kRegexpSuccess = 0,
163
+
164
+ // Unexpected error
165
+ kRegexpInternalError,
166
+
167
+ // Parse errors
168
+ kRegexpBadEscape, // bad escape sequence
169
+ kRegexpBadCharClass, // bad character class
170
+ kRegexpBadCharRange, // bad character class range
171
+ kRegexpMissingBracket, // missing closing ]
172
+ kRegexpMissingParen, // missing closing )
173
+ kRegexpTrailingBackslash, // at end of regexp
174
+ kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
175
+ kRegexpRepeatSize, // bad repetition argument
176
+ kRegexpRepeatOp, // bad repetition operator
177
+ kRegexpBadPerlOp, // bad perl operator
178
+ kRegexpBadUTF8, // invalid UTF-8 in regexp
179
+ kRegexpBadNamedCapture, // bad named capture
180
+ };
181
+
182
+ // Error status for certain operations.
183
+ class RegexpStatus {  // Status code plus the offending piece of the regexp.
184
+ public:
185
+ RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}  // Starts as success, owning no temporary string.
186
+ ~RegexpStatus() { delete tmp_; }  // Frees the owned tmp_ string, if any.
187
+
188
+ void set_code(enum RegexpStatusCode code) { code_ = code; }
189
+ void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }  // Keeps only the StringPiece; the bytes presumably must outlive this status (see tmp_) - confirm.
190
+ void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }  // Takes ownership of tmp, deleting any string held before.
191
+ enum RegexpStatusCode code() const { return code_; }
192
+ const StringPiece& error_arg() const { return error_arg_; }
193
+ bool ok() const { return code() == kRegexpSuccess; }  // True iff the code is kRegexpSuccess.
194
+
195
+ // Copies state from status.
196
+ void Copy(const RegexpStatus& status);
197
+
198
+ // Returns text equivalent of code, e.g.:
199
+ // "Bad character class"
200
+ static const string& CodeText(enum RegexpStatusCode code);
201
+
202
+ // Returns text describing error, e.g.:
203
+ // "Bad character class: [z-a]"
204
+ string Text() const;
205
+
206
+ private:
207
+ enum RegexpStatusCode code_; // Kind of error
208
+ StringPiece error_arg_; // Piece of regexp containing syntax error.
209
+ string* tmp_; // Temporary storage, possibly where error_arg_ is. Owned: deleted by set_tmp() and the destructor.
210
+
211
+ DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
212
+ };
213
+
214
+ // Walker to implement Simplify.
215
+ class SimplifyWalker;
216
+
217
+ // Compiled form; see prog.h
218
+ class Prog;
219
+
220
+ struct RuneRange {  // A span of runes running from lo to hi.
221
+ RuneRange() : lo(0), hi(0) { }  // Default range is lo == hi == 0.
222
+ RuneRange(int l, int h) : lo(l), hi(h) { }  // No check that l <= h is made here.
223
+ Rune lo;  // first rune of the span
224
+ Rune hi;  // last rune of the span (presumably inclusive, since RuneRangeLess treats a.hi == b.lo as overlap - confirm)
225
+ };
226
+
227
+ // Less-than on RuneRanges treats a == b if they overlap at all.
228
+ // This lets us look in a set to find the range covering a particular Rune.
229
+ struct RuneRangeLess {  // Comparator under which any two overlapping ranges are equivalent.
230
+ bool operator()(const RuneRange& a, const RuneRange& b) const {
231
+ return b.lo > a.hi;  // a orders first only when it ends strictly before b begins
232
+ }
233
+ };
234
+
235
+ class CharClassBuilder;
236
+
237
+ class CharClass {  // Character class stored as an array of RuneRanges; created through CharClassBuilder (a friend).
238
+ public:
239
+ void Delete();  // Presumably the way to release a CharClass, since the destructor is private and unimplemented - confirm.
240
+
241
+ typedef RuneRange* iterator;
242
+ iterator begin() { return ranges_; }  // first of the nranges_ ranges
243
+ iterator end() { return ranges_ + nranges_; }  // one past the last range
244
+
245
+ int size() { return nrunes_; }  // rune count (nrunes_), not the number of ranges
246
+ bool empty() { return nrunes_ == 0; }
247
+ bool full() { return nrunes_ == Runemax+1; }  // true when the count equals all of 0..Runemax
248
+ bool FoldsASCII() { return folds_ascii_; }
249
+
250
+ bool Contains(Rune r);
251
+ CharClass* Negate();
252
+
253
+ private:
254
+ CharClass(); // not implemented
255
+ ~CharClass(); // not implemented
256
+ static CharClass* New(int maxranges);  // private factory; only friends can create instances
257
+
258
+ friend class CharClassBuilder;
259
+
260
+ bool folds_ascii_;  // value reported by FoldsASCII()
261
+ int nrunes_;  // value reported by size()
262
+ RuneRange *ranges_;  // start of the range array walked by begin()/end()
263
+ int nranges_;  // number of entries in ranges_
264
+ DISALLOW_EVIL_CONSTRUCTORS(CharClass);
265
+ };
266
+
267
+ class Regexp {
268
+ public:
269
+
270
+ // Flags for parsing. Can be ORed together.
271
+ enum ParseFlags {
272
+ NoParseFlags = 0,  // No options selected.
273
+ FoldCase = 1<<0, // Fold case during matching (case-insensitive).
274
+ Literal = 1<<1, // Treat s as literal string instead of a regexp.
275
+ ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
276
+ // and [[:space:]] to match newline.
277
+ DotNL = 1<<3, // Allow . to match newline.
278
+ MatchNL = ClassNL | DotNL,  // Both newline-matching options at once.
279
+ OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
280
+ // end of text, not around embedded newlines.
281
+ // (Perl's default)
282
+ Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
283
+ NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
284
+ PerlClasses = 1<<7, // Allow Perl character classes like \d.
285
+ PerlB = 1<<8, // Allow Perl's \b and \B.
286
+ PerlX = 1<<9, // Perl extensions:
287
+ // non-capturing parens - (?: )
288
+ // non-greedy operators - *? +? ?? {}?
289
+ // flag edits - (?i) (?-i) (?i: )
290
+ // i - FoldCase
291
+ // m - !OneLine
292
+ // s - DotNL
293
+ // U - NonGreedy
294
+ // line ends: \A \z
295
+ // \Q and \E to disable/enable metacharacters
296
+ // (?P<name>expr) for named captures
297
+ // \C to match any single byte
298
+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
299
+ // and \P{Han} for its negation.
300
+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
301
+ // it explicitly.
302
+
303
+ // As close to Perl as we can get.
304
+ LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
305
+ UnicodeGroups,
306
+
307
+ // Internal use only. Highest bit in use; the flags are stored in the uint16 parse_flags_.
308
+ WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
309
+ };
310
+
311
+ // Get. No set, Regexps are logically immutable once created.
312
+ RegexpOp op() { return static_cast<RegexpOp>(op_); }
313
+ int nsub() { return nsub_; }
314
+ bool simple() { return simple_; }
315
+ enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
316
+ int Ref(); // For testing.
317
+
318
+ Regexp** sub() {
319
+ // With two or more subexpressions submany_ is the array itself;
320
+ // otherwise the single inline slot subone_ stands in as a
321
+ // one-element array.
322
+ return nsub_ > 1 ? submany_ : &subone_;
323
+ }
324
+
325
+ int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }  // Operand getters: each DCHECKs that op_ is the operator owning the field.
326
+ int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }  // kRegexpRepeat only
327
+ Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }  // kRegexpLiteral only
328
+ CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }  // kRegexpCharClass only
329
+ int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }  // kRegexpCapture only
330
+ const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }  // kRegexpCapture only
331
+ Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }  // kRegexpLiteralString only
332
+ int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }  // kRegexpLiteralString only
333
+ int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }  // kRegexpHaveMatch only
334
+
335
+ // Increments reference count, returns object as convenience.
336
+ Regexp* Incref();
337
+
338
+ // Decrements reference count and deletes this object if count reaches 0.
339
+ void Decref();
340
+
341
+ // Parses string s to produce regular expression, returned.
342
+ // Caller must release return value with re->Decref().
343
+ // On failure, sets *status (if status != NULL) and returns NULL.
344
+ static Regexp* Parse(const StringPiece& s, ParseFlags flags,
345
+ RegexpStatus* status);
346
+
347
+ // Returns a _new_ simplified version of the current regexp.
348
+ // Does not edit the current regexp.
349
+ // Caller must release return value with re->Decref().
350
+ // Simplified means that counted repetition has been rewritten
351
+ // into simpler terms and all Perl/POSIX features have been
352
+ // removed. The result will capture exactly the same
353
+ // subexpressions the original did, unless formatted with ToString.
354
+ Regexp* Simplify();
355
+ friend class SimplifyWalker;
356
+
357
+ // Parses the regexp src and then simplifies it and sets *dst to the
358
+ // string representation of the simplified form. Returns true on success.
359
+ // Returns false and sets *status (if status != NULL) on parse error.
360
+ static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
361
+ string* dst,
362
+ RegexpStatus* status);
363
+
364
+ // Returns the number of capturing groups in the regexp.
365
+ int NumCaptures();
366
+ friend class NumCapturesWalker;
367
+
368
+ // Returns a map from names to capturing group indices,
369
+ // or NULL if the regexp contains no named capture groups.
370
+ // The caller is responsible for deleting the map.
371
+ map<string, int>* NamedCaptures();
372
+
373
+ // Returns a map from capturing group indices to capturing group
374
+ // names or NULL if the regexp contains no named capture groups. The
375
+ // caller is responsible for deleting the map.
376
+ map<int, string>* CaptureNames();
377
+
378
+ // Returns a string representation of the current regexp,
379
+ // using as few parentheses as possible.
380
+ string ToString();
381
+
382
+ // Convenience functions. They consume the passed reference,
383
+ // so in many cases you should use, e.g., Plus(re->Incref(), flags).
384
+ // They do not consume allocated arrays like subs or runes.
385
+ static Regexp* Plus(Regexp* sub, ParseFlags flags);
386
+ static Regexp* Star(Regexp* sub, ParseFlags flags);
387
+ static Regexp* Quest(Regexp* sub, ParseFlags flags);
388
+ static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
389
+ static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
390
+ static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
391
+ static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
392
+ static Regexp* NewLiteral(Rune rune, ParseFlags flags);
393
+ static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
394
+ static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
395
+ static Regexp* HaveMatch(int match_id, ParseFlags flags);
396
+
397
+ // Like Alternate but does not factor out common prefixes.
398
+ static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
399
+
400
+ // Debugging function. Returns string format for regexp
401
+ // that makes structure clear. Does NOT use regexp syntax.
402
+ string Dump();
403
+
404
+ // Helper traversal class, defined fully in walker-inl.h.
405
+ template<typename T> class Walker;
406
+
407
+ // Compile to Prog. See prog.h
408
+ // Reverse prog expects to be run over text backward.
409
+ // Construction and execution of prog will
410
+ // stay within approximately max_mem bytes of memory.
411
+ // If max_mem <= 0, a reasonable default is used.
412
+ Prog* CompileToProg(int64 max_mem);
413
+ Prog* CompileToReverseProg(int64 max_mem);
414
+
415
+ // Whether to expect this library to find exactly the same answer as PCRE
416
+ // when running this regexp. Most regexps do mimic PCRE exactly, but a few
417
+ // obscure cases behave differently. Technically this is more a property
418
+ // of the Prog than the Regexp, but the computation is much easier to do
419
+ // on the Regexp. See mimics_pcre.cc for the exact conditions.
420
+ bool MimicsPCRE();
421
+
422
+ // Benchmarking function.
423
+ void NullWalk();
424
+
425
+ // Whether every match of this regexp must be anchored and
426
+ // begin with a non-empty fixed string (perhaps after ASCII
427
+ // case-folding). If so, returns the prefix and the sub-regexp that
428
+ // follows it.
429
+ bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
430
+
431
+ private:
432
+ // Constructor allocates vectors as appropriate for operator.
433
+ explicit Regexp(RegexpOp op, ParseFlags parse_flags);
434
+
435
+ // Use Decref() instead of delete to release Regexps.
436
+ // This is private to catch deletes at compile time.
437
+ ~Regexp();
438
+ void Destroy();
439
+ bool QuickDestroy();
440
+
441
+ // Helpers for Parse. Listed here so they can edit Regexps.
442
+ class ParseState;
443
+ friend class ParseState;
444
+ friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
445
+ RegexpStatus* status);
446
+
447
+ // Helper for testing [sic].
448
+ friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
449
+
450
+ // Computes whether Regexp is already simple.
451
+ bool ComputeSimple();
452
+
453
+ // Constructor that generates a concatenation or alternation,
454
+ // enforcing the limit on the number of subexpressions for
455
+ // a particular Regexp.
456
+ static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
457
+ ParseFlags flags, bool can_factor);
458
+
459
+ // Returns the leading string that re starts with.
460
+ // The returned Rune* points into a piece of re,
461
+ // so it must not be used after the caller calls re->Decref().
462
+ static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
463
+
464
+ // Removes the first n leading runes from the beginning of re.
465
+ // Edits re in place.
466
+ static void RemoveLeadingString(Regexp* re, int n);
467
+
468
+ // Returns the leading regexp in re's top-level concatenation.
469
+ // The returned Regexp* points at re or a sub-expression of re,
470
+ // so it must not be used after the caller calls re->Decref().
471
+ static Regexp* LeadingRegexp(Regexp* re);
472
+
473
+ // Removes LeadingRegexp(re) from re and returns the remainder.
474
+ // Might edit re in place.
475
+ static Regexp* RemoveLeadingRegexp(Regexp* re);
476
+
477
+ // Simplifies an alternation of literal strings by factoring out
478
+ // common prefixes.
479
+ static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
480
+ static int FactorAlternationRecursive(Regexp** sub, int nsub,
481
+ ParseFlags flags, int maxdepth);
482
+
483
+ // Is a == b? Only efficient on regexps that have not been through
484
+ // Simplify yet - the expansion of a kRegexpRepeat will make this
485
+ // take a long time. Do not call on such regexps, hence private.
486
+ static bool Equal(Regexp* a, Regexp* b);
487
+
488
+ // Allocate space for n sub-regexps.
489
+ void AllocSub(int n) {
490
+ const bool fits_uint16 = n >= 0 && static_cast<uint16>(n) == n;  // nsub_ is a uint16
491
+ if (!fits_uint16) LOG(FATAL) << "Cannot AllocSub " << n;
492
+ // Only two or more subexpressions get an array; sub() hands out the inline subone_ slot otherwise.
493
+ if (n > 1) submany_ = new Regexp*[n];
494
+ nsub_ = n;
495
+ }
496
+
497
+ // Add Rune to LiteralString
498
+ void AddRuneToString(Rune r);
499
+
500
+ // Swaps this with that, in place.
501
+ void Swap(Regexp *that);
502
+
503
+ // Operator. See description of operators above.
504
+ // uint8 instead of RegexpOp to control space usage.
505
+ uint8 op_;
506
+
507
+ // Is this regexp structure already simple
508
+ // (has it been returned by Simplify)?
509
+ // uint8 instead of bool to control space usage.
510
+ uint8 simple_;
511
+
512
+ // Flags saved from parsing and used during execution.
513
+ // (Only FoldCase is used.)
514
+ // uint16 instead of ParseFlags to control space usage.
515
+ uint16 parse_flags_;
516
+
517
+ // Reference count. Exists so that SimplifyRegexp can build
518
+ // regexp structures that are dags rather than trees to avoid
519
+ // exponential blowup in space requirements.
520
+ // uint16 to control space usage.
521
+ // The standard regexp routines will never generate a
522
+ // ref greater than the maximum repeat count (100),
523
+ // but even so, Incref and Decref consult an overflow map
524
+ // when ref_ reaches kMaxRef.
525
+ uint16 ref_;
526
+ static const uint16 kMaxRef = 0xffff;
527
+
528
+ // Subexpressions.
529
+ // uint16 to control space usage.
530
+ // Concat and Alternate handle larger numbers of subexpressions
531
+ // by building concatenation or alternation trees.
532
+ // Other routines should call Concat or Alternate instead of
533
+ // filling in sub() by hand.
534
+ uint16 nsub_;
535
+ static const uint16 kMaxNsub = 0xffff;
536
+ union {
537
+ Regexp** submany_; // if nsub_ > 1
538
+ Regexp* subone_; // if nsub_ == 1
539
+ };
540
+
541
+ // Extra space for parse and teardown stacks.
542
+ Regexp* down_;
543
+
544
+ // Arguments to operator. See description of operators above.
545
+ union {
546
+ struct { // Repeat
547
+ int max_;
548
+ int min_;
549
+ };
550
+ struct { // Capture
551
+ int cap_;
552
+ string* name_;
553
+ };
554
+ struct { // LiteralString
555
+ int nrunes_;
556
+ Rune* runes_;
557
+ };
558
+ struct { // CharClass
559
+ // These two could be in separate union members,
560
+ // but it wouldn't save any space (there are other two-word structs)
561
+ // and keeping them separate avoids confusion during parsing.
562
+ CharClass* cc_;
563
+ CharClassBuilder* ccb_;
564
+ };
565
+ Rune rune_; // Literal
566
+ int match_id_; // HaveMatch
567
+ void *the_union_[2]; // as big as any other element, for memset
568
+ };
569
+
570
+ DISALLOW_EVIL_CONSTRUCTORS(Regexp);
571
+ };
572
+
573
+ // Character class set: contains non-overlapping, non-abutting RuneRanges.
574
+ typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
575
+
576
+ class CharClassBuilder {  // Mutable character class backed by a RuneRangeSet; GetCharClass() yields a CharClass.
577
+ public:
578
+ CharClassBuilder();
579
+
580
+ typedef RuneRangeSet::iterator iterator;
581
+ iterator begin() { return ranges_.begin(); }  // iterates the ranges in set order
582
+ iterator end() { return ranges_.end(); }
583
+
584
+ int size() { return nrunes_; }  // rune count (nrunes_), not the number of ranges
585
+ bool empty() { return nrunes_ == 0; }
586
+ bool full() { return nrunes_ == Runemax+1; }  // true when the count equals all of 0..Runemax
587
+
588
+ bool Contains(Rune r);
589
+ bool FoldsASCII();
590
+ bool AddRange(Rune lo, Rune hi); // returns whether class changed
591
+ CharClassBuilder* Copy();
592
+ void AddCharClass(CharClassBuilder* cc);
593
+ void Negate();
594
+ void RemoveAbove(Rune r);
595
+ CharClass* GetCharClass();
596
+ void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
597
+
598
+ private:
599
+ static const uint32 AlphaMask = (1<<26) - 1;  // low 26 bits set: one bit per letter of the bitmaps below
600
+ uint32 upper_; // bitmap of A-Z
601
+ uint32 lower_; // bitmap of a-z
602
+ int nrunes_;  // value reported by size()
603
+ RuneRangeSet ranges_;  // the ranges themselves, ordered by RuneRangeLess
604
+ DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
605
+ };
606
+
607
+ // Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
608
+ inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) {
609
+ const int merged = static_cast<int>(a) | static_cast<int>(b);  // union of both flag sets, computed on plain ints
610
+ return static_cast<Regexp::ParseFlags>(merged);
611
+ }
612
+
613
+ inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) {
614
+ const int toggled = static_cast<int>(a) ^ static_cast<int>(b);  // flags set in exactly one of a and b
615
+ return static_cast<Regexp::ParseFlags>(toggled);
616
+ }
617
+
618
+ inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) {
619
+ const int common = static_cast<int>(a) & static_cast<int>(b);  // flags set in both a and b
620
+ return static_cast<Regexp::ParseFlags>(common);
621
+ }
622
+
623
+ inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {  // Complement of a flag set, e.g. for flags & ~Regexp::FoldCase.
624
+ // Mask to bits 0-15 (the highest flag is WasDollar = 1<<15): ~int is negative, and converting a value outside the enum's range is undefined.
625
+ return static_cast<Regexp::ParseFlags>(~static_cast<int>(a) & 0xFFFF);
626
+ }
627
+
628
+
629
+
630
+ } // namespace re2
631
+
632
+ #endif // RE2_REGEXP_H__