chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/prog.h ADDED
@@ -0,0 +1,376 @@
1
+ // Copyright 2007 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Compiled representation of regular expressions.
6
+ // See regexp.h for the Regexp class, which represents a regular
7
+ // expression symbolically.
8
+
9
+ #ifndef RE2_PROG_H__
10
+ #define RE2_PROG_H__
11
+
12
+ #include "util/util.h"
13
+ #include "re2/re2.h"
14
+
15
+ namespace re2 {
16
+
17
+ // Simple fixed-size bitmap.
18
+ template<int Bits>
19
+ class Bitmap {
20
+ public:
21
+ Bitmap() { Reset(); }
22
+ int Size() { return Bits; }
23
+
24
+ void Reset() {
25
+ for (int i = 0; i < Words; i++)
26
+ w_[i] = 0;
27
+ }
28
+ bool Get(int k) const {
29
+ return w_[k >> WordLog] & (1<<(k & 31));
30
+ }
31
+ void Set(int k) {
32
+ w_[k >> WordLog] |= 1<<(k & 31);
33
+ }
34
+ void Clear(int k) {
35
+ w_[k >> WordLog] &= ~(1<<(k & 31));
36
+ }
37
+ uint32 Word(int i) const {
38
+ return w_[i];
39
+ }
40
+
41
+ private:
42
+ static const int WordLog = 5;
43
+ static const int Words = (Bits+31)/32;
44
+ uint32 w_[Words];
45
+ DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
46
+ };
47
+
48
+
49
+ // Opcodes for Inst
50
// The numeric values are spelled out because the opcode is packed into the
// 3 low bits of Inst::out_opcode_, so every value must stay within 0..7.
enum InstOp {
  kInstAlt        = 0,  // choose between out_ and out1_
  kInstAltMatch   = 1,  // Alt: out_ is [00-FF] and back, out1_ is match;
                        // or vice versa.
  kInstByteRange  = 2,  // next (possibly case-folded) byte must be
                        // in [lo_, hi_]
  kInstCapture    = 3,  // capturing parenthesis number cap_
  kInstEmptyWidth = 4,  // empty-width special (^ $ ...); bit(s) set in empty_
  kInstMatch      = 5,  // found a match!
  kInstNop        = 6,  // no-op; occasionally unavoidable
  kInstFail       = 7   // never match; occasionally unavoidable
};
60
+
61
+ // Bit flags for empty-width specials
62
// Each condition occupies its own bit so that several can be OR'ed together;
// kEmptyAllFlags is the union of the six individual bits.
enum EmptyOp {
  kEmptyBeginLine       = 0x01,  // ^ - beginning of line
  kEmptyEndLine         = 0x02,  // $ - end of line
  kEmptyBeginText       = 0x04,  // \A - beginning of text
  kEmptyEndText         = 0x08,  // \z - end of text
  kEmptyWordBoundary    = 0x10,  // \b - word boundary
  kEmptyNonWordBoundary = 0x20,  // \B - not \b
  kEmptyAllFlags        = 0x3F   // every flag above
};
71
+
72
+ class Regexp;
73
+
74
+ class DFA;
75
+ struct OneState;
76
+
77
+ // Compiled form of regexp program.
78
+ class Prog {
79
+ public:
80
+ Prog();
81
+ ~Prog();
82
+
83
+ // Single instruction in regexp program.
84
+ class Inst {
85
+ public:
86
+ Inst() : out_opcode_(0), out1_(0) { }
87
+
88
+ // Constructors per opcode
89
+ void InitAlt(uint32 out, uint32 out1);
90
+ void InitByteRange(int lo, int hi, int foldcase, uint32 out);
91
+ void InitCapture(int cap, uint32 out);
92
+ void InitEmptyWidth(EmptyOp empty, uint32 out);
93
+ void InitMatch(int id);
94
+ void InitNop(uint32 out);
95
+ void InitFail();
96
+
97
+ // Getters
98
+ int id(Prog* p) { return this - p->inst_; }
99
+ InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
100
+ int out() { return out_opcode_>>3; }
101
+ int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
102
+ int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
103
+ int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
104
+ int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
105
+ int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
106
+ int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
107
+ EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
108
+ bool greedy(Prog *p) {
109
+ DCHECK_EQ(opcode(), kInstAltMatch);
110
+ return p->inst(out())->opcode() == kInstByteRange;
111
+ }
112
+
113
+ // Does this inst (an kInstByteRange) match c?
114
+ inline bool Matches(int c) {
115
+ DCHECK_EQ(opcode(), kInstByteRange);
116
+ if (foldcase_ && 'A' <= c && c <= 'Z')
117
+ c += 'a' - 'A';
118
+ return lo_ <= c && c <= hi_;
119
+ }
120
+
121
+ // Returns string representation for debugging.
122
+ string Dump();
123
+
124
+ // Maximum instruction id.
125
+ // (Must fit in out_opcode_, and PatchList steals another bit.)
126
+ static const int kMaxInst = (1<<28) - 1;
127
+
128
+ private:
129
+ void set_opcode(InstOp opcode) {
130
+ out_opcode_ = (out()<<3) | opcode;
131
+ }
132
+
133
+ void set_out(int out) {
134
+ out_opcode_ = (out<<3) | opcode();
135
+ }
136
+
137
+ void set_out_opcode(int out, InstOp opcode) {
138
+ out_opcode_ = (out<<3) | opcode;
139
+ }
140
+
141
+ uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
142
+ union { // additional instruction arguments:
143
+ uint32 out1_; // opcode == kInstAlt
144
+ // alternate next instruction
145
+
146
+ int32 cap_; // opcode == kInstCapture
147
+ // Index of capture register (holds text
148
+ // position recorded by capturing parentheses).
149
+ // For \n (the submatch for the nth parentheses),
150
+ // the left parenthesis captures into register 2*n
151
+ // and the right one captures into register 2*n+1.
152
+
153
+ int32 match_id_; // opcode == kInstMatch
154
+ // Match ID to identify this match (for re2::Set).
155
+
156
+ struct { // opcode == kInstByteRange
157
+ uint8 lo_; // byte range is lo_-hi_ inclusive
158
+ uint8 hi_; //
159
+ uint8 foldcase_; // convert A-Z to a-z before checking range.
160
+ };
161
+
162
+ EmptyOp empty_; // opcode == kInstEmptyWidth
163
+ // empty_ is bitwise OR of kEmpty* flags above.
164
+ };
165
+
166
+ friend class Compiler;
167
+ friend struct PatchList;
168
+ friend class Prog;
169
+
170
+ DISALLOW_EVIL_CONSTRUCTORS(Inst);
171
+ };
172
+
173
+ // Whether to anchor the search.
174
+ enum Anchor {
175
+ kUnanchored, // match anywhere
176
+ kAnchored, // match only starting at beginning of text
177
+ };
178
+
179
+ // Kind of match to look for (for anchor != kFullMatch)
180
+ //
181
+ // kLongestMatch mode finds the overall longest
182
+ // match but still makes its submatch choices the way
183
+ // Perl would, not in the way prescribed by POSIX.
184
+ // The POSIX rules are much more expensive to implement,
185
+ // and no one has needed them.
186
+ //
187
+ // kFullMatch is not strictly necessary -- we could use
188
+ // kLongestMatch and then check the length of the match -- but
189
+ // the matching code can run faster if it knows to consider only
190
+ // full matches.
191
+ enum MatchKind {
192
+ kFirstMatch, // like Perl, PCRE
193
+ kLongestMatch, // like egrep or POSIX
194
+ kFullMatch, // match only entire text; implies anchor==kAnchored
195
+ kManyMatch // for SearchDFA, records set of matches
196
+ };
197
+
198
+ Inst *inst(int id) { return &inst_[id]; }
199
+ int start() { return start_; }
200
+ int start_unanchored() { return start_unanchored_; }
201
+ void set_start(int start) { start_ = start; }
202
+ void set_start_unanchored(int start) { start_unanchored_ = start; }
203
+ int64 size() { return size_; }
204
+ bool reversed() { return reversed_; }
205
+ void set_reversed(bool reversed) { reversed_ = reversed; }
206
+ int64 byte_inst_count() { return byte_inst_count_; }
207
+ const Bitmap<256>& byterange() { return byterange_; }
208
+ void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
209
+ int64 dfa_mem() { return dfa_mem_; }
210
+ int flags() { return flags_; }
211
+ void set_flags(int flags) { flags_ = flags; }
212
+ bool anchor_start() { return anchor_start_; }
213
+ void set_anchor_start(bool b) { anchor_start_ = b; }
214
+ bool anchor_end() { return anchor_end_; }
215
+ void set_anchor_end(bool b) { anchor_end_ = b; }
216
+ int bytemap_range() { return bytemap_range_; }
217
+ const uint8* bytemap() { return bytemap_; }
218
+
219
+ // Returns string representation of program for debugging.
220
+ string Dump();
221
+ string DumpUnanchored();
222
+
223
+ // Record that at some point in the prog, the bytes in the range
224
+ // lo-hi (inclusive) are treated as different from bytes outside the range.
225
+ // Tracking this lets the DFA collapse commonly-treated byte ranges
226
+ // when recording state pointers, greatly reducing its memory footprint.
227
+ void MarkByteRange(int lo, int hi);
228
+
229
+ // Returns the set of kEmpty flags that are in effect at
230
+ // position p within context.
231
+ static uint32 EmptyFlags(const StringPiece& context, const char* p);
232
+
233
+ // Returns whether byte c is a word character: ASCII only.
234
+ // Used by the implementation of \b and \B.
235
+ // This is not right for Unicode, but:
236
+ // - it's hard to get right in a byte-at-a-time matching world
237
+ // (the DFA has only one-byte lookahead).
238
+ // - even if the lookahead were possible, the Progs would be huge.
239
+ // This crude approximation is the same one PCRE uses.
240
+ static bool IsWordChar(uint8 c) {
241
+ return ('A' <= c && c <= 'Z') ||
242
+ ('a' <= c && c <= 'z') ||
243
+ ('0' <= c && c <= '9') ||
244
+ c == '_';
245
+ }
246
+
247
+ // Execution engines. They all search for the regexp (run the prog)
248
+ // in text, which is in the larger context (used for ^ $ \b etc).
249
+ // Anchor and kind control the kind of search.
250
+ // Returns true if match found, false if not.
251
+ // If match found, fills match[0..nmatch-1] with submatch info.
252
+ // match[0] is overall match, match[1] is first set of parens, etc.
253
+ // If a particular submatch is not matched during the regexp match,
254
+ // it is set to NULL.
255
+ //
256
+ // Matching text == StringPiece(NULL, 0) is treated as any other empty
257
+ // string, but note that on return, it will not be possible to distinguish
258
+ // submatches that matched that empty string from submatches that didn't
259
+ // match anything. Either way, match[i] == NULL.
260
+
261
+ // Search using NFA: can find submatches but kind of slow.
262
+ bool SearchNFA(const StringPiece& text, const StringPiece& context,
263
+ Anchor anchor, MatchKind kind,
264
+ StringPiece* match, int nmatch);
265
+
266
+ // Search using DFA: much faster than NFA but only finds
267
+ // end of match and can use a lot more memory.
268
+ // Returns whether a match was found.
269
+ // If the DFA runs out of memory, sets *failed to true and returns false.
270
+ // If matches != NULL and kind == kManyMatch and there is a match,
271
+ // SearchDFA fills matches with the match IDs of the final matching state.
272
+ bool SearchDFA(const StringPiece& text, const StringPiece& context,
273
+ Anchor anchor, MatchKind kind,
274
+ StringPiece* match0, bool* failed,
275
+ vector<int>* matches);
276
+
277
+ // Build the entire DFA for the given match kind. FOR TESTING ONLY.
278
+ // Usually the DFA is built out incrementally, as needed, which
279
+ // avoids lots of unnecessary work. This function is useful only
280
+ // for testing purposes. Returns number of states.
281
+ int BuildEntireDFA(MatchKind kind);
282
+
283
+ // Compute byte map.
284
+ void ComputeByteMap();
285
+
286
+ // Run peep-hole optimizer on program.
287
+ void Optimize();
288
+
289
+ // One-pass NFA: only correct if IsOnePass() is true,
290
+ // but much faster than NFA (competitive with PCRE)
291
+ // for those expressions.
292
+ bool IsOnePass();
293
+ bool SearchOnePass(const StringPiece& text, const StringPiece& context,
294
+ Anchor anchor, MatchKind kind,
295
+ StringPiece* match, int nmatch);
296
+
297
+ // Bit-state backtracking. Fast on small cases but uses memory
298
+ // proportional to the product of the program size and the text size.
299
+ bool SearchBitState(const StringPiece& text, const StringPiece& context,
300
+ Anchor anchor, MatchKind kind,
301
+ StringPiece* match, int nmatch);
302
+
303
+ static const int kMaxOnePassCapture = 5; // $0 through $4
304
+
305
+ // Backtracking search: the gold standard against which the other
306
+ // implementations are checked. FOR TESTING ONLY.
307
+ // It allocates a ton of memory to avoid running forever.
308
+ // It is also recursive, so can't use in production (will overflow stacks).
309
+ // The name "Unsafe" here is supposed to be a flag that
310
+ // you should not be using this function.
311
+ bool UnsafeSearchBacktrack(const StringPiece& text,
312
+ const StringPiece& context,
313
+ Anchor anchor, MatchKind kind,
314
+ StringPiece* match, int nmatch);
315
+
316
+ // Computes range for any strings matching regexp. The min and max can in
317
+ // some cases be arbitrarily precise, so the caller gets to specify the
318
+ // maximum desired length of string returned.
319
+ //
320
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
321
+ // string s that is an anchored match for this regexp satisfies
322
+ // min <= s && s <= max.
323
+ //
324
+ // Note that PossibleMatchRange() will only consider the first copy of an
325
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
326
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
327
+ // do not compile down to infinite repetitions.
328
+ //
329
+ // Returns true on success, false on error.
330
+ bool PossibleMatchRange(string* min, string* max, int maxlen);
331
+
332
+ // Compiles a collection of regexps to Prog. Each regexp will have
333
+ // its own Match instruction recording the index in the vector.
334
+ static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
335
+ Regexp* re);
336
+
337
+ private:
338
+ friend class Compiler;
339
+
340
+ DFA* GetDFA(MatchKind kind);
341
+
342
+ bool anchor_start_; // regexp has explicit start anchor
343
+ bool anchor_end_; // regexp has explicit end anchor
344
+ bool reversed_; // whether program runs backward over input
345
+ bool did_onepass_; // has IsOnePass been called?
346
+
347
+ int start_; // entry point for program
348
+ int start_unanchored_; // unanchored entry point for program
349
+ int size_; // number of instructions
350
+ int byte_inst_count_; // number of kInstByteRange instructions
351
+ int bytemap_range_; // bytemap_[x] < bytemap_range_
352
+ int flags_; // regexp parse flags
353
+ int onepass_statesize_; // byte size of each OneState* node
354
+
355
+ Inst* inst_; // pointer to instruction array
356
+
357
+ Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
358
+ DFA* volatile dfa_first_; // DFA cached for kFirstMatch
359
+ DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
360
+ int64 dfa_mem_; // Maximum memory for DFAs.
361
+ void (*delete_dfa_)(DFA* dfa);
362
+
363
+ Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
364
+ // commonly-treated byte range.
365
+ uint8 bytemap_[256]; // map from input bytes to byte classes
366
+ uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
367
+
368
+ uint8* onepass_nodes_; // data for OnePass nodes
369
+ OneState* onepass_start_; // start node for OnePass program
370
+
371
+ DISALLOW_EVIL_CONSTRUCTORS(Prog);
372
+ };
373
+
374
+ } // namespace re2
375
+
376
+ #endif // RE2_PROG_H__