chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/prog.h ADDED
@@ -0,0 +1,376 @@
1
+ // Copyright 2007 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Compiled representation of regular expressions.
6
+ // See regexp.h for the Regexp class, which represents a regular
7
+ // expression symbolically.
8
+
9
+ #ifndef RE2_PROG_H__
10
+ #define RE2_PROG_H__
11
+
12
+ #include "util/util.h"
13
+ #include "re2/re2.h"
14
+
15
+ namespace re2 {
16
+
17
+ // Simple fixed-size bitmap.
18
+ template<int Bits>
19
+ class Bitmap {
20
+ public:
21
+ Bitmap() { Reset(); }
22
+ int Size() { return Bits; }
23
+
24
+ void Reset() {
25
+ for (int i = 0; i < Words; i++)
26
+ w_[i] = 0;
27
+ }
28
+ bool Get(int k) const {
29
+ return w_[k >> WordLog] & (1<<(k & 31));
30
+ }
31
+ void Set(int k) {
32
+ w_[k >> WordLog] |= 1<<(k & 31);
33
+ }
34
+ void Clear(int k) {
35
+ w_[k >> WordLog] &= ~(1<<(k & 31));
36
+ }
37
+ uint32 Word(int i) const {
38
+ return w_[i];
39
+ }
40
+
41
+ private:
42
+ static const int WordLog = 5;
43
+ static const int Words = (Bits+31)/32;
44
+ uint32 w_[Words];
45
+ DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
46
+ };
47
+
48
+
49
+ // Opcodes for Inst
50
+ enum InstOp {
51
+ kInstAlt = 0, // choose between out_ and out1_
52
+ kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
53
+ kInstByteRange, // next (possibly case-folded) byte must be in [lo_, hi_]
54
+ kInstCapture, // capturing parenthesis number cap_
55
+ kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
56
+ kInstMatch, // found a match!
57
+ kInstNop, // no-op; occasionally unavoidable
58
+ kInstFail, // never match; occasionally unavoidable
59
+ };
60
+
61
+ // Bit flags for empty-width specials
62
+ enum EmptyOp {
63
+ kEmptyBeginLine = 1<<0, // ^ - beginning of line
64
+ kEmptyEndLine = 1<<1, // $ - end of line
65
+ kEmptyBeginText = 1<<2, // \A - beginning of text
66
+ kEmptyEndText = 1<<3, // \z - end of text
67
+ kEmptyWordBoundary = 1<<4, // \b - word boundary
68
+ kEmptyNonWordBoundary = 1<<5, // \B - not \b
69
+ kEmptyAllFlags = (1<<6)-1,
70
+ };
71
+
72
+ class Regexp;
73
+
74
+ class DFA;
75
+ struct OneState;
76
+
77
+ // Compiled form of regexp program.
78
+ class Prog {
79
+ public:
80
+ Prog();
81
+ ~Prog();
82
+
83
+ // Single instruction in regexp program.
84
+ class Inst {
85
+ public:
86
+ Inst() : out_opcode_(0), out1_(0) { }
87
+
88
+ // Constructors per opcode
89
+ void InitAlt(uint32 out, uint32 out1);
90
+ void InitByteRange(int lo, int hi, int foldcase, uint32 out);
91
+ void InitCapture(int cap, uint32 out);
92
+ void InitEmptyWidth(EmptyOp empty, uint32 out);
93
+ void InitMatch(int id);
94
+ void InitNop(uint32 out);
95
+ void InitFail();
96
+
97
+ // Getters
98
+ int id(Prog* p) { return this - p->inst_; }
99
+ InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
100
+ int out() { return out_opcode_>>3; }
101
+ int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
102
+ int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
103
+ int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
104
+ int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
105
+ int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
106
+ int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
107
+ EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
108
+ bool greedy(Prog *p) {
109
+ DCHECK_EQ(opcode(), kInstAltMatch);
110
+ return p->inst(out())->opcode() == kInstByteRange;
111
+ }
112
+
113
+ // Does this inst (a kInstByteRange) match c?
114
+ inline bool Matches(int c) {
115
+ DCHECK_EQ(opcode(), kInstByteRange);
116
+ if (foldcase_ && 'A' <= c && c <= 'Z')
117
+ c += 'a' - 'A';
118
+ return lo_ <= c && c <= hi_;
119
+ }
120
+
121
+ // Returns string representation for debugging.
122
+ string Dump();
123
+
124
+ // Maximum instruction id.
125
+ // (Must fit in out_opcode_, and PatchList steals another bit.)
126
+ static const int kMaxInst = (1<<28) - 1;
127
+
128
+ private:
129
+ void set_opcode(InstOp opcode) {
130
+ out_opcode_ = (out()<<3) | opcode;
131
+ }
132
+
133
+ void set_out(int out) {
134
+ out_opcode_ = (out<<3) | opcode();
135
+ }
136
+
137
+ void set_out_opcode(int out, InstOp opcode) {
138
+ out_opcode_ = (out<<3) | opcode;
139
+ }
140
+
141
+ uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
142
+ union { // additional instruction arguments:
143
+ uint32 out1_; // opcode == kInstAlt
144
+ // alternate next instruction
145
+
146
+ int32 cap_; // opcode == kInstCapture
147
+ // Index of capture register (holds text
148
+ // position recorded by capturing parentheses).
149
+ // For \n (the submatch for the nth parentheses),
150
+ // the left parenthesis captures into register 2*n
151
+ // and the right one captures into register 2*n+1.
152
+
153
+ int32 match_id_; // opcode == kInstMatch
154
+ // Match ID to identify this match (for re2::Set).
155
+
156
+ struct { // opcode == kInstByteRange
157
+ uint8 lo_; // byte range is lo_-hi_ inclusive
158
+ uint8 hi_; //
159
+ uint8 foldcase_; // convert A-Z to a-z before checking range.
160
+ };
161
+
162
+ EmptyOp empty_; // opcode == kInstEmptyWidth
163
+ // empty_ is bitwise OR of kEmpty* flags above.
164
+ };
165
+
166
+ friend class Compiler;
167
+ friend struct PatchList;
168
+ friend class Prog;
169
+
170
+ DISALLOW_EVIL_CONSTRUCTORS(Inst);
171
+ };
172
+
173
+ // Whether to anchor the search.
174
+ enum Anchor {
175
+ kUnanchored, // match anywhere
176
+ kAnchored, // match only starting at beginning of text
177
+ };
178
+
179
+ // Kind of match to look for (for anchor != kFullMatch)
180
+ //
181
+ // kLongestMatch mode finds the overall longest
182
+ // match but still makes its submatch choices the way
183
+ // Perl would, not in the way prescribed by POSIX.
184
+ // The POSIX rules are much more expensive to implement,
185
+ // and no one has needed them.
186
+ //
187
+ // kFullMatch is not strictly necessary -- we could use
188
+ // kLongestMatch and then check the length of the match -- but
189
+ // the matching code can run faster if it knows to consider only
190
+ // full matches.
191
+ enum MatchKind {
192
+ kFirstMatch, // like Perl, PCRE
193
+ kLongestMatch, // like egrep or POSIX
194
+ kFullMatch, // match only entire text; implies anchor==kAnchored
195
+ kManyMatch // for SearchDFA, records set of matches
196
+ };
197
+
198
+ Inst *inst(int id) { return &inst_[id]; }
199
+ int start() { return start_; }
200
+ int start_unanchored() { return start_unanchored_; }
201
+ void set_start(int start) { start_ = start; }
202
+ void set_start_unanchored(int start) { start_unanchored_ = start; }
203
+ int64 size() { return size_; }
204
+ bool reversed() { return reversed_; }
205
+ void set_reversed(bool reversed) { reversed_ = reversed; }
206
+ int64 byte_inst_count() { return byte_inst_count_; }
207
+ const Bitmap<256>& byterange() { return byterange_; }
208
+ void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
209
+ int64 dfa_mem() { return dfa_mem_; }
210
+ int flags() { return flags_; }
211
+ void set_flags(int flags) { flags_ = flags; }
212
+ bool anchor_start() { return anchor_start_; }
213
+ void set_anchor_start(bool b) { anchor_start_ = b; }
214
+ bool anchor_end() { return anchor_end_; }
215
+ void set_anchor_end(bool b) { anchor_end_ = b; }
216
+ int bytemap_range() { return bytemap_range_; }
217
+ const uint8* bytemap() { return bytemap_; }
218
+
219
+ // Returns string representation of program for debugging.
220
+ string Dump();
221
+ string DumpUnanchored();
222
+
223
+ // Record that at some point in the prog, the bytes in the range
224
+ // lo-hi (inclusive) are treated as different from bytes outside the range.
225
+ // Tracking this lets the DFA collapse commonly-treated byte ranges
226
+ // when recording state pointers, greatly reducing its memory footprint.
227
+ void MarkByteRange(int lo, int hi);
228
+
229
+ // Returns the set of kEmpty flags that are in effect at
230
+ // position p within context.
231
+ static uint32 EmptyFlags(const StringPiece& context, const char* p);
232
+
233
+ // Returns whether byte c is a word character: ASCII only.
234
+ // Used by the implementation of \b and \B.
235
+ // This is not right for Unicode, but:
236
+ // - it's hard to get right in a byte-at-a-time matching world
237
+ // (the DFA has only one-byte lookahead).
238
+ // - even if the lookahead were possible, the Progs would be huge.
239
+ // This crude approximation is the same one PCRE uses.
240
+ static bool IsWordChar(uint8 c) {
241
+ return ('A' <= c && c <= 'Z') ||
242
+ ('a' <= c && c <= 'z') ||
243
+ ('0' <= c && c <= '9') ||
244
+ c == '_';
245
+ }
246
+
247
+ // Execution engines. They all search for the regexp (run the prog)
248
+ // in text, which is in the larger context (used for ^ $ \b etc).
249
+ // Anchor and kind control the kind of search.
250
+ // Returns true if match found, false if not.
251
+ // If match found, fills match[0..nmatch-1] with submatch info.
252
+ // match[0] is overall match, match[1] is first set of parens, etc.
253
+ // If a particular submatch is not matched during the regexp match,
254
+ // it is set to NULL.
255
+ //
256
+ // Matching text == StringPiece(NULL, 0) is treated as any other empty
257
+ // string, but note that on return, it will not be possible to distinguish
258
+ // submatches that matched that empty string from submatches that didn't
259
+ // match anything. Either way, match[i] == NULL.
260
+
261
+ // Search using NFA: can find submatches but kind of slow.
262
+ bool SearchNFA(const StringPiece& text, const StringPiece& context,
263
+ Anchor anchor, MatchKind kind,
264
+ StringPiece* match, int nmatch);
265
+
266
+ // Search using DFA: much faster than NFA but only finds
267
+ // end of match and can use a lot more memory.
268
+ // Returns whether a match was found.
269
+ // If the DFA runs out of memory, sets *failed to true and returns false.
270
+ // If matches != NULL and kind == kManyMatch and there is a match,
271
+ // SearchDFA fills matches with the match IDs of the final matching state.
272
+ bool SearchDFA(const StringPiece& text, const StringPiece& context,
273
+ Anchor anchor, MatchKind kind,
274
+ StringPiece* match0, bool* failed,
275
+ vector<int>* matches);
276
+
277
+ // Build the entire DFA for the given match kind. FOR TESTING ONLY.
278
+ // Usually the DFA is built out incrementally, as needed, which
279
+ // avoids lots of unnecessary work. This function is useful only
280
+ // for testing purposes. Returns number of states.
281
+ int BuildEntireDFA(MatchKind kind);
282
+
283
+ // Compute byte map.
284
+ void ComputeByteMap();
285
+
286
+ // Run peep-hole optimizer on program.
287
+ void Optimize();
288
+
289
+ // One-pass NFA: only correct if IsOnePass() is true,
290
+ // but much faster than NFA (competitive with PCRE)
291
+ // for those expressions.
292
+ bool IsOnePass();
293
+ bool SearchOnePass(const StringPiece& text, const StringPiece& context,
294
+ Anchor anchor, MatchKind kind,
295
+ StringPiece* match, int nmatch);
296
+
297
+ // Bit-state backtracking. Fast on small cases but uses memory
298
+ // proportional to the product of the program size and the text size.
299
+ bool SearchBitState(const StringPiece& text, const StringPiece& context,
300
+ Anchor anchor, MatchKind kind,
301
+ StringPiece* match, int nmatch);
302
+
303
+ static const int kMaxOnePassCapture = 5; // $0 through $4
304
+
305
+ // Backtracking search: the gold standard against which the other
306
+ // implementations are checked. FOR TESTING ONLY.
307
+ // It allocates a ton of memory to avoid running forever.
308
+ // It is also recursive, so it can't be used in production (it will overflow stacks).
309
+ // The name "Unsafe" here is supposed to be a flag that
310
+ // you should not be using this function.
311
+ bool UnsafeSearchBacktrack(const StringPiece& text,
312
+ const StringPiece& context,
313
+ Anchor anchor, MatchKind kind,
314
+ StringPiece* match, int nmatch);
315
+
316
+ // Computes range for any strings matching regexp. The min and max can in
317
+ // some cases be arbitrarily precise, so the caller gets to specify the
318
+ // maximum desired length of string returned.
319
+ //
320
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
321
+ // string s that is an anchored match for this regexp satisfies
322
+ // min <= s && s <= max.
323
+ //
324
+ // Note that PossibleMatchRange() will only consider the first copy of an
325
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
326
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
327
+ // do not compile down to infinite repetitions.
328
+ //
329
+ // Returns true on success, false on error.
330
+ bool PossibleMatchRange(string* min, string* max, int maxlen);
331
+
332
+ // Compiles a collection of regexps to Prog. Each regexp will have
333
+ // its own Match instruction recording the index in the vector.
334
+ static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
335
+ Regexp* re);
336
+
337
+ private:
338
+ friend class Compiler;
339
+
340
+ DFA* GetDFA(MatchKind kind);
341
+
342
+ bool anchor_start_; // regexp has explicit start anchor
343
+ bool anchor_end_; // regexp has explicit end anchor
344
+ bool reversed_; // whether program runs backward over input
345
+ bool did_onepass_; // has IsOnePass been called?
346
+
347
+ int start_; // entry point for program
348
+ int start_unanchored_; // unanchored entry point for program
349
+ int size_; // number of instructions
350
+ int byte_inst_count_; // number of kInstByteRange instructions
351
+ int bytemap_range_; // bytemap_[x] < bytemap_range_
352
+ int flags_; // regexp parse flags
353
+ int onepass_statesize_; // byte size of each OneState* node
354
+
355
+ Inst* inst_; // pointer to instruction array
356
+
357
+ Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
358
+ DFA* volatile dfa_first_; // DFA cached for kFirstMatch
359
+ DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
360
+ int64 dfa_mem_; // Maximum memory for DFAs.
361
+ void (*delete_dfa_)(DFA* dfa);
362
+
363
+ Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
364
+ // commonly-treated byte range.
365
+ uint8 bytemap_[256]; // map from input bytes to byte classes
366
+ uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
367
+
368
+ uint8* onepass_nodes_; // data for OnePass nodes
369
+ OneState* onepass_start_; // start node for OnePass program
370
+
371
+ DISALLOW_EVIL_CONSTRUCTORS(Prog);
372
+ };
373
+
374
+ } // namespace re2
375
+
376
+ #endif // RE2_PROG_H__