chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/dfa.cc ADDED
@@ -0,0 +1,2086 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // A DFA (deterministic finite automaton)-based regular expression search.
6
+ //
7
+ // The DFA search has two main parts: the construction of the automaton,
8
+ // which is represented by a graph of State structures, and the execution
9
+ // of the automaton over a given input string.
10
+ //
11
+ // The basic idea is that the State graph is constructed so that the
12
+ // execution can simply start with a state s, and then for each byte c in
13
+ // the input string, execute "s = s->next[c]", checking at each point whether
14
+ // the current s represents a matching state.
15
+ //
16
+ // The simple explanation just given does convey the essence of this code,
17
+ // but it omits the details of how the State graph gets constructed as well
18
+ // as some performance-driven optimizations to the execution of the automaton.
19
+ // All these details are explained in the comments for the code following
20
+ // the definition of class DFA.
21
+ //
22
+ // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
23
+
24
+ #include "re2/prog.h"
25
+ #include "re2/stringpiece.h"
26
+ #include "util/atomicops.h"
27
+ #include "util/flags.h"
28
+ #include "util/sparse_set.h"
29
+
30
+ DEFINE_bool(re2_dfa_bail_when_slow, true,
31
+ "Whether the RE2 DFA should bail out early "
32
+ "if the NFA would be faster (for testing).");
33
+
34
+ namespace re2 {
35
+
36
+ #if !defined(__linux__) /* only Linux seems to have memrchr */
37
+ static void* memrchr(const void* s, int c, size_t n) {
38
+ const unsigned char* p = (const unsigned char*)s;
39
+ for (p += n; n > 0; n--)
40
+ if (*--p == c)
41
+ return (void*)p;
42
+
43
+ return NULL;
44
+ }
45
+ #endif
46
+
47
+ // Changing this to true compiles in prints that trace execution of the DFA.
48
+ // Generates a lot of output -- only useful for debugging.
49
+ static const bool DebugDFA = false;
50
+
51
+ // A DFA implementation of a regular expression program.
52
+ // Since this is entirely a forward declaration mandated by C++,
53
+ // some of the comments here are better understood after reading
54
+ // the comments in the sections that follow the DFA definition.
55
+ class DFA {
56
+ public:
57
+ DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem);
58
+ ~DFA();
59
+ bool ok() const { return !init_failed_; }
60
+ Prog::MatchKind kind() { return kind_; }
61
+
62
+ // Searches for the regular expression in text, which is considered
63
+ // as a subsection of context for the purposes of interpreting flags
64
+ // like ^ and $ and \A and \z.
65
+ // Returns whether a match was found.
66
+ // If a match is found, sets *ep to the end point of the best match in text.
67
+ // If "anchored", the match must begin at the start of text.
68
+ // If "want_earliest_match", the match that ends first is used, not
69
+ // necessarily the best one.
70
+ // If "run_forward" is true, the DFA runs from text.begin() to text.end().
71
+ // If it is false, the DFA runs from text.end() to text.begin(),
72
+ // returning the leftmost end of the match instead of the rightmost one.
73
+ // If the DFA cannot complete the search (for example, if it is out of
74
+ // memory), it sets *failed and returns false.
75
+ bool Search(const StringPiece& text, const StringPiece& context,
76
+ bool anchored, bool want_earliest_match, bool run_forward,
77
+ bool* failed, const char** ep, vector<int>* matches);
78
+
79
+ // Builds out all states for the entire DFA. FOR TESTING ONLY
80
+ // Returns number of states.
81
+ int BuildAllStates();
82
+
83
+ // Computes min and max for matching strings. Won't return strings
84
+ // bigger than maxlen.
85
+ bool PossibleMatchRange(string* min, string* max, int maxlen);
86
+
87
+ // These data structures are logically private, but C++ makes it too
88
+ // difficult to mark them as such.
89
+ class Workq;
90
+ class RWLocker;
91
+ class StateSaver;
92
+
93
+ // A single DFA state. The DFA is represented as a graph of these
94
+ // States, linked by the next_ pointers. If in state s and reading
95
+ // byte c, the next state should be s->next_[c].
96
+ struct State {
97
+ inline bool IsMatch() const { return flag_ & kFlagMatch; }
98
+ void SaveMatch(vector<int>* v);
99
+
100
+ int* inst_; // Instruction pointers in the state.
101
+ int ninst_; // # of inst_ pointers.
102
+ uint flag_; // Empty string bitfield flags in effect on the way
103
+ // into this state, along with kFlagMatch if this
104
+ // is a matching state.
105
+ State** next_; // Outgoing arrows from State,
106
+ // one per input byte class
107
+ };
108
+
109
+ enum {
110
+ kByteEndText = 256, // imaginary byte at end of text
111
+
112
+ kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags
113
+ kFlagMatch = 0x1000, // State.flag_: this is a matching state
114
+ kFlagLastWord = 0x2000, // State.flag_: last byte was a word char
115
+ kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
116
+ };
117
+
118
+ // STL function structures for use with unordered_set.
119
+ struct StateEqual {
120
+ bool operator()(const State* a, const State* b) const {
121
+ if (a == b)
122
+ return true;
123
+ if (a == NULL || b == NULL)
124
+ return false;
125
+ if (a->ninst_ != b->ninst_)
126
+ return false;
127
+ if (a->flag_ != b->flag_)
128
+ return false;
129
+ for (int i = 0; i < a->ninst_; i++)
130
+ if (a->inst_[i] != b->inst_[i])
131
+ return false;
132
+ return true; // they're equal
133
+ }
134
+ };
135
+ struct StateHash {
136
+ size_t operator()(const State* a) const {
137
+ if (a == NULL)
138
+ return 0;
139
+ const char* s = reinterpret_cast<const char*>(a->inst_);
140
+ int len = a->ninst_ * sizeof a->inst_[0];
141
+ if (sizeof(size_t) == sizeof(uint32))
142
+ return Hash32StringWithSeed(s, len, a->flag_);
143
+ else
144
+ return Hash64StringWithSeed(s, len, a->flag_);
145
+ }
146
+ };
147
+
148
+ typedef unordered_set<State*, StateHash, StateEqual> StateSet;
149
+
150
+
151
+ private:
152
+ // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.)
153
+ enum {
154
+ kFbUnknown = -1, // No analysis has been performed.
155
+ kFbMany = -2, // Many bytes will lead out of this state.
156
+ kFbNone = -3, // No bytes lead out of this state.
157
+ };
158
+
159
+ enum {
160
+ // Indices into start_ for unanchored searches.
161
+ // Add kStartAnchored for anchored searches.
162
+ kStartBeginText = 0, // text at beginning of context
163
+ kStartBeginLine = 2, // text at beginning of line
164
+ kStartAfterWordChar = 4, // text follows a word character
165
+ kStartAfterNonWordChar = 6, // text follows non-word character
166
+ kMaxStart = 8,
167
+
168
+ kStartAnchored = 1,
169
+ };
170
+
171
+ // Resets the DFA State cache, flushing all saved State* information.
172
+ // Releases and reacquires cache_mutex_ via cache_lock, so any
173
+ // State* existing before the call are not valid after the call.
174
+ // Use a StateSaver to preserve important states across the call.
175
+ // cache_mutex_.r <= L < mutex_
176
+ // After: cache_mutex_.w <= L < mutex_
177
+ void ResetCache(RWLocker* cache_lock);
178
+
179
+ // Looks up and returns the State corresponding to a Workq.
180
+ // L >= mutex_
181
+ State* WorkqToCachedState(Workq* q, uint flag);
182
+
183
+ // Looks up and returns a State matching the inst, ninst, and flag.
184
+ // L >= mutex_
185
+ State* CachedState(int* inst, int ninst, uint flag);
186
+
187
+ // Clear the cache entirely.
188
+ // Must hold cache_mutex_.w or be in destructor.
189
+ void ClearCache();
190
+
191
+ // Converts a State into a Workq: the opposite of WorkqToCachedState.
192
+ // L >= mutex_
193
+ static void StateToWorkq(State* s, Workq* q);
194
+
195
+ // Runs a State on a given byte, returning the next state.
196
+ State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
197
+ State* RunStateOnByte(State*, int); // L >= mutex_
198
+
199
+ // Runs a Workq on a given byte followed by a set of empty-string flags,
200
+ // producing a new Workq in nq. If a match instruction is encountered,
201
+ // sets *ismatch to true.
202
+ // L >= mutex_
203
+ void RunWorkqOnByte(Workq* q, Workq* nq,
204
+ int c, uint flag, bool* ismatch,
205
+ Prog::MatchKind kind,
206
+ int new_byte_loop);
207
+
208
+ // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
209
+ // L >= mutex_
210
+ void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag);
211
+
212
+ // Adds the instruction id to the Workq, following empty arrows
213
+ // according to flag.
214
+ // L >= mutex_
215
+ void AddToQueue(Workq* q, int id, uint flag);
216
+
217
+ // For debugging, returns a text representation of State.
218
+ static string DumpState(State* state);
219
+
220
+ // For debugging, returns a text representation of a Workq.
221
+ static string DumpWorkq(Workq* q);
222
+
223
+ // Search parameters
224
+ struct SearchParams {
225
+ SearchParams(const StringPiece& text, const StringPiece& context,
226
+ RWLocker* cache_lock)
227
+ : text(text), context(context),
228
+ anchored(false),
229
+ want_earliest_match(false),
230
+ run_forward(false),
231
+ start(NULL),
232
+ firstbyte(kFbUnknown),
233
+ cache_lock(cache_lock),
234
+ failed(false),
235
+ ep(NULL),
236
+ matches(NULL) { }
237
+
238
+ StringPiece text;
239
+ StringPiece context;
240
+ bool anchored;
241
+ bool want_earliest_match;
242
+ bool run_forward;
243
+ State* start;
244
+ int firstbyte;
245
+ RWLocker *cache_lock;
246
+ bool failed; // "out" parameter: whether search gave up
247
+ const char* ep; // "out" parameter: end pointer for match
248
+ vector<int>* matches;
249
+
250
+ private:
251
+ DISALLOW_EVIL_CONSTRUCTORS(SearchParams);
252
+ };
253
+
254
+ // Before each search, the parameters to Search are analyzed by
255
+ // AnalyzeSearch to determine the state in which to start and the
256
+ // "firstbyte" for that state, if any.
257
+ struct StartInfo {
258
+ StartInfo() : start(NULL), firstbyte(kFbUnknown) { }
259
+ State* start;
260
+ volatile int firstbyte;
261
+ };
262
+
263
+ // Fills in params->start and params->firstbyte using
264
+ // the other search parameters. Returns true on success,
265
+ // false on failure.
266
+ // cache_mutex_.r <= L < mutex_
267
+ bool AnalyzeSearch(SearchParams* params);
268
+ bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags);
269
+
270
+ // The generic search loop, inlined to create specialized versions.
271
+ // cache_mutex_.r <= L < mutex_
272
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
273
+ inline bool InlinedSearchLoop(SearchParams* params,
274
+ bool have_firstbyte,
275
+ bool want_earliest_match,
276
+ bool run_forward);
277
+
278
+ // The specialized versions of InlinedSearchLoop. The three letters
279
+ // at the ends of the name denote the true/false values used as the
280
+ // last three parameters of InlinedSearchLoop.
281
+ // cache_mutex_.r <= L < mutex_
282
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
283
+ bool SearchFFF(SearchParams* params);
284
+ bool SearchFFT(SearchParams* params);
285
+ bool SearchFTF(SearchParams* params);
286
+ bool SearchFTT(SearchParams* params);
287
+ bool SearchTFF(SearchParams* params);
288
+ bool SearchTFT(SearchParams* params);
289
+ bool SearchTTF(SearchParams* params);
290
+ bool SearchTTT(SearchParams* params);
291
+
292
+ // The main search loop: calls an appropriate specialized version of
293
+ // InlinedSearchLoop.
294
+ // cache_mutex_.r <= L < mutex_
295
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
296
+ bool FastSearchLoop(SearchParams* params);
297
+
298
+ // For debugging, a slow search loop that calls InlinedSearchLoop
299
+ // directly -- because the booleans passed are not constants, the
300
+ // loop is not specialized like the SearchFFF etc. versions, so it
301
+ // runs much more slowly. Useful only for debugging.
302
+ // cache_mutex_.r <= L < mutex_
303
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
304
+ bool SlowSearchLoop(SearchParams* params);
305
+
306
+ // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
307
+ int ByteMap(int c) {
308
+ if (c == kByteEndText)
309
+ return prog_->bytemap_range();
310
+ return prog_->bytemap()[c];
311
+ }
312
+
313
+ // Constant after initialization.
314
+ Prog* prog_; // The regular expression program to run.
315
+ Prog::MatchKind kind_; // The kind of DFA.
316
+ int start_unanchored_; // start of unanchored program
317
+ bool init_failed_; // initialization failed (out of memory)
318
+
319
+ Mutex mutex_; // mutex_ >= cache_mutex_.r
320
+
321
+ // Scratch areas, protected by mutex_.
322
+ Workq* q0_; // Two pre-allocated work queues.
323
+ Workq* q1_;
324
+ int* astack_; // Pre-allocated stack for AddToQueue
325
+ int nastack_;
326
+
327
+ // State* cache. Many threads use and add to the cache simultaneously,
328
+ // holding cache_mutex_ for reading and mutex_ (above) when adding.
329
+ // If the cache fills and needs to be discarded, the discarding is done
330
+ // while holding cache_mutex_ for writing, to avoid interrupting other
331
+ // readers. Any State* pointers are only valid while cache_mutex_
332
+ // is held.
333
+ Mutex cache_mutex_;
334
+ int64 mem_budget_; // Total memory budget for all States.
335
+ int64 state_budget_; // Amount of memory remaining for new States.
336
+ StateSet state_cache_; // All States computed so far.
337
+ StartInfo start_[kMaxStart];
338
+ bool cache_warned_; // have printed to LOG(INFO) about the cache
339
+ };
340
+
341
+ // Shorthand for casting to uint8*.
342
+ static inline const uint8* BytePtr(const void* v) {
343
+ return reinterpret_cast<const uint8*>(v);
344
+ }
345
+
346
+ // Work queues
347
+
348
+ // Marks separate thread groups of different priority
349
+ // in the work queue when in leftmost-longest matching mode.
350
+ #define Mark (-1)
351
+
352
+ // Internally, the DFA uses a sparse array of
353
+ // program instruction pointers as a work queue.
354
+ // In leftmost longest mode, marks separate sections
355
+ // of workq that started executing at different
356
+ // locations in the string (earlier locations first).
357
+ class DFA::Workq : public SparseSet {
358
+ public:
359
+ // Constructor: n is number of normal slots, maxmark number of mark slots.
360
+ Workq(int n, int maxmark) :
361
+ SparseSet(n+maxmark),
362
+ n_(n),
363
+ maxmark_(maxmark),
364
+ nextmark_(n),
365
+ last_was_mark_(true) {
366
+ }
367
+
368
+ bool is_mark(int i) { return i >= n_; }
369
+
370
+ int maxmark() { return maxmark_; }
371
+
372
+ void clear() {
373
+ SparseSet::clear();
374
+ nextmark_ = n_;
375
+ }
376
+
377
+ void mark() {
378
+ if (last_was_mark_)
379
+ return;
380
+ last_was_mark_ = false;
381
+ SparseSet::insert_new(nextmark_++);
382
+ }
383
+
384
+ int size() {
385
+ return n_ + maxmark_;
386
+ }
387
+
388
+ void insert(int id) {
389
+ if (contains(id))
390
+ return;
391
+ insert_new(id);
392
+ }
393
+
394
+ void insert_new(int id) {
395
+ last_was_mark_ = false;
396
+ SparseSet::insert_new(id);
397
+ }
398
+
399
+ private:
400
+ int n_; // size excluding marks
401
+ int maxmark_; // maximum number of marks
402
+ int nextmark_; // id of next mark
403
+ bool last_was_mark_; // last inserted was mark
404
+ DISALLOW_EVIL_CONSTRUCTORS(Workq);
405
+ };
406
+
407
+ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
408
+ : prog_(prog),
409
+ kind_(kind),
410
+ init_failed_(false),
411
+ q0_(NULL),
412
+ q1_(NULL),
413
+ astack_(NULL),
414
+ mem_budget_(max_mem),
415
+ cache_warned_(false) {
416
+ if (DebugDFA)
417
+ fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
418
+ int nmark = 0;
419
+ start_unanchored_ = 0;
420
+ if (kind_ == Prog::kLongestMatch) {
421
+ nmark = prog->size();
422
+ start_unanchored_ = prog->start_unanchored();
423
+ }
424
+ nastack_ = 2 * prog->size() + nmark;
425
+
426
+ // Account for space needed for DFA, q0, q1, astack.
427
+ mem_budget_ -= sizeof(DFA);
428
+ mem_budget_ -= (prog_->size() + nmark) *
429
+ (sizeof(int)+sizeof(int)) * 2; // q0, q1
430
+ mem_budget_ -= nastack_ * sizeof(int); // astack
431
+ if (mem_budget_ < 0) {
432
+ LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
433
+ prog_->size(), max_mem);
434
+ init_failed_ = true;
435
+ return;
436
+ }
437
+
438
+ state_budget_ = mem_budget_;
439
+
440
+ // Make sure there is a reasonable amount of working room left.
441
+ // At minimum, the search requires room for two states in order
442
+ // to limp along, restarting frequently. We'll get better performance
443
+ // if there is room for a larger number of states, say 20.
444
+ int one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +
445
+ (prog_->bytemap_range()+1)*sizeof(State*);
446
+ if (state_budget_ < 20*one_state) {
447
+ LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
448
+ prog_->size(), max_mem);
449
+ init_failed_ = true;
450
+ return;
451
+ }
452
+
453
+ q0_ = new Workq(prog->size(), nmark);
454
+ q1_ = new Workq(prog->size(), nmark);
455
+ astack_ = new int[nastack_];
456
+ }
457
+
458
+ DFA::~DFA() {
459
+ delete q0_;
460
+ delete q1_;
461
+ delete[] astack_;
462
+ ClearCache();
463
+ }
464
+
465
+ // In the DFA state graph, s->next[c] == NULL means that the
466
+ // state has not yet been computed and needs to be. We need
467
+ // a different special value to signal that s->next[c] is a
468
+ // state that can never lead to a match (and thus the search
469
+ // can be called off). Hence DeadState.
470
+ #define DeadState reinterpret_cast<State*>(1)
471
+
472
+ // Signals that the rest of the string matches no matter what it is.
473
+ #define FullMatchState reinterpret_cast<State*>(2)
474
+
475
+ #define SpecialStateMax FullMatchState
476
+
477
+ // Debugging printouts
478
+
479
+ // For debugging, returns a string representation of the work queue.
480
+ string DFA::DumpWorkq(Workq* q) {
481
+ string s;
482
+ const char* sep = "";
483
+ for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) {
484
+ if (q->is_mark(*it)) {
485
+ StringAppendF(&s, "|");
486
+ sep = "";
487
+ } else {
488
+ StringAppendF(&s, "%s%d", sep, *it);
489
+ sep = ",";
490
+ }
491
+ }
492
+ return s;
493
+ }
494
+
495
+ // For debugging, returns a string representation of the state.
496
+ string DFA::DumpState(State* state) {
497
+ if (state == NULL)
498
+ return "_";
499
+ if (state == DeadState)
500
+ return "X";
501
+ if (state == FullMatchState)
502
+ return "*";
503
+ string s;
504
+ const char* sep = "";
505
+ StringAppendF(&s, "(%p)", state);
506
+ for (int i = 0; i < state->ninst_; i++) {
507
+ if (state->inst_[i] == Mark) {
508
+ StringAppendF(&s, "|");
509
+ sep = "";
510
+ } else {
511
+ StringAppendF(&s, "%s%d", sep, state->inst_[i]);
512
+ sep = ",";
513
+ }
514
+ }
515
+ StringAppendF(&s, " flag=%#x", state->flag_);
516
+ return s;
517
+ }
518
+
519
+ //////////////////////////////////////////////////////////////////////
520
+ //
521
+ // DFA state graph construction.
522
+ //
523
+ // The DFA state graph is a heavily-linked collection of State* structures.
524
+ // The state_cache_ is a set of all the State structures ever allocated,
525
+ // so that if the same state is reached by two different paths,
526
+ // the same State structure can be used. This reduces allocation
527
+ // requirements and also avoids duplication of effort across the two
528
+ // identical states.
529
+ //
530
+ // A State is defined by an ordered list of instruction ids and a flag word.
531
+ //
532
+ // The choice of an ordered list of instructions differs from a typical
533
+ // textbook DFA implementation, which would use an unordered set.
534
+ // Textbook descriptions, however, only care about whether
535
+ // the DFA matches, not where it matches in the text. To decide where the
536
+ // DFA matches, we need to mimic the behavior of the dominant backtracking
537
+ // implementations like PCRE, which try one possible regular expression
538
+ // execution, then another, then another, stopping when one of them succeeds.
539
+ // The DFA execution tries these many executions in parallel, representing
540
+ // each by an instruction id. These pointers are ordered in the State.inst_
541
+ // list in the same order that the executions would happen in a backtracking
542
+ // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
543
+ // can be discarded.
544
+ //
545
+ // Textbooks also typically do not consider context-aware empty string operators
546
+ // like ^ or $. These are handled by the flag word, which specifies the set
547
+ // of empty-string operators that should be matched when executing at the
548
+ // current text position. These flag bits are defined in prog.h.
549
+ // The flag word also contains two DFA-specific bits: kFlagMatch if the state
550
+ // is a matching state (one that reached a kInstMatch in the program)
551
+ // and kFlagLastWord if the last processed byte was a word character, for the
552
+ // implementation of \B and \b.
553
+ //
554
+ // The flag word also contains, shifted up 16 bits, the bits looked for by
555
+ // any kInstEmptyWidth instructions in the state. These provide a useful
556
+ // summary indicating when new flags might be useful.
557
+ //
558
+ // The permanent representation of a State's instruction ids is just an array,
559
+ // but while a state is being analyzed, these instruction ids are represented
560
+ // as a Workq, which is an array that allows iteration in insertion order.
561
+
562
+ // NOTE(rsc): The choice of State construction determines whether the DFA
563
+ // mimics backtracking implementations (so-called leftmost first matching) or
564
+ // traditional DFA implementations (so-called leftmost longest matching as
565
+ // prescribed by POSIX). This implementation chooses to mimic the
566
+ // backtracking implementations, because we want to replace PCRE. To get
567
+ // POSIX behavior, the states would need to be considered not as a simple
568
+ // ordered list of instruction ids, but as a list of unordered sets of instruction
569
+ // ids. A match by a state in one set would inhibit the running of sets
570
+ // farther down the list but not other instruction ids in the same set. Each
571
+ // set would correspond to matches beginning at a given point in the string.
572
+ // This is implemented by separating different sets with Mark pointers.
573
+
574
+ // Looks in the State cache for a State matching q, flag.
575
+ // If one is found, returns it. If one is not found, allocates one,
576
+ // inserts it in the cache, and returns it.
577
+ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) {
578
+ if (DEBUG_MODE)
579
+ mutex_.AssertHeld();
580
+
581
+ // Construct array of instruction ids for the new state.
582
+ // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
583
+ // those are the only operators with any effect in
584
+ // RunWorkqOnEmptyString or RunWorkqOnByte.
585
+ int* inst = new int[q->size()];
586
+ int n = 0;
587
+ uint needflags = 0; // flags needed by kInstEmptyWidth instructions
588
+ bool sawmatch = false; // whether queue contains guaranteed kInstMatch
589
+ bool sawmark = false; // whether queue contains a Mark
590
+ if (DebugDFA)
591
+ fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
592
+ for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
593
+ int id = *it;
594
+ if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
595
+ break;
596
+ if (q->is_mark(id)) {
597
+ if (n > 0 && inst[n-1] != Mark) {
598
+ sawmark = true;
599
+ inst[n++] = Mark;
600
+ }
601
+ continue;
602
+ }
603
+ Prog::Inst* ip = prog_->inst(id);
604
+ switch (ip->opcode()) {
605
+ case kInstAltMatch:
606
+ // This state will continue to a match no matter what
607
+ // the rest of the input is. If it is the highest priority match
608
+ // being considered, return the special FullMatchState
609
+ // to indicate that it's all matches from here out.
610
+ if (kind_ != Prog::kManyMatch &&
611
+ (kind_ != Prog::kFirstMatch ||
612
+ (it == q->begin() && ip->greedy(prog_))) &&
613
+ (kind_ != Prog::kLongestMatch || !sawmark) &&
614
+ (flag & kFlagMatch)) {
615
+ delete[] inst;
616
+ if (DebugDFA)
617
+ fprintf(stderr, " -> FullMatchState\n");
618
+ return FullMatchState;
619
+ }
620
+ // Fall through.
621
+ case kInstByteRange: // These are useful.
622
+ case kInstEmptyWidth:
623
+ case kInstMatch:
624
+ case kInstAlt: // Not useful, but necessary [*]
625
+ inst[n++] = *it;
626
+ if (ip->opcode() == kInstEmptyWidth)
627
+ needflags |= ip->empty();
628
+ if (ip->opcode() == kInstMatch && !prog_->anchor_end())
629
+ sawmatch = true;
630
+ break;
631
+
632
+ default: // The rest are not.
633
+ break;
634
+ }
635
+
636
+ // [*] kInstAlt would seem useless to record in a state, since
637
+ // we've already followed both its arrows and saved all the
638
+ // interesting states we can reach from there. The problem
639
+ // is that one of the empty-width instructions might lead
640
+ // back to the same kInstAlt (if an empty-width operator is starred),
641
+ // producing a different evaluation order depending on whether
642
+ // we keep the kInstAlt to begin with. Sigh.
643
+ // A specific case that this affects is /(^|a)+/ matching "a".
644
+ // If we don't save the kInstAlt, we will match the whole "a" (0,1)
645
+ // but in fact the correct leftmost-first match is the leading "" (0,0).
646
+ }
647
+ DCHECK_LE(n, q->size());
648
+ if (n > 0 && inst[n-1] == Mark)
649
+ n--;
650
+
651
+ // If there are no empty-width instructions waiting to execute,
652
+ // then the extra flag bits will not be used, so there is no
653
+ // point in saving them. (Discarding them reduces the number
654
+ // of distinct states.)
655
+ if (needflags == 0)
656
+ flag &= kFlagMatch;
657
+
658
+ // NOTE(rsc): The code above cannot do flag &= needflags,
659
+ // because if the right flags were present to pass the current
660
+ // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
661
+ // might be reached that in turn need different flags.
662
+ // The only sure thing is that if there are no kInstEmptyWidth
663
+ // instructions at all, no flags will be needed.
664
+ // We could do the extra work to figure out the full set of
665
+ // possibly needed flags by exploring past the kInstEmptyWidth
666
+ // instructions, but the check above -- are any flags needed
667
+ // at all? -- handles the most common case. More fine-grained
668
+ // analysis can only be justified by measurements showing that
669
+ // too many redundant states are being allocated.
670
+
671
+ // If there are no Insts in the list, it's a dead state,
672
+ // which is useful to signal with a special pointer so that
673
+ // the execution loop can stop early. This is only okay
674
+ // if the state is *not* a matching state.
675
+ if (n == 0 && flag == 0) {
676
+ delete[] inst;
677
+ if (DebugDFA)
678
+ fprintf(stderr, " -> DeadState\n");
679
+ return DeadState;
680
+ }
681
+
682
+ // If we're in longest match mode, the state is a sequence of
683
+ // unordered state sets separated by Marks. Sort each set
684
+ // to canonicalize, to reduce the number of distinct sets stored.
685
+ if (kind_ == Prog::kLongestMatch) {
686
+ int* ip = inst;
687
+ int* ep = ip + n;
688
+ while (ip < ep) {
689
+ int* markp = ip;
690
+ while (markp < ep && *markp != Mark)
691
+ markp++;
692
+ sort(ip, markp);
693
+ if (markp < ep)
694
+ markp++;
695
+ ip = markp;
696
+ }
697
+ }
698
+
699
+ // Save the needed empty-width flags in the top bits for use later.
700
+ flag |= needflags << kFlagNeedShift;
701
+
702
+ State* state = CachedState(inst, n, flag);
703
+ delete[] inst;
704
+ return state;
705
+ }
706
+
707
+ // Looks in the State cache for a State matching inst, ninst, flag.
708
+ // If one is found, returns it. If one is not found, allocates one,
709
+ // inserts it in the cache, and returns it.
710
+ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {
711
+ if (DEBUG_MODE)
712
+ mutex_.AssertHeld();
713
+
714
+ // Look in the cache for a pre-existing state.
715
+ State state = { inst, ninst, flag, NULL };
716
+ StateSet::iterator it = state_cache_.find(&state);
717
+ if (it != state_cache_.end()) {
718
+ if (DebugDFA)
719
+ fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
720
+ return *it;
721
+ }
722
+
723
+ // Must have enough memory for new state.
724
+ // In addition to what we're going to allocate,
725
+ // the state cache hash table seems to incur about 32 bytes per
726
+ // State*, empirically.
727
+ const int kStateCacheOverhead = 32;
728
+ int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
729
+ int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int);
730
+ if (mem_budget_ < mem + kStateCacheOverhead) {
731
+ mem_budget_ = -1;
732
+ return NULL;
733
+ }
734
+ mem_budget_ -= mem + kStateCacheOverhead;
735
+
736
+ // Allocate new state, along with room for next and inst.
737
+ char* space = new char[mem];
738
+ State* s = reinterpret_cast<State*>(space);
739
+ s->next_ = reinterpret_cast<State**>(s + 1);
740
+ s->inst_ = reinterpret_cast<int*>(s->next_ + nnext);
741
+ memset(s->next_, 0, nnext*sizeof s->next_[0]);
742
+ memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
743
+ s->ninst_ = ninst;
744
+ s->flag_ = flag;
745
+ if (DebugDFA)
746
+ fprintf(stderr, " -> %s\n", DumpState(s).c_str());
747
+
748
+ // Put state in cache and return it.
749
+ state_cache_.insert(s);
750
+ return s;
751
+ }
752
+
753
+ // Clear the cache. Must hold cache_mutex_.w or be in destructor.
754
+ void DFA::ClearCache() {
755
+ // In case state_cache_ doesn't support deleting entries
756
+ // during iteration, copy into a vector and then delete.
757
+ vector<State*> v;
758
+ v.reserve(state_cache_.size());
759
+ for (StateSet::iterator it = state_cache_.begin();
760
+ it != state_cache_.end(); ++it)
761
+ v.push_back(*it);
762
+ state_cache_.clear();
763
+ for (int i = 0; i < v.size(); i++)
764
+ delete[] reinterpret_cast<const char*>(v[i]);
765
+ }
766
+
767
+ // Copies insts in state s to the work queue q.
768
+ void DFA::StateToWorkq(State* s, Workq* q) {
769
+ q->clear();
770
+ for (int i = 0; i < s->ninst_; i++) {
771
+ if (s->inst_[i] == Mark)
772
+ q->mark();
773
+ else
774
+ q->insert_new(s->inst_[i]);
775
+ }
776
+ }
777
+
778
+ // Adds ip to the work queue, following empty arrows according to flag
779
+ // and expanding kInstAlt instructions (two-target gotos).
780
+ void DFA::AddToQueue(Workq* q, int id, uint flag) {
781
+
782
+ // Use astack_ to hold our stack of states yet to process.
783
+ // It is sized to have room for nastack_ == 2*prog->size() + nmark
784
+ // instructions, which is enough: each instruction can be
785
+ // processed by the switch below only once, and the processing
786
+ // pushes at most two instructions plus maybe a mark.
787
+ // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.)
788
+ int* stk = astack_;
789
+ int nstk = 0;
790
+
791
+ stk[nstk++] = id;
792
+ while (nstk > 0) {
793
+ DCHECK_LE(nstk, nastack_);
794
+ id = stk[--nstk];
795
+
796
+ if (id == Mark) {
797
+ q->mark();
798
+ continue;
799
+ }
800
+
801
+ if (id == 0)
802
+ continue;
803
+
804
+ // If ip is already on the queue, nothing to do.
805
+ // Otherwise add it. We don't actually keep all the ones
806
+ // that get added -- for example, kInstAlt is ignored
807
+ // when on a work queue -- but adding all ip's here
808
+ // increases the likelihood of q->contains(id),
809
+ // reducing the amount of duplicated work.
810
+ if (q->contains(id))
811
+ continue;
812
+ q->insert_new(id);
813
+
814
+ // Process instruction.
815
+ Prog::Inst* ip = prog_->inst(id);
816
+ switch (ip->opcode()) {
817
+ case kInstFail: // can't happen: discarded above
818
+ break;
819
+
820
+ case kInstByteRange: // just save these on the queue
821
+ case kInstMatch:
822
+ break;
823
+
824
+ case kInstCapture: // DFA treats captures as no-ops.
825
+ case kInstNop:
826
+ stk[nstk++] = ip->out();
827
+ break;
828
+
829
+ case kInstAlt: // two choices: expand both, in order
830
+ case kInstAltMatch:
831
+ // Want to visit out then out1, so push on stack in reverse order.
832
+ // This instruction is the [00-FF]* loop at the beginning of
833
+ // a leftmost-longest unanchored search, separate out from out1
834
+ // with a Mark, so that out1's threads (which will start farther
835
+ // to the right in the string being searched) are lower priority
836
+ // than the current ones.
837
+ stk[nstk++] = ip->out1();
838
+ if (q->maxmark() > 0 &&
839
+ id == prog_->start_unanchored() && id != prog_->start())
840
+ stk[nstk++] = Mark;
841
+ stk[nstk++] = ip->out();
842
+ break;
843
+
844
+ case kInstEmptyWidth:
845
+ if ((ip->empty() & flag) == ip->empty())
846
+ stk[nstk++] = ip->out();
847
+ break;
848
+ }
849
+ }
850
+ }
851
+
852
+ // Running of work queues. In the work queue, order matters:
853
+ // the queue is sorted in priority order. If instruction i comes before j,
854
+ // then the instructions that i produces during the run must come before
855
+ // the ones that j produces. In order to keep this invariant, all the
856
+ // work queue runners have to take an old queue to process and then
857
+ // also a new queue to fill in. It's not acceptable to add to the end of
858
+ // an existing queue, because new instructions will not end up in the
859
+ // correct position.
860
+
861
+ // Runs the work queue, processing the empty strings indicated by flag.
862
+ // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
863
+ // both ^ and $. It is important that callers pass all flags at once:
864
+ // processing both ^ and $ is not the same as first processing only ^
865
+ // and then processing only $. Doing the two-step sequence won't match
866
+ // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
867
+ // exhibited by existing implementations).
868
+ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) {
869
+ newq->clear();
870
+ for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
871
+ if (oldq->is_mark(*i))
872
+ AddToQueue(newq, Mark, flag);
873
+ else
874
+ AddToQueue(newq, *i, flag);
875
+ }
876
+ }
877
+
878
+ // Runs the work queue, processing the single byte c followed by any empty
879
+ // strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,
880
+ // means to match c$. Sets the bool *ismatch to true if the end of the
881
+ // regular expression program has been reached (the regexp has matched).
882
+ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
883
+ int c, uint flag, bool* ismatch,
884
+ Prog::MatchKind kind,
885
+ int new_byte_loop) {
886
+ if (DEBUG_MODE)
887
+ mutex_.AssertHeld();
888
+
889
+ newq->clear();
890
+ for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
891
+ if (oldq->is_mark(*i)) {
892
+ if (*ismatch)
893
+ return;
894
+ newq->mark();
895
+ continue;
896
+ }
897
+ int id = *i;
898
+ Prog::Inst* ip = prog_->inst(id);
899
+ switch (ip->opcode()) {
900
+ case kInstFail: // never succeeds
901
+ case kInstCapture: // already followed
902
+ case kInstNop: // already followed
903
+ case kInstAlt: // already followed
904
+ case kInstAltMatch: // already followed
905
+ case kInstEmptyWidth: // already followed
906
+ break;
907
+
908
+ case kInstByteRange: // can follow if c is in range
909
+ if (ip->Matches(c))
910
+ AddToQueue(newq, ip->out(), flag);
911
+ break;
912
+
913
+ case kInstMatch:
914
+ if (prog_->anchor_end() && c != kByteEndText)
915
+ break;
916
+ *ismatch = true;
917
+ if (kind == Prog::kFirstMatch) {
918
+ // Can stop processing work queue since we found a match.
919
+ return;
920
+ }
921
+ break;
922
+ }
923
+ }
924
+
925
+ if (DebugDFA)
926
+ fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(),
927
+ c, flag, DumpWorkq(newq).c_str(), *ismatch);
928
+ }
929
+
930
+ // Processes input byte c in state, returning new state.
931
+ // Caller does not hold mutex.
932
+ DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
933
+ // Keep only one RunStateOnByte going
934
+ // even if the DFA is being run by multiple threads.
935
+ MutexLock l(&mutex_);
936
+ return RunStateOnByte(state, c);
937
+ }
938
+
939
+ // Processes input byte c in state, returning new state.
940
+ DFA::State* DFA::RunStateOnByte(State* state, int c) {
941
+ if (DEBUG_MODE)
942
+ mutex_.AssertHeld();
943
+ if (state <= SpecialStateMax) {
944
+ if (state == FullMatchState) {
945
+ // It is convenient for routines like PossibleMatchRange
946
+ // if we implement RunStateOnByte for FullMatchState:
947
+ // once you get into this state you never get out,
948
+ // so it's pretty easy.
949
+ return FullMatchState;
950
+ }
951
+ if (state == DeadState) {
952
+ LOG(DFATAL) << "DeadState in RunStateOnByte";
953
+ return NULL;
954
+ }
955
+ if (state == NULL) {
956
+ LOG(DFATAL) << "NULL state in RunStateOnByte";
957
+ return NULL;
958
+ }
959
+ LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
960
+ return NULL;
961
+ }
962
+
963
+ // If someone else already computed this, return it.
964
+ MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
965
+ if (state->next_[ByteMap(c)])
966
+ return state->next_[ByteMap(c)];
967
+
968
+ // Convert state into Workq.
969
+ StateToWorkq(state, q0_);
970
+
971
+ // Flags marking the kinds of empty-width things (^ $ etc)
972
+ // around this byte. Before the byte we have the flags recorded
973
+ // in the State structure itself. After the byte we have
974
+ // nothing yet (but that will change: read on).
975
+ uint needflag = state->flag_ >> kFlagNeedShift;
976
+ uint beforeflag = state->flag_ & kFlagEmptyMask;
977
+ uint oldbeforeflag = beforeflag;
978
+ uint afterflag = 0;
979
+
980
+ if (c == '\n') {
981
+ // Insert implicit $ and ^ around \n
982
+ beforeflag |= kEmptyEndLine;
983
+ afterflag |= kEmptyBeginLine;
984
+ }
985
+
986
+ if (c == kByteEndText) {
987
+ // Insert implicit $ and \z before the fake "end text" byte.
988
+ beforeflag |= kEmptyEndLine | kEmptyEndText;
989
+ }
990
+
991
+ // The state flag kFlagLastWord says whether the last
992
+ // byte processed was a word character. Use that info to
993
+ // insert empty-width (non-)word boundaries.
994
+ bool islastword = state->flag_ & kFlagLastWord;
995
+ bool isword = (c != kByteEndText && Prog::IsWordChar(c));
996
+ if (isword == islastword)
997
+ beforeflag |= kEmptyNonWordBoundary;
998
+ else
999
+ beforeflag |= kEmptyWordBoundary;
1000
+
1001
+ // Okay, finally ready to run.
1002
+ // Only useful to rerun on empty string if there are new, useful flags.
1003
+ if (beforeflag & ~oldbeforeflag & needflag) {
1004
+ RunWorkqOnEmptyString(q0_, q1_, beforeflag);
1005
+ swap(q0_, q1_);
1006
+ }
1007
+ bool ismatch = false;
1008
+ RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_);
1009
+ swap(q0_, q1_);
1010
+
1011
+ // Save afterflag along with ismatch and isword in new state.
1012
+ uint flag = afterflag;
1013
+ if (ismatch)
1014
+ flag |= kFlagMatch;
1015
+ if (isword)
1016
+ flag |= kFlagLastWord;
1017
+
1018
+ State* ns = WorkqToCachedState(q0_, flag);
1019
+
1020
+ // Write barrier before updating state->next_ so that the
1021
+ // main search loop can proceed without any locking, for speed.
1022
+ // (Otherwise it would need one mutex operation per input byte.)
1023
+ // The annotations below tell race detectors that:
1024
+ // a) the access to next_ should be ignored,
1025
+ // b) 'ns' is properly published.
1026
+ WriteMemoryBarrier(); // Flush ns before linking to it.
1027
+ ANNOTATE_PUBLISH_MEMORY_RANGE(ns, sizeof(*ns));
1028
+
1029
+ ANNOTATE_IGNORE_WRITES_BEGIN();
1030
+ state->next_[ByteMap(c)] = ns;
1031
+ ANNOTATE_IGNORE_WRITES_END();
1032
+ return ns;
1033
+ }
1034
+
1035
+
1036
+ //////////////////////////////////////////////////////////////////////
1037
+ // DFA cache reset.
1038
+
1039
+ // Reader-writer lock helper.
1040
+ //
1041
+ // The DFA uses a reader-writer mutex to protect the state graph itself.
1042
+ // Traversing the state graph requires holding the mutex for reading,
1043
+ // and discarding the state graph and starting over requires holding the
1044
+ // lock for writing. If a search needs to expand the graph but is out
1045
+ // of memory, it will need to drop its read lock and then acquire the
1046
+ // write lock. Since it cannot then atomically downgrade from write lock
1047
+ // to read lock, it runs the rest of the search holding the write lock.
1048
+ // (This probably helps avoid repeated contention, but really the decision
1049
+ // is forced by the Mutex interface.) It's a bit complicated to keep
1050
+ // track of whether the lock is held for reading or writing and thread
1051
+ // that through the search, so instead we encapsulate it in the RWLocker
1052
+ // and pass that around.
1053
+
1054
+ class DFA::RWLocker {
1055
+ public:
1056
+ explicit RWLocker(Mutex* mu);
1057
+ ~RWLocker();
1058
+
1059
+ // If the lock is only held for reading right now,
1060
+ // drop the read lock and re-acquire for writing.
1061
+ // Subsequent calls to LockForWriting are no-ops.
1062
+ // Notice that the lock is *released* temporarily.
1063
+ void LockForWriting();
1064
+
1065
+ // Returns whether the lock is already held for writing.
1066
+ bool IsLockedForWriting() {
1067
+ return writing_;
1068
+ }
1069
+
1070
+ private:
1071
+ Mutex* mu_;
1072
+ bool writing_;
1073
+
1074
+ DISALLOW_EVIL_CONSTRUCTORS(RWLocker);
1075
+ };
1076
+
1077
+ DFA::RWLocker::RWLocker(Mutex* mu)
1078
+ : mu_(mu), writing_(false) {
1079
+
1080
+ mu_->ReaderLock();
1081
+ }
1082
+
1083
+ // This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations
1084
+ // does not support lock upgrade.
1085
+ void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
1086
+ if (!writing_) {
1087
+ mu_->ReaderUnlock();
1088
+ mu_->Lock();
1089
+ writing_ = true;
1090
+ }
1091
+ }
1092
+
1093
+ DFA::RWLocker::~RWLocker() {
1094
+ if (writing_)
1095
+ mu_->WriterUnlock();
1096
+ else
1097
+ mu_->ReaderUnlock();
1098
+ }
1099
+
1100
+
1101
+ // When the DFA's State cache fills, we discard all the states in the
1102
+ // cache and start over. Many threads can be using and adding to the
1103
+ // cache at the same time, so we synchronize using the cache_mutex_
1104
+ // to keep from stepping on other threads. Specifically, all the
1105
+ // threads using the current cache hold cache_mutex_ for reading.
1106
+ // When a thread decides to flush the cache, it drops cache_mutex_
1107
+ // and then re-acquires it for writing. That ensures there are no
1108
+ // other threads accessing the cache anymore. The rest of the search
1109
+ // runs holding cache_mutex_ for writing, avoiding any contention
1110
+ // with or cache pollution caused by other threads.
1111
+
1112
+ void DFA::ResetCache(RWLocker* cache_lock) {
1113
+ // Re-acquire the cache_mutex_ for writing (exclusive use).
1114
+ bool was_writing = cache_lock->IsLockedForWriting();
1115
+ cache_lock->LockForWriting();
1116
+
1117
+ // If we already held cache_mutex_ for writing, it means
1118
+ // this invocation of Search() has already reset the
1119
+ // cache once already. That's a pretty clear indication
1120
+ // that the cache is too small. Warn about that, once.
1121
+ // TODO(rsc): Only warn if state_cache_.size() < some threshold.
1122
+ if (was_writing && !cache_warned_) {
1123
+ LOG(INFO) << "DFA memory cache could be too small: "
1124
+ << "only room for " << state_cache_.size() << " states.";
1125
+ cache_warned_ = true;
1126
+ }
1127
+
1128
+ // Clear the cache, reset the memory budget.
1129
+ for (int i = 0; i < kMaxStart; i++) {
1130
+ start_[i].start = NULL;
1131
+ start_[i].firstbyte = kFbUnknown;
1132
+ }
1133
+ ClearCache();
1134
+ mem_budget_ = state_budget_;
1135
+ }
1136
+
1137
+ // Typically, a couple States do need to be preserved across a cache
1138
+ // reset, like the State at the current point in the search.
1139
+ // The StateSaver class helps keep States across cache resets.
1140
+ // It makes a copy of the state's guts outside the cache (before the reset)
1141
+ // and then can be asked, after the reset, to recreate the State
1142
+ // in the new cache. For example, in a DFA method ("this" is a DFA):
1143
+ //
1144
+ // StateSaver saver(this, s);
1145
+ // ResetCache(cache_lock);
1146
+ // s = saver.Restore();
1147
+ //
1148
+ // The saver should always have room in the cache to re-create the state,
1149
+ // because resetting the cache locks out all other threads, and the cache
1150
+ // is known to have room for at least a couple states (otherwise the DFA
1151
+ // constructor fails).
1152
+
1153
+ class DFA::StateSaver {
1154
+ public:
1155
+ explicit StateSaver(DFA* dfa, State* state);
1156
+ ~StateSaver();
1157
+
1158
+ // Recreates and returns a state equivalent to the
1159
+ // original state passed to the constructor.
1160
+ // Returns NULL if the cache has filled, but
1161
+ // since the DFA guarantees to have room in the cache
1162
+ // for a couple states, should never return NULL
1163
+ // if used right after ResetCache.
1164
+ State* Restore();
1165
+
1166
+ private:
1167
+ DFA* dfa_; // the DFA to use
1168
+ int* inst_; // saved info from State
1169
+ int ninst_;
1170
+ uint flag_;
1171
+ bool is_special_; // whether original state was special
1172
+ State* special_; // if is_special_, the original state
1173
+
1174
+ DISALLOW_EVIL_CONSTRUCTORS(StateSaver);
1175
+ };
1176
+
1177
+ DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
1178
+ dfa_ = dfa;
1179
+ if (state <= SpecialStateMax) {
1180
+ inst_ = NULL;
1181
+ ninst_ = 0;
1182
+ flag_ = 0;
1183
+ is_special_ = true;
1184
+ special_ = state;
1185
+ return;
1186
+ }
1187
+ is_special_ = false;
1188
+ special_ = NULL;
1189
+ flag_ = state->flag_;
1190
+ ninst_ = state->ninst_;
1191
+ inst_ = new int[ninst_];
1192
+ memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
1193
+ }
1194
+
1195
+ DFA::StateSaver::~StateSaver() {
1196
+ if (!is_special_)
1197
+ delete[] inst_;
1198
+ }
1199
+
1200
+ DFA::State* DFA::StateSaver::Restore() {
1201
+ if (is_special_)
1202
+ return special_;
1203
+ MutexLock l(&dfa_->mutex_);
1204
+ State* s = dfa_->CachedState(inst_, ninst_, flag_);
1205
+ if (s == NULL)
1206
+ LOG(DFATAL) << "StateSaver failed to restore state.";
1207
+ return s;
1208
+ }
1209
+
1210
+
1211
+ //////////////////////////////////////////////////////////////////////
1212
+ //
1213
+ // DFA execution.
1214
+ //
1215
+ // The basic search loop is easy: start in a state s and then for each
1216
+ // byte c in the input, s = s->next[c].
1217
+ //
1218
+ // This simple description omits a few efficiency-driven complications.
1219
+ //
1220
+ // First, the State graph is constructed incrementally: it is possible
1221
+ // that s->next[c] is null, indicating that that state has not been
1222
+ // fully explored. In this case, RunStateOnByte must be invoked to
1223
+ // determine the next state, which is cached in s->next[c] to save
1224
+ // future effort. An alternative reason for s->next[c] to be null is
1225
+ // that the DFA has reached a so-called "dead state", in which any match
1226
+ // is no longer possible. In this case RunStateOnByte will return NULL
1227
+ // and the processing of the string can stop early.
1228
+ //
1229
+ // Second, a 256-element pointer array for s->next_ makes each State
1230
+ // quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]
1231
+ // maps from bytes to "byte classes" and then next_ only needs to have
1232
+ // as many pointers as there are byte classes. A byte class is simply a
1233
+ // range of bytes that the regexp never distinguishes between.
1234
+ // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
1235
+ // 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit
1236
+ // but in exchange we typically cut the size of a State (and thus our
1237
+ // memory footprint) by about 5-10x. The comments still refer to
1238
+ // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
1239
+ //
1240
+ // Third, it is common for a DFA for an unanchored match to begin in a
1241
+ // state in which only one particular byte value can take the DFA to a
1242
+ // different state. That is, s->next[c] != s for only one c. In this
1243
+ // situation, the DFA can do better than executing the simple loop.
1244
+ // Instead, it can call memchr to search very quickly for the byte c.
1245
+ // Whether the start state has this property is determined during a
1246
+ // pre-compilation pass, and if so, the byte b is passed to the search
1247
+ // loop as the "firstbyte" argument, along with a boolean "have_firstbyte".
1248
+ //
1249
+ // Fourth, the desired behavior is to search for the leftmost-best match
1250
+ // (approximately, the same one that Perl would find), which is not
1251
+ // necessarily the match ending earliest in the string. Each time a
1252
+ // match is found, it must be noted, but the DFA must continue on in
1253
+ // hope of finding a higher-priority match. In some cases, the caller only
1254
+ // cares whether there is any match at all, not which one is found.
1255
+ // The "want_earliest_match" flag causes the search to stop at the first
1256
+ // match found.
1257
+ //
1258
+ // Fifth, one algorithm that uses the DFA needs it to run over the
1259
+ // input string backward, beginning at the end and ending at the beginning.
1260
+ // Passing false for the "run_forward" flag causes the DFA to run backward.
1261
+ //
1262
+ // The checks for these last three cases, which in a naive implementation
1263
+ // would be performed once per input byte, slow the general loop enough
1264
+ // to merit specialized versions of the search loop for each of the
1265
+ // eight possible settings of the three booleans. Rather than write
1266
+ // eight different functions, we write one general implementation and then
1267
+ // inline it to create the specialized ones.
1268
+ //
1269
+ // Note that matches are delayed by one byte, to make it easier to
1270
+ // accomodate match conditions depending on the next input byte (like $ and \b).
1271
+ // When s->next[c]->IsMatch(), it means that there is a match ending just
1272
+ // *before* byte c.
1273
+
1274
+ // The generic search loop. Searches text for a match, returning
1275
+ // the pointer to the end of the chosen match, or NULL if no match.
1276
+ // The bools are equal to the same-named variables in params, but
1277
+ // making them function arguments lets the inliner specialize
1278
+ // this function to each combination (see two paragraphs above).
1279
+ inline bool DFA::InlinedSearchLoop(SearchParams* params,
1280
+ bool have_firstbyte,
1281
+ bool want_earliest_match,
1282
+ bool run_forward) {
1283
+ State* start = params->start;
1284
+ const uint8* bp = BytePtr(params->text.begin()); // start of text
1285
+ const uint8* p = bp; // text scanning point
1286
+ const uint8* ep = BytePtr(params->text.end()); // end of text
1287
+ const uint8* resetp = NULL; // p at last cache reset
1288
+ if (!run_forward)
1289
+ swap(p, ep);
1290
+
1291
+ const uint8* bytemap = prog_->bytemap();
1292
+ const uint8* lastmatch = NULL; // most recent matching position in text
1293
+ bool matched = false;
1294
+ State* s = start;
1295
+
1296
+ if (s->IsMatch()) {
1297
+ matched = true;
1298
+ lastmatch = p;
1299
+ if (want_earliest_match) {
1300
+ params->ep = reinterpret_cast<const char*>(lastmatch);
1301
+ return true;
1302
+ }
1303
+ }
1304
+
1305
+ while (p != ep) {
1306
+ if (DebugDFA)
1307
+ fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp),
1308
+ DumpState(s).c_str());
1309
+ if (have_firstbyte && s == start) {
1310
+ // In start state, only way out is to find firstbyte,
1311
+ // so use optimized assembly in memchr to skip ahead.
1312
+ // If firstbyte isn't found, we can skip to the end
1313
+ // of the string.
1314
+ if (run_forward) {
1315
+ if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {
1316
+ p = ep;
1317
+ break;
1318
+ }
1319
+ } else {
1320
+ if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {
1321
+ p = ep;
1322
+ break;
1323
+ }
1324
+ p++;
1325
+ }
1326
+ }
1327
+
1328
+ int c;
1329
+ if (run_forward)
1330
+ c = *p++;
1331
+ else
1332
+ c = *--p;
1333
+
1334
+ // Note that multiple threads might be consulting
1335
+ // s->next_[bytemap[c]] simultaneously.
1336
+ // RunStateOnByte takes care of the appropriate locking,
1337
+ // including a memory barrier so that the unlocked access
1338
+ // (sometimes known as "double-checked locking") is safe.
1339
+ // The alternative would be either one DFA per thread
1340
+ // or one mutex operation per input byte.
1341
+ //
1342
+ // ns == DeadState means the state is known to be dead
1343
+ // (no more matches are possible).
1344
+ // ns == NULL means the state has not yet been computed
1345
+ // (need to call RunStateOnByteUnlocked).
1346
+ // RunStateOnByte returns ns == NULL if it is out of memory.
1347
+ // ns == FullMatchState means the rest of the string matches.
1348
+ //
1349
+ // Okay to use bytemap[] not ByteMap() here, because
1350
+ // c is known to be an actual byte and not kByteEndText.
1351
+
1352
+ MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
1353
+ State* ns = s->next_[bytemap[c]];
1354
+ if (ns == NULL) {
1355
+ ns = RunStateOnByteUnlocked(s, c);
1356
+ if (ns == NULL) {
1357
+ // After we reset the cache, we hold cache_mutex exclusively,
1358
+ // so if resetp != NULL, it means we filled the DFA state
1359
+ // cache with this search alone (without any other threads).
1360
+ // Benchmarks show that doing a state computation on every
1361
+ // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
1362
+ // same at about 2 MB/s. Unless we're processing an average
1363
+ // of 10 bytes per state computation, fail so that RE2 can
1364
+ // fall back to the NFA.
1365
+ if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL &&
1366
+ (p - resetp) < 10*state_cache_.size()) {
1367
+ params->failed = true;
1368
+ return false;
1369
+ }
1370
+ resetp = p;
1371
+
1372
+ // Prepare to save start and s across the reset.
1373
+ StateSaver save_start(this, start);
1374
+ StateSaver save_s(this, s);
1375
+
1376
+ // Discard all the States in the cache.
1377
+ ResetCache(params->cache_lock);
1378
+
1379
+ // Restore start and s so we can continue.
1380
+ if ((start = save_start.Restore()) == NULL ||
1381
+ (s = save_s.Restore()) == NULL) {
1382
+ // Restore already did LOG(DFATAL).
1383
+ params->failed = true;
1384
+ return false;
1385
+ }
1386
+ ns = RunStateOnByteUnlocked(s, c);
1387
+ if (ns == NULL) {
1388
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
1389
+ params->failed = true;
1390
+ return false;
1391
+ }
1392
+ }
1393
+ }
1394
+ if (ns <= SpecialStateMax) {
1395
+ if (ns == DeadState) {
1396
+ params->ep = reinterpret_cast<const char*>(lastmatch);
1397
+ return matched;
1398
+ }
1399
+ // FullMatchState
1400
+ params->ep = reinterpret_cast<const char*>(ep);
1401
+ return true;
1402
+ }
1403
+ s = ns;
1404
+
1405
+ if (s->IsMatch()) {
1406
+ matched = true;
1407
+ // The DFA notices the match one byte late,
1408
+ // so adjust p before using it in the match.
1409
+ if (run_forward)
1410
+ lastmatch = p - 1;
1411
+ else
1412
+ lastmatch = p + 1;
1413
+ if (DebugDFA)
1414
+ fprintf(stderr, "match @%d! [%s]\n",
1415
+ static_cast<int>(lastmatch - bp),
1416
+ DumpState(s).c_str());
1417
+
1418
+ if (want_earliest_match) {
1419
+ params->ep = reinterpret_cast<const char*>(lastmatch);
1420
+ return true;
1421
+ }
1422
+ }
1423
+ }
1424
+
1425
+ // Peek in state to see if a match is coming up.
1426
+ if (params->matches && kind_ == Prog::kManyMatch) {
1427
+ vector<int>* v = params->matches;
1428
+ v->clear();
1429
+ if (s > SpecialStateMax) {
1430
+ for (int i = 0; i < s->ninst_; i++) {
1431
+ Prog::Inst* ip = prog_->inst(s->inst_[i]);
1432
+ if (ip->opcode() == kInstMatch)
1433
+ v->push_back(ip->match_id());
1434
+ }
1435
+ }
1436
+ }
1437
+
1438
+
1439
+ // Process one more byte to see if it triggers a match.
1440
+ // (Remember, matches are delayed one byte.)
1441
+ int lastbyte;
1442
+ if (run_forward) {
1443
+ if (params->text.end() == params->context.end())
1444
+ lastbyte = kByteEndText;
1445
+ else
1446
+ lastbyte = params->text.end()[0] & 0xFF;
1447
+ } else {
1448
+ if (params->text.begin() == params->context.begin())
1449
+ lastbyte = kByteEndText;
1450
+ else
1451
+ lastbyte = params->text.begin()[-1] & 0xFF;
1452
+ }
1453
+
1454
+ MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
1455
+ State* ns = s->next_[ByteMap(lastbyte)];
1456
+ if (ns == NULL) {
1457
+ ns = RunStateOnByteUnlocked(s, lastbyte);
1458
+ if (ns == NULL) {
1459
+ StateSaver save_s(this, s);
1460
+ ResetCache(params->cache_lock);
1461
+ if ((s = save_s.Restore()) == NULL) {
1462
+ params->failed = true;
1463
+ return false;
1464
+ }
1465
+ ns = RunStateOnByteUnlocked(s, lastbyte);
1466
+ if (ns == NULL) {
1467
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
1468
+ params->failed = true;
1469
+ return false;
1470
+ }
1471
+ }
1472
+ }
1473
+ s = ns;
1474
+ if (DebugDFA)
1475
+ fprintf(stderr, "@_: %s\n", DumpState(s).c_str());
1476
+ if (s == FullMatchState) {
1477
+ params->ep = reinterpret_cast<const char*>(ep);
1478
+ return true;
1479
+ }
1480
+ if (s > SpecialStateMax && s->IsMatch()) {
1481
+ matched = true;
1482
+ lastmatch = p;
1483
+ if (DebugDFA)
1484
+ fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp),
1485
+ DumpState(s).c_str());
1486
+ }
1487
+ params->ep = reinterpret_cast<const char*>(lastmatch);
1488
+ return matched;
1489
+ }
1490
+
1491
+ // Inline specializations of the general loop.
1492
+ bool DFA::SearchFFF(SearchParams* params) {
1493
+ return InlinedSearchLoop(params, 0, 0, 0);
1494
+ }
1495
+ bool DFA::SearchFFT(SearchParams* params) {
1496
+ return InlinedSearchLoop(params, 0, 0, 1);
1497
+ }
1498
+ bool DFA::SearchFTF(SearchParams* params) {
1499
+ return InlinedSearchLoop(params, 0, 1, 0);
1500
+ }
1501
+ bool DFA::SearchFTT(SearchParams* params) {
1502
+ return InlinedSearchLoop(params, 0, 1, 1);
1503
+ }
1504
+ bool DFA::SearchTFF(SearchParams* params) {
1505
+ return InlinedSearchLoop(params, 1, 0, 0);
1506
+ }
1507
+ bool DFA::SearchTFT(SearchParams* params) {
1508
+ return InlinedSearchLoop(params, 1, 0, 1);
1509
+ }
1510
+ bool DFA::SearchTTF(SearchParams* params) {
1511
+ return InlinedSearchLoop(params, 1, 1, 0);
1512
+ }
1513
+ bool DFA::SearchTTT(SearchParams* params) {
1514
+ return InlinedSearchLoop(params, 1, 1, 1);
1515
+ }
1516
+
1517
+ // For debugging, calls the general code directly.
1518
+ bool DFA::SlowSearchLoop(SearchParams* params) {
1519
+ return InlinedSearchLoop(params,
1520
+ params->firstbyte >= 0,
1521
+ params->want_earliest_match,
1522
+ params->run_forward);
1523
+ }
1524
+
1525
+ // For performance, calls the appropriate specialized version
1526
+ // of InlinedSearchLoop.
1527
+ bool DFA::FastSearchLoop(SearchParams* params) {
1528
+ // Because the methods are private, the Searches array
1529
+ // cannot be declared at top level.
1530
+ static bool (DFA::*Searches[])(SearchParams*) = {
1531
+ &DFA::SearchFFF,
1532
+ &DFA::SearchFFT,
1533
+ &DFA::SearchFTF,
1534
+ &DFA::SearchFTT,
1535
+ &DFA::SearchTFF,
1536
+ &DFA::SearchTFT,
1537
+ &DFA::SearchTTF,
1538
+ &DFA::SearchTTT,
1539
+ };
1540
+
1541
+ bool have_firstbyte = (params->firstbyte >= 0);
1542
+ int index = 4 * have_firstbyte +
1543
+ 2 * params->want_earliest_match +
1544
+ 1 * params->run_forward;
1545
+ return (this->*Searches[index])(params);
1546
+ }
1547
+
1548
+
1549
+ // The discussion of DFA execution above ignored the question of how
1550
+ // to determine the initial state for the search loop. There are two
1551
+ // factors that influence the choice of start state.
1552
+ //
1553
+ // The first factor is whether the search is anchored or not.
1554
+ // The regexp program (Prog*) itself has
1555
+ // two different entry points: one for anchored searches and one for
1556
+ // unanchored searches. (The unanchored version starts with a leading ".*?"
1557
+ // and then jumps to the anchored one.)
1558
+ //
1559
+ // The second factor is where text appears in the larger context, which
1560
+ // determines which empty-string operators can be matched at the beginning
1561
+ // of execution. If text is at the very beginning of context, \A and ^ match.
1562
+ // Otherwise if text is at the beginning of a line, then ^ matches.
1563
+ // Otherwise it matters whether the character before text is a word character
1564
+ // or a non-word character.
1565
+ //
1566
+ // The two cases (unanchored vs not) and four cases (empty-string flags)
1567
+ // combine to make the eight cases recorded in the DFA's begin_text_[2],
1568
+ // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
1569
+ // StartInfos. The start state for each is filled in the first time it
1570
+ // is used for an actual search.
1571
+
1572
+ // Examines text, context, and anchored to determine the right start
1573
+ // state for the DFA search loop. Fills in params and returns true on success.
1574
+ // Returns false on failure.
1575
+ bool DFA::AnalyzeSearch(SearchParams* params) {
1576
+ const StringPiece& text = params->text;
1577
+ const StringPiece& context = params->context;
1578
+
1579
+ // Sanity check: make sure that text lies within context.
1580
+ if (text.begin() < context.begin() || text.end() > context.end()) {
1581
+ LOG(DFATAL) << "Text is not inside context.";
1582
+ params->start = DeadState;
1583
+ return true;
1584
+ }
1585
+
1586
+ // Determine correct search type.
1587
+ int start;
1588
+ uint flags;
1589
+ if (params->run_forward) {
1590
+ if (text.begin() == context.begin()) {
1591
+ start = kStartBeginText;
1592
+ flags = kEmptyBeginText|kEmptyBeginLine;
1593
+ } else if (text.begin()[-1] == '\n') {
1594
+ start = kStartBeginLine;
1595
+ flags = kEmptyBeginLine;
1596
+ } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
1597
+ start = kStartAfterWordChar;
1598
+ flags = kFlagLastWord;
1599
+ } else {
1600
+ start = kStartAfterNonWordChar;
1601
+ flags = 0;
1602
+ }
1603
+ } else {
1604
+ if (text.end() == context.end()) {
1605
+ start = kStartBeginText;
1606
+ flags = kEmptyBeginText|kEmptyBeginLine;
1607
+ } else if (text.end()[0] == '\n') {
1608
+ start = kStartBeginLine;
1609
+ flags = kEmptyBeginLine;
1610
+ } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
1611
+ start = kStartAfterWordChar;
1612
+ flags = kFlagLastWord;
1613
+ } else {
1614
+ start = kStartAfterNonWordChar;
1615
+ flags = 0;
1616
+ }
1617
+ }
1618
+ if (params->anchored || prog_->anchor_start())
1619
+ start |= kStartAnchored;
1620
+ StartInfo* info = &start_[start];
1621
+
1622
+ // Try once without cache_lock for writing.
1623
+ // Try again after resetting the cache
1624
+ // (ResetCache will relock cache_lock for writing).
1625
+ if (!AnalyzeSearchHelper(params, info, flags)) {
1626
+ ResetCache(params->cache_lock);
1627
+ if (!AnalyzeSearchHelper(params, info, flags)) {
1628
+ LOG(DFATAL) << "Failed to analyze start state.";
1629
+ params->failed = true;
1630
+ return false;
1631
+ }
1632
+ }
1633
+
1634
+ if (DebugDFA)
1635
+ fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",
1636
+ params->anchored, params->run_forward, flags,
1637
+ DumpState(info->start).c_str(), info->firstbyte);
1638
+
1639
+ params->start = info->start;
1640
+ params->firstbyte = info->firstbyte;
1641
+
1642
+ return true;
1643
+ }
1644
+
1645
+ // Fills in info if needed. Returns true on success, false on failure.
1646
+ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
1647
+ uint flags) {
1648
+ // Quick check; okay because of memory barriers below.
1649
+ if (info->firstbyte != kFbUnknown)
1650
+ return true;
1651
+
1652
+ MutexLock l(&mutex_);
1653
+ if (info->firstbyte != kFbUnknown)
1654
+ return true;
1655
+
1656
+ q0_->clear();
1657
+ AddToQueue(q0_,
1658
+ params->anchored ? prog_->start() : prog_->start_unanchored(),
1659
+ flags);
1660
+ info->start = WorkqToCachedState(q0_, flags);
1661
+ if (info->start == NULL)
1662
+ return false;
1663
+
1664
+ if (info->start == DeadState) {
1665
+ WriteMemoryBarrier(); // Synchronize with "quick check" above.
1666
+ info->firstbyte = kFbNone;
1667
+ return true;
1668
+ }
1669
+
1670
+ if (info->start == FullMatchState) {
1671
+ WriteMemoryBarrier(); // Synchronize with "quick check" above.
1672
+ info->firstbyte = kFbNone; // will be ignored
1673
+ return true;
1674
+ }
1675
+
1676
+ // Compute info->firstbyte by running state on all
1677
+ // possible byte values, looking for a single one that
1678
+ // leads to a different state.
1679
+ int firstbyte = kFbNone;
1680
+ for (int i = 0; i < 256; i++) {
1681
+ State* s = RunStateOnByte(info->start, i);
1682
+ if (s == NULL) {
1683
+ WriteMemoryBarrier(); // Synchronize with "quick check" above.
1684
+ info->firstbyte = firstbyte;
1685
+ return false;
1686
+ }
1687
+ if (s == info->start)
1688
+ continue;
1689
+ // Goes to new state...
1690
+ if (firstbyte == kFbNone) {
1691
+ firstbyte = i; // ... first one
1692
+ } else {
1693
+ firstbyte = kFbMany; // ... too many
1694
+ break;
1695
+ }
1696
+ }
1697
+ WriteMemoryBarrier(); // Synchronize with "quick check" above.
1698
+ info->firstbyte = firstbyte;
1699
+ return true;
1700
+ }
1701
+
1702
+ // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
1703
+ bool DFA::Search(const StringPiece& text,
1704
+ const StringPiece& context,
1705
+ bool anchored,
1706
+ bool want_earliest_match,
1707
+ bool run_forward,
1708
+ bool* failed,
1709
+ const char** epp,
1710
+ vector<int>* matches) {
1711
+ *epp = NULL;
1712
+ if (!ok()) {
1713
+ *failed = true;
1714
+ return false;
1715
+ }
1716
+ *failed = false;
1717
+
1718
+ if (DebugDFA) {
1719
+ fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
1720
+ fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
1721
+ text.as_string().c_str(), anchored, want_earliest_match,
1722
+ run_forward, kind_);
1723
+ }
1724
+
1725
+ RWLocker l(&cache_mutex_);
1726
+ SearchParams params(text, context, &l);
1727
+ params.anchored = anchored;
1728
+ params.want_earliest_match = want_earliest_match;
1729
+ params.run_forward = run_forward;
1730
+ params.matches = matches;
1731
+
1732
+ if (!AnalyzeSearch(&params)) {
1733
+ *failed = true;
1734
+ return false;
1735
+ }
1736
+ if (params.start == DeadState)
1737
+ return NULL;
1738
+ if (params.start == FullMatchState) {
1739
+ if (run_forward == want_earliest_match)
1740
+ *epp = text.begin();
1741
+ else
1742
+ *epp = text.end();
1743
+ return true;
1744
+ }
1745
+ if (DebugDFA)
1746
+ fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
1747
+ bool ret = FastSearchLoop(&params);
1748
+ if (params.failed) {
1749
+ *failed = true;
1750
+ return false;
1751
+ }
1752
+ *epp = params.ep;
1753
+ return ret;
1754
+ }
1755
+
1756
+ // Deletes dfa.
1757
+ //
1758
+ // This is a separate function so that
1759
+ // prog.h can be used without moving the definition of
1760
+ // class DFA out of this file. If you set
1761
+ // prog->dfa_ = dfa;
1762
+ // then you also have to set
1763
+ // prog->delete_dfa_ = DeleteDFA;
1764
+ // so that ~Prog can delete the dfa.
1765
+ static void DeleteDFA(DFA* dfa) {
1766
+ delete dfa;
1767
+ }
1768
+
1769
+ DFA* Prog::GetDFA(MatchKind kind) {
1770
+ DFA*volatile* pdfa;
1771
+ if (kind == kFirstMatch || kind == kManyMatch) {
1772
+ pdfa = &dfa_first_;
1773
+ } else {
1774
+ kind = kLongestMatch;
1775
+ pdfa = &dfa_longest_;
1776
+ }
1777
+
1778
+ // Quick check; okay because of memory barrier below.
1779
+ DFA *dfa = *pdfa;
1780
+ if (dfa != NULL) {
1781
+ ANNOTATE_HAPPENS_AFTER(dfa);
1782
+ return dfa;
1783
+ }
1784
+
1785
+ MutexLock l(&dfa_mutex_);
1786
+ dfa = *pdfa;
1787
+ if (dfa != NULL)
1788
+ return dfa;
1789
+
1790
+ // For a forward DFA, half the memory goes to each DFA.
1791
+ // For a reverse DFA, all the memory goes to the
1792
+ // "longest match" DFA, because RE2 never does reverse
1793
+ // "first match" searches.
1794
+ int64 m = dfa_mem_/2;
1795
+ if (reversed_) {
1796
+ if (kind == kLongestMatch || kind == kManyMatch)
1797
+ m = dfa_mem_;
1798
+ else
1799
+ m = 0;
1800
+ }
1801
+ dfa = new DFA(this, kind, m);
1802
+ delete_dfa_ = DeleteDFA;
1803
+
1804
+ // Synchronize with "quick check" above.
1805
+ ANNOTATE_HAPPENS_BEFORE(dfa);
1806
+ WriteMemoryBarrier();
1807
+ *pdfa = dfa;
1808
+
1809
+ return dfa;
1810
+ }
1811
+
1812
+
1813
+ // Executes the regexp program to search in text,
1814
+ // which itself is inside the larger context. (As a convenience,
1815
+ // passing a NULL context is equivalent to passing text.)
1816
+ // Returns true if a match is found, false if not.
1817
+ // If a match is found, fills in match0->end() to point at the end of the match
1818
+ // and sets match0->begin() to text.begin(), since the DFA can't track
1819
+ // where the match actually began.
1820
+ //
1821
+ // This is the only external interface (class DFA only exists in this file).
1822
+ //
1823
+ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
1824
+ Anchor anchor, MatchKind kind,
1825
+ StringPiece* match0, bool* failed, vector<int>* matches) {
1826
+ *failed = false;
1827
+
1828
+ StringPiece context = const_context;
1829
+ if (context.begin() == NULL)
1830
+ context = text;
1831
+ bool carat = anchor_start();
1832
+ bool dollar = anchor_end();
1833
+ if (reversed_) {
1834
+ bool t = carat;
1835
+ carat = dollar;
1836
+ dollar = t;
1837
+ }
1838
+ if (carat && context.begin() != text.begin())
1839
+ return false;
1840
+ if (dollar && context.end() != text.end())
1841
+ return false;
1842
+
1843
+ // Handle full match by running an anchored longest match
1844
+ // and then checking if it covers all of text.
1845
+ bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
1846
+ bool endmatch = false;
1847
+ if (kind == kManyMatch) {
1848
+ endmatch = true;
1849
+ } else if (kind == kFullMatch || anchor_end()) {
1850
+ endmatch = true;
1851
+ kind = kLongestMatch;
1852
+ }
1853
+
1854
+ // If the caller doesn't care where the match is (just whether one exists),
1855
+ // then we can stop at the very first match we find, the so-called
1856
+ // "shortest match".
1857
+ bool want_shortest_match = false;
1858
+ if (match0 == NULL && !endmatch) {
1859
+ want_shortest_match = true;
1860
+ kind = kLongestMatch;
1861
+ }
1862
+
1863
+ DFA* dfa = GetDFA(kind);
1864
+ const char* ep;
1865
+ bool matched = dfa->Search(text, context, anchored,
1866
+ want_shortest_match, !reversed_,
1867
+ failed, &ep, matches);
1868
+ if (*failed)
1869
+ return false;
1870
+ if (!matched)
1871
+ return false;
1872
+ if (endmatch && ep != (reversed_ ? text.begin() : text.end()))
1873
+ return false;
1874
+
1875
+ // If caller cares, record the boundary of the match.
1876
+ // We only know where it ends, so use the boundary of text
1877
+ // as the beginning.
1878
+ if (match0) {
1879
+ if (reversed_)
1880
+ *match0 = StringPiece(ep, text.end() - ep);
1881
+ else
1882
+ *match0 = StringPiece(text.begin(), ep - text.begin());
1883
+ }
1884
+ return true;
1885
+ }
1886
+
1887
+ // Build out all states in DFA. Returns number of states.
1888
+ int DFA::BuildAllStates() {
1889
+ if (!ok())
1890
+ return 0;
1891
+
1892
+ // Pick out start state for unanchored search
1893
+ // at beginning of text.
1894
+ RWLocker l(&cache_mutex_);
1895
+ SearchParams params(NULL, NULL, &l);
1896
+ params.anchored = false;
1897
+ if (!AnalyzeSearch(&params) || params.start <= SpecialStateMax)
1898
+ return 0;
1899
+
1900
+ // Add start state to work queue.
1901
+ StateSet queued;
1902
+ vector<State*> q;
1903
+ queued.insert(params.start);
1904
+ q.push_back(params.start);
1905
+
1906
+ // Flood to expand every state.
1907
+ for (int i = 0; i < q.size(); i++) {
1908
+ State* s = q[i];
1909
+ for (int c = 0; c < 257; c++) {
1910
+ State* ns = RunStateOnByteUnlocked(s, c);
1911
+ if (ns > SpecialStateMax && queued.find(ns) == queued.end()) {
1912
+ queued.insert(ns);
1913
+ q.push_back(ns);
1914
+ }
1915
+ }
1916
+ }
1917
+
1918
+ return q.size();
1919
+ }
1920
+
1921
+ // Build out all states in DFA for kind. Returns number of states.
1922
+ int Prog::BuildEntireDFA(MatchKind kind) {
1923
+ //LOG(ERROR) << "BuildEntireDFA is only for testing.";
1924
+ return GetDFA(kind)->BuildAllStates();
1925
+ }
1926
+
1927
+ // Computes min and max for matching string.
1928
+ // Won't return strings bigger than maxlen.
1929
+ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
1930
+ if (!ok())
1931
+ return false;
1932
+
1933
+ // NOTE: if future users of PossibleMatchRange want more precision when
1934
+ // presented with infinitely repeated elements, consider making this a
1935
+ // parameter to PossibleMatchRange.
1936
+ static int kMaxEltRepetitions = 0;
1937
+
1938
+ // Keep track of the number of times we've visited states previously. We only
1939
+ // revisit a given state if it's part of a repeated group, so if the value
1940
+ // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
1941
+ // |*max| to |PrefixSuccessor(*max)|.
1942
+ //
1943
+ // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
1944
+ // tradition, implicitly insert a '0' value at first use. We take advantage
1945
+ // of that property below.
1946
+ map<State*, int> previously_visited_states;
1947
+
1948
+ // Pick out start state for anchored search at beginning of text.
1949
+ RWLocker l(&cache_mutex_);
1950
+ SearchParams params(NULL, NULL, &l);
1951
+ params.anchored = true;
1952
+ if (!AnalyzeSearch(&params))
1953
+ return false;
1954
+ if (params.start == DeadState) { // No matching strings
1955
+ *min = "";
1956
+ *max = "";
1957
+ return true;
1958
+ }
1959
+ if (params.start == FullMatchState) // Every string matches: no max
1960
+ return false;
1961
+
1962
+ // The DFA is essentially a big graph rooted at params.start,
1963
+ // and paths in the graph correspond to accepted strings.
1964
+ // Each node in the graph has potentially 256+1 arrows
1965
+ // coming out, one for each byte plus the magic end of
1966
+ // text character kByteEndText.
1967
+
1968
+ // To find the smallest possible prefix of an accepted
1969
+ // string, we just walk the graph preferring to follow
1970
+ // arrows with the lowest bytes possible. To find the
1971
+ // largest possible prefix, we follow the largest bytes
1972
+ // possible.
1973
+
1974
+ // The test for whether there is an arrow from s on byte j is
1975
+ // ns = RunStateOnByteUnlocked(s, j);
1976
+ // if (ns == NULL)
1977
+ // return false;
1978
+ // if (ns != DeadState && ns->ninst > 0)
1979
+ // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
1980
+ // It returns NULL only if the DFA has run out of memory,
1981
+ // in which case we can't be sure of anything.
1982
+ // The second check sees whether there was graph built
1983
+ // and whether it is interesting graph. Nodes might have
1984
+ // ns->ninst == 0 if they exist only to represent the fact
1985
+ // that a match was found on the previous byte.
1986
+
1987
+ // Build minimum prefix.
1988
+ State* s = params.start;
1989
+ min->clear();
1990
+ for (int i = 0; i < maxlen; i++) {
1991
+ if (previously_visited_states[s] > kMaxEltRepetitions) {
1992
+ VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions
1993
+ << " for state s=" << s << " and min=" << CEscape(*min);
1994
+ break;
1995
+ }
1996
+ previously_visited_states[s]++;
1997
+
1998
+ // Stop if min is a match.
1999
+ State* ns = RunStateOnByteUnlocked(s, kByteEndText);
2000
+ if (ns == NULL) // DFA out of memory
2001
+ return false;
2002
+ if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
2003
+ break;
2004
+
2005
+ // Try to extend the string with low bytes.
2006
+ bool extended = false;
2007
+ for (int j = 0; j < 256; j++) {
2008
+ ns = RunStateOnByteUnlocked(s, j);
2009
+ if (ns == NULL) // DFA out of memory
2010
+ return false;
2011
+ if (ns == FullMatchState ||
2012
+ (ns > SpecialStateMax && ns->ninst_ > 0)) {
2013
+ extended = true;
2014
+ min->append(1, j);
2015
+ s = ns;
2016
+ break;
2017
+ }
2018
+ }
2019
+ if (!extended)
2020
+ break;
2021
+ }
2022
+
2023
+ // Build maximum prefix.
2024
+ previously_visited_states.clear();
2025
+ s = params.start;
2026
+ max->clear();
2027
+ for (int i = 0; i < maxlen; i++) {
2028
+ if (previously_visited_states[s] > kMaxEltRepetitions) {
2029
+ VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions
2030
+ << " for state s=" << s << " and max=" << CEscape(*max);
2031
+ break;
2032
+ }
2033
+ previously_visited_states[s] += 1;
2034
+
2035
+ // Try to extend the string with high bytes.
2036
+ bool extended = false;
2037
+ for (int j = 255; j >= 0; j--) {
2038
+ State* ns = RunStateOnByteUnlocked(s, j);
2039
+ if (ns == NULL)
2040
+ return false;
2041
+ if (ns == FullMatchState ||
2042
+ (ns > SpecialStateMax && ns->ninst_ > 0)) {
2043
+ extended = true;
2044
+ max->append(1, j);
2045
+ s = ns;
2046
+ break;
2047
+ }
2048
+ }
2049
+ if (!extended) {
2050
+ // Done, no need for PrefixSuccessor.
2051
+ return true;
2052
+ }
2053
+ }
2054
+
2055
+ // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
2056
+ *max = PrefixSuccessor(*max);
2057
+
2058
+ // If there are no bytes left, we have no way to say "there is no maximum
2059
+ // string". We could make the interface more complicated and be able to
2060
+ // return "there is no maximum but here is a minimum", but that seems like
2061
+ // overkill -- the most common no-max case is all possible strings, so not
2062
+ // telling the caller that the empty string is the minimum match isn't a
2063
+ // great loss.
2064
+ if (max->empty())
2065
+ return false;
2066
+
2067
+ return true;
2068
+ }
2069
+
2070
+ // PossibleMatchRange for a Prog.
2071
+ bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) {
2072
+ DFA* dfa = NULL;
2073
+ {
2074
+ MutexLock l(&dfa_mutex_);
2075
+ // Have to use dfa_longest_ to get all strings for full matches.
2076
+ // For example, (a|aa) never matches aa in first-match mode.
2077
+ if (dfa_longest_ == NULL) {
2078
+ dfa_longest_ = new DFA(this, Prog::kLongestMatch, dfa_mem_/2);
2079
+ delete_dfa_ = DeleteDFA;
2080
+ }
2081
+ dfa = dfa_longest_;
2082
+ }
2083
+ return dfa->PossibleMatchRange(min, max, maxlen);
2084
+ }
2085
+
2086
+ } // namespace re2