chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,614 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Tested by search_test.cc.
6
+ //
7
+ // Prog::SearchOnePass is an efficient implementation of
8
+ // regular expression search with submatch tracking for
9
+ // what I call "one-pass regular expressions". (An alternate
10
+ // name might be "backtracking-free regular expressions".)
11
+ //
12
+ // One-pass regular expressions have the property that
13
+ // at each input byte during an anchored match, there may be
14
+ // multiple alternatives but only one can proceed for any
15
+ // given input byte.
16
+ //
17
+ // For example, the regexp /x*yx*/ is one-pass: you read
18
+ // x's until a y, then you read the y, then you keep reading x's.
19
+ // At no point do you have to guess what to do or back up
20
+ // and try a different guess.
21
+ //
22
+ // On the other hand, /x*x/ is not one-pass: when you're
23
+ // looking at an input "x", it's not clear whether you should
24
+ // use it to extend the x* or as the final x.
25
+ //
26
+ // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
27
+ // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28
+ //
29
+ // A simple intuition for identifying one-pass regular expressions
30
+ // is that it's always immediately obvious when a repetition ends.
31
+ // It must also be immediately obvious which branch of an | to take:
32
+ //
33
+ // /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
34
+ //
35
+ // The NFA-based search in nfa.cc does some bookkeeping to
36
+ // avoid the need for backtracking and its associated exponential blowup.
37
+ // But if we have a one-pass regular expression, there is no
38
+ // possibility of backtracking, so there is no need for the
39
+ // extra bookkeeping. Hence, this code.
40
+ //
41
+ // On a one-pass regular expression, the NFA code in nfa.cc
42
+ // runs at about 1/20 of the backtracking-based PCRE speed.
43
+ // In contrast, the code in this file runs at about the same
44
+ // speed as PCRE.
45
+ //
46
+ // One-pass regular expressions get used a lot when RE is
47
+ // used for parsing simple strings, so it pays off to
48
+ // notice them and handle them efficiently.
49
+ //
50
+ // See also Anne Brüggemann-Klein and Derick Wood,
51
+ // "One-unambiguous regular languages", Information and Computation 142(2).
52
+
53
+ #include <string.h>
54
+ #include <map>
55
+ #include "util/util.h"
56
+ #include "util/arena.h"
57
+ #include "util/sparse_set.h"
58
+ #include "re2/prog.h"
59
+ #include "re2/stringpiece.h"
60
+
61
+ namespace re2 {
62
+
63
+ static const int Debug = 0;
64
+
65
+ // The key insight behind this implementation is that the
66
+ // non-determinism in an NFA for a one-pass regular expression
67
+ // is contained. To explain what that means, first a
68
+ // refresher about what regular expression programs look like
69
+ // and how the usual NFA execution runs.
70
+ //
71
+ // In a regular expression program, only the kInstByteRange
72
+ // instruction processes an input byte c and moves on to the
73
+ // next byte in the string (it does so if c is in the given range).
74
+ // The kInstByteRange instructions correspond to literal characters
75
+ // and character classes in the regular expression.
76
+ //
77
+ // The kInstAlt instructions are used as wiring to connect the
78
+ // kInstByteRange instructions together in interesting ways when
79
+ // implementing | + and *.
80
+ // The kInstAlt instruction forks execution, like a goto that
81
+ // jumps to ip->out() and ip->out1() in parallel. Each of the
82
+ // resulting computation paths is called a thread.
83
+ //
84
+ // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
85
+ // are interesting in their own right but like kInstAlt they don't
86
+ // advance the input pointer. Only kInstByteRange does.
87
+ //
88
+ // The automaton execution in nfa.cc runs all the possible
89
+ // threads of execution in lock-step over the input. To process
90
+ // a particular byte, each thread gets run until it either dies
91
+ // or finds a kInstByteRange instruction matching the byte.
92
+ // If the latter happens, the thread stops just past the
93
+ // kInstByteRange instruction (at ip->out()) and waits for
94
+ // the other threads to finish processing the input byte.
95
+ // Then, once all the threads have processed that input byte,
96
+ // the whole process repeats. The kInstAlt state instruction
97
+ // might create new threads during input processing, but no
98
+ // matter what, all the threads stop after a kInstByteRange
99
+ // and wait for the other threads to "catch up".
100
+ // Running in lock step like this ensures that the NFA reads
101
+ // the input string only once.
102
+ //
103
+ // Each thread maintains its own set of capture registers
104
+ // (the string positions at which it executed the kInstCapture
105
+ // instructions corresponding to capturing parentheses in the
106
+ // regular expression). Repeated copying of the capture registers
107
+ // is the main performance bottleneck in the NFA implementation.
108
+ //
109
+ // A regular expression program is "one-pass" if, no matter what
110
+ // the input string, there is only one thread that makes it
111
+ // past a kInstByteRange instruction at each input byte. This means
112
+ // that there is in some sense only one active thread throughout
113
+ // the execution. Other threads might be created during the
114
+ // processing of an input byte, but they are ephemeral: only one
115
+ // thread is left to start processing the next input byte.
116
+ // This is what I meant above when I said the non-determinism
117
+ // was "contained".
118
+ //
119
+ // To execute a one-pass regular expression program, we can build
120
+ // a DFA (no non-determinism) that has at most as many states as
121
+ // the NFA (compare this to the possibly exponential number of states
122
+ // in the general case). Each state records, for each possible
123
+ // input byte, the next state along with the conditions required
124
+ // before entering that state -- empty-width flags that must be true
125
+ // and capture operations that must be performed. It also records
126
+ // whether a set of conditions required to finish a match at that
127
+ // point in the input rather than process the next byte.
128
+
129
+ // A state in the one-pass NFA (aka DFA) - just an array of actions.
130
+ struct OneState;
131
+
132
+ // A state in the one-pass NFA - just an array of actions indexed
133
+ // by the bytemap_[] of the next input byte. (The bytemap
134
+ // maps next input bytes into equivalence classes, to reduce
135
+ // the memory footprint.)
136
+ struct OneState {
137
+ uint32 matchcond; // conditions to match right now.
138
+ uint32 action[1];
139
+ };
140
+
141
+ // The uint32 conditions in the action are a combination of
142
+ // condition and capture bits and the next state. The bottom 16 bits
143
+ // are the condition and capture bits, and the top 16 are the index of
144
+ // the next state.
145
+ //
146
+ // Bits 0-5 are the empty-width flags from prog.h.
147
+ // Bit 6 is kMatchWins, which means the match takes
148
+ // priority over moving to next in a first-match search.
149
+ // The remaining bits mark capture registers that should
150
+ // be set to the current input position. The capture bits
151
+ // start at index 2, since the search loop can take care of
152
+ // cap[0], cap[1] (the overall match position).
153
+ // That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
154
+ // No input position can satisfy both kEmptyWordBoundary
155
+ // and kEmptyNonWordBoundary, so we can use that as a sentinel
156
+ // instead of needing an extra bit.
157
+
158
+ static const int kIndexShift = 16; // number of bits below index
159
+ static const int kEmptyShift = 6; // number of empty flags in prog.h
160
+ static const int kRealCapShift = kEmptyShift + 1;
161
+ static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
162
+
163
+ // Parameters used to skip over cap[0], cap[1].
164
+ static const int kCapShift = kRealCapShift - 2;
165
+ static const int kMaxCap = kRealMaxCap + 2;
166
+
167
+ static const uint32 kMatchWins = 1 << kEmptyShift;
168
+ static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
169
+
170
+ static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
171
+
172
+ // Check, at compile time, that prog.h agrees with math above.
173
+ // This function is never called.
174
+ void OnePass_Checks() {
175
+ COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
176
+ kEmptyShift_disagrees_with_kEmptyAllFlags);
177
+ // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
178
+ COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
179
+ kMaxCap_disagrees_with_kMaxOnePassCapture);
180
+ }
181
+
182
+ static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
183
+ uint32 satisfied = Prog::EmptyFlags(context, p);
184
+ if (cond & kEmptyAllFlags & ~satisfied)
185
+ return false;
186
+ return true;
187
+ }
188
+
189
+ // Apply the capture bits in cond, saving p to the appropriate
190
+ // locations in cap[].
191
+ static void ApplyCaptures(uint32 cond, const char* p,
192
+ const char** cap, int ncap) {
193
+ for (int i = 2; i < ncap; i++)
194
+ if (cond & (1 << kCapShift << i))
195
+ cap[i] = p;
196
+ }
197
+
198
+ // Compute a node pointer.
199
+ // Basically (OneState*)(nodes + statesize*nodeindex)
200
+ // but the version with the C++ casts overflows 80 characters (and is ugly).
201
+ static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
202
+ int nodeindex) {
203
+ return reinterpret_cast<OneState*>(
204
+ const_cast<uint8*>(nodes + statesize*nodeindex));
205
+ }
206
+
207
+ bool Prog::SearchOnePass(const StringPiece& text,
208
+ const StringPiece& const_context,
209
+ Anchor anchor, MatchKind kind,
210
+ StringPiece* match, int nmatch) {
211
+ if (anchor != kAnchored && kind != kFullMatch) {
212
+ LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
213
+ return false;
214
+ }
215
+
216
+ // Make sure we have at least cap[1],
217
+ // because we use it to tell if we matched.
218
+ int ncap = 2*nmatch;
219
+ if (ncap < 2)
220
+ ncap = 2;
221
+
222
+ const char* cap[kMaxCap];
223
+ for (int i = 0; i < ncap; i++)
224
+ cap[i] = NULL;
225
+
226
+ const char* matchcap[kMaxCap];
227
+ for (int i = 0; i < ncap; i++)
228
+ matchcap[i] = NULL;
229
+
230
+ StringPiece context = const_context;
231
+ if (context.begin() == NULL)
232
+ context = text;
233
+ if (anchor_start() && context.begin() != text.begin())
234
+ return false;
235
+ if (anchor_end() && context.end() != text.end())
236
+ return false;
237
+ if (anchor_end())
238
+ kind = kFullMatch;
239
+
240
+ // State and act are marked volatile to
241
+ // keep the compiler from re-ordering the
242
+ // memory accesses walking over the NFA.
243
+ // This is worth about 5%.
244
+ volatile OneState* state = onepass_start_;
245
+ volatile uint8* nodes = onepass_nodes_;
246
+ volatile uint32 statesize = onepass_statesize_;
247
+ uint8* bytemap = bytemap_;
248
+ const char* bp = text.begin();
249
+ const char* ep = text.end();
250
+ const char* p;
251
+ bool matched = false;
252
+ matchcap[0] = bp;
253
+ cap[0] = bp;
254
+ uint32 nextmatchcond = state->matchcond;
255
+ for (p = bp; p < ep; p++) {
256
+ int c = bytemap[*p & 0xFF];
257
+ uint32 matchcond = nextmatchcond;
258
+ uint32 cond = state->action[c];
259
+
260
+ // Determine whether we can reach act->next.
261
+ // If so, advance state and nextmatchcond.
262
+ if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
263
+ uint32 nextindex = cond >> kIndexShift;
264
+ state = IndexToNode(nodes, statesize, nextindex);
265
+ nextmatchcond = state->matchcond;
266
+ } else {
267
+ state = NULL;
268
+ nextmatchcond = kImpossible;
269
+ }
270
+
271
+ // This code section is carefully tuned.
272
+ // The goto sequence is about 10% faster than the
273
+ // obvious rewrite as a large if statement in the
274
+ // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
275
+
276
+ // Saving the match capture registers is expensive.
277
+ // Is this intermediate match worth thinking about?
278
+
279
+ // Not if we want a full match.
280
+ if (kind == kFullMatch)
281
+ goto skipmatch;
282
+
283
+ // Not if it's impossible.
284
+ if (matchcond == kImpossible)
285
+ goto skipmatch;
286
+
287
+ // Not if the possible match is beaten by the certain
288
+ // match at the next byte. When this test is useless
289
+ // (e.g., HTTPPartialMatchRE2) it slows the loop by
290
+ // about 10%, but when it avoids work (e.g., DotMatchRE2),
291
+ // it cuts the loop execution by about 45%.
292
+ if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
293
+ goto skipmatch;
294
+
295
+ // Finally, the match conditions must be satisfied.
296
+ if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
297
+ for (int i = 2; i < 2*nmatch; i++)
298
+ matchcap[i] = cap[i];
299
+ if (nmatch > 1 && (matchcond & kCapMask))
300
+ ApplyCaptures(matchcond, p, matchcap, ncap);
301
+ matchcap[1] = p;
302
+ matched = true;
303
+
304
+ // If we're in longest match mode, we have to keep
305
+ // going and see if we find a longer match.
306
+ // In first match mode, we can stop if the match
307
+ // takes priority over the next state for this input byte.
308
+ // That bit is per-input byte and thus in cond, not matchcond.
309
+ if (kind == kFirstMatch && (cond & kMatchWins))
310
+ goto done;
311
+ }
312
+
313
+ skipmatch:
314
+ if (state == NULL)
315
+ goto done;
316
+ if ((cond & kCapMask) && nmatch > 1)
317
+ ApplyCaptures(cond, p, cap, ncap);
318
+ }
319
+
320
+ // Look for match at end of input.
321
+ {
322
+ uint32 matchcond = state->matchcond;
323
+ if (matchcond != kImpossible &&
324
+ ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
325
+ if (nmatch > 1 && (matchcond & kCapMask))
326
+ ApplyCaptures(matchcond, p, cap, ncap);
327
+ for (int i = 2; i < ncap; i++)
328
+ matchcap[i] = cap[i];
329
+ matchcap[1] = p;
330
+ matched = true;
331
+ }
332
+ }
333
+
334
+ done:
335
+ if (!matched)
336
+ return false;
337
+ for (int i = 0; i < nmatch; i++)
338
+ match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
339
+ return true;
340
+ }
341
+
342
+
343
+ // Analysis to determine whether a given regexp program is one-pass.
344
+
345
+ // If ip is not on workq, adds ip to work queue and returns true.
346
+ // If ip is already on work queue, does nothing and returns false.
347
+ // If ip is NULL, does nothing and returns true (pretends to add it).
348
+ typedef SparseSet Instq;
349
+ static bool AddQ(Instq *q, int id) {
350
+ if (id == 0)
351
+ return true;
352
+ if (q->contains(id))
353
+ return false;
354
+ q->insert(id);
355
+ return true;
356
+ }
357
+
358
+ struct InstCond {
359
+ int id;
360
+ uint32 cond;
361
+ };
362
+
363
+ // Returns whether this is a one-pass program; that is,
364
+ // returns whether it is safe to use SearchOnePass on this program.
365
+ // These conditions must be true for any instruction ip:
366
+ //
367
+ // (1) for any other Inst nip, there is at most one input-free
368
+ // path from ip to nip.
369
+ // (2) there is at most one kInstByte instruction reachable from
370
+ // ip that matches any particular byte c.
371
+ // (3) there is at most one input-free path from ip to a kInstMatch
372
+ // instruction.
373
+ //
374
+ // This is actually just a conservative approximation: it might
375
+ // return false when the answer is true, when kInstEmptyWidth
376
+ // instructions are involved.
377
+ // Constructs and saves corresponding one-pass NFA on success.
378
+ bool Prog::IsOnePass() {
379
+ if (did_onepass_)
380
+ return onepass_start_ != NULL;
381
+ did_onepass_ = true;
382
+
383
+ if (start() == 0) // no match
384
+ return false;
385
+
386
+ // Steal memory for the one-pass NFA from the overall DFA budget.
387
+ // Willing to use at most 1/4 of the DFA budget (heuristic).
388
+ // Limit max node count to 65000 as a conservative estimate to
389
+ // avoid overflowing 16-bit node index in encoding.
390
+ int maxnodes = 2 + byte_inst_count_;
391
+ int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
392
+ if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
393
+ return false;
394
+
395
+ // Flood the graph starting at the start state, and check
396
+ // that in each reachable state, each possible byte leads
397
+ // to a unique next state.
398
+ int size = this->size();
399
+ InstCond *stack = new InstCond[size];
400
+
401
+ int* nodebyid = new int[size]; // indexed by ip
402
+ memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
403
+
404
+ uint8* nodes = new uint8[maxnodes*statesize];
405
+ uint8* nodep = nodes;
406
+
407
+ Instq tovisit(size), workq(size);
408
+ AddQ(&tovisit, start());
409
+ nodebyid[start()] = 0;
410
+ nodep += statesize;
411
+ int nalloc = 1;
412
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
413
+ int id = *it;
414
+ int nodeindex = nodebyid[id];
415
+ OneState* node = IndexToNode(nodes, statesize, nodeindex);
416
+
417
+ // Flood graph using manual stack, filling in actions as found.
418
+ // Default is none.
419
+ for (int b = 0; b < bytemap_range_; b++)
420
+ node->action[b] = kImpossible;
421
+ node->matchcond = kImpossible;
422
+
423
+ workq.clear();
424
+ bool matched = false;
425
+ int nstack = 0;
426
+ stack[nstack].id = id;
427
+ stack[nstack++].cond = 0;
428
+ while (nstack > 0) {
429
+ int id = stack[--nstack].id;
430
+ Prog::Inst* ip = inst(id);
431
+ uint32 cond = stack[nstack].cond;
432
+ switch (ip->opcode()) {
433
+ case kInstAltMatch:
434
+ // TODO(rsc): Ignoring kInstAltMatch optimization.
435
+ // Should implement it in this engine, but it's subtle.
436
+ // Fall through.
437
+ case kInstAlt:
438
+ // If already on work queue, (1) is violated: bail out.
439
+ if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
440
+ goto fail;
441
+ stack[nstack].id = ip->out1();
442
+ stack[nstack++].cond = cond;
443
+ stack[nstack].id = ip->out();
444
+ stack[nstack++].cond = cond;
445
+ break;
446
+
447
+ case kInstByteRange: {
448
+ int nextindex = nodebyid[ip->out()];
449
+ if (nextindex == -1) {
450
+ if (nalloc >= maxnodes) {
451
+ if (Debug)
452
+ LOG(ERROR)
453
+ << StringPrintf("Not OnePass: hit node limit %d > %d",
454
+ nalloc, maxnodes);
455
+ goto fail;
456
+ }
457
+ nextindex = nalloc;
458
+ nodep += statesize;
459
+ nodebyid[ip->out()] = nextindex;
460
+ nalloc++;
461
+ AddQ(&tovisit, ip->out());
462
+ }
463
+ if (matched)
464
+ cond |= kMatchWins;
465
+ for (int c = ip->lo(); c <= ip->hi(); c++) {
466
+ int b = bytemap_[c];
467
+ c = unbytemap_[b]; // last c in byte class
468
+ uint32 act = node->action[b];
469
+ uint32 newact = (nextindex << kIndexShift) | cond;
470
+ if ((act & kImpossible) == kImpossible) {
471
+ node->action[b] = newact;
472
+ } else if (act != newact) {
473
+ if (Debug) {
474
+ LOG(ERROR)
475
+ << StringPrintf("Not OnePass: conflict on byte "
476
+ "%#x at state %d",
477
+ c, *it);
478
+ }
479
+ goto fail;
480
+ }
481
+ }
482
+ if (ip->foldcase()) {
483
+ Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
484
+ Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
485
+ for (int c = lo; c <= hi; c++) {
486
+ int b = bytemap_[c];
487
+ c = unbytemap_[b]; // last c in class
488
+ uint32 act = node->action[b];
489
+ uint32 newact = (nextindex << kIndexShift) | cond;
490
+ if ((act & kImpossible) == kImpossible) {
491
+ node->action[b] = newact;
492
+ } else if (act != newact) {
493
+ if (Debug) {
494
+ LOG(ERROR)
495
+ << StringPrintf("Not OnePass: conflict on byte "
496
+ "%#x at state %d",
497
+ c, *it);
498
+ }
499
+ goto fail;
500
+ }
501
+ }
502
+ }
503
+ break;
504
+ }
505
+
506
+ case kInstCapture:
507
+ if (ip->cap() < kMaxCap)
508
+ cond |= (1 << kCapShift) << ip->cap();
509
+ goto QueueEmpty;
510
+
511
+ case kInstEmptyWidth:
512
+ cond |= ip->empty();
513
+ goto QueueEmpty;
514
+
515
+ case kInstNop:
516
+ QueueEmpty:
517
+ // kInstCapture and kInstNop always proceed to ip->out().
518
+ // kInstEmptyWidth only sometimes proceeds to ip->out(),
519
+ // but as a conservative approximation we assume it always does.
520
+ // We could be a little more precise by looking at what c
521
+ // is, but that seems like overkill.
522
+
523
+ // If already on work queue, (1) is violated: bail out.
524
+ if (!AddQ(&workq, ip->out())) {
525
+ if (Debug) {
526
+ LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
527
+ " %d -> %d\n",
528
+ *it, ip->out());
529
+ }
530
+ goto fail;
531
+ }
532
+ stack[nstack].id = ip->out();
533
+ stack[nstack++].cond = cond;
534
+ break;
535
+
536
+ case kInstMatch:
537
+ if (matched) {
538
+ // (3) is violated
539
+ if (Debug) {
540
+ LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
541
+ " from %d\n", *it);
542
+ }
543
+ goto fail;
544
+ }
545
+ matched = true;
546
+ node->matchcond = cond;
547
+ break;
548
+
549
+ case kInstFail:
550
+ break;
551
+ }
552
+ }
553
+ }
554
+
555
+ if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
556
+ string dump = "prog dump:\n" + Dump() + "node dump\n";
557
+ map<int, int> idmap;
558
+ for (int i = 0; i < size; i++)
559
+ if (nodebyid[i] != -1)
560
+ idmap[nodebyid[i]] = i;
561
+
562
+ StringAppendF(&dump, "byte ranges:\n");
563
+ int i = 0;
564
+ for (int b = 0; b < bytemap_range_; b++) {
565
+ int lo = i;
566
+ while (bytemap_[i] == b)
567
+ i++;
568
+ StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
569
+ }
570
+
571
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
572
+ int id = *it;
573
+ int nodeindex = nodebyid[id];
574
+ if (nodeindex == -1)
575
+ continue;
576
+ OneState* node = IndexToNode(nodes, statesize, nodeindex);
577
+ string s;
578
+ StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
579
+ nodeindex, id, node->matchcond);
580
+ for (int i = 0; i < bytemap_range_; i++) {
581
+ if ((node->action[i] & kImpossible) == kImpossible)
582
+ continue;
583
+ StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
584
+ i, node->action[i] & 0xFFFF,
585
+ node->action[i] >> kIndexShift,
586
+ idmap[node->action[i] >> kIndexShift]);
587
+ }
588
+ }
589
+ LOG(ERROR) << dump;
590
+ }
591
+
592
+ // Overallocated earlier; cut down to actual size.
593
+ nodep = new uint8[nalloc*statesize];
594
+ memmove(nodep, nodes, nalloc*statesize);
595
+ delete[] nodes;
596
+ nodes = nodep;
597
+
598
+ onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
599
+ onepass_nodes_ = nodes;
600
+ onepass_statesize_ = statesize;
601
+ dfa_mem_ -= nalloc*statesize;
602
+
603
+ delete[] stack;
604
+ delete[] nodebyid;
605
+ return true;
606
+
607
+ fail:
608
+ delete[] stack;
609
+ delete[] nodebyid;
610
+ delete[] nodes;
611
+ return false;
612
+ }
613
+
614
+ } // namespace re2