chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/nfa.cc ADDED
@@ -0,0 +1,709 @@
1
+ // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Tested by search_test.cc.
6
+ //
7
+ // Prog::SearchNFA, an NFA search.
8
+ // This is an actual NFA like the theorists talk about,
9
+ // not the pseudo-NFA found in backtracking regexp implementations.
10
+ //
11
+ // IMPLEMENTATION
12
+ //
13
+ // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14
+ // which is a variant of the one described in Thompson's 1968 CACM paper.
15
+ // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16
+ // over the DFA implementation is that it tracks submatch boundaries.
17
+ //
18
+ // When the choice of submatch boundaries is ambiguous, this particular
19
+ // implementation makes the same choices that traditional backtracking
20
+ // implementations (in particular, Perl and PCRE) do.
21
+ // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22
+ // time in the length of the input.
23
+ //
24
+ // Like Thompson's original machine and like the DFA implementation, this
25
+ // implementation notices a match only once it is one byte past it.
26
+
27
+ #include "re2/prog.h"
28
+ #include "re2/regexp.h"
29
+ #include "util/sparse_array.h"
30
+ #include "util/sparse_set.h"
31
+
32
+ namespace re2 {
33
+
34
+ class NFA {
35
+ public:
36
+ NFA(Prog* prog);
37
+ ~NFA();
38
+
39
+ // Searches for a matching string.
40
+ // * If anchored is true, only considers matches starting at offset.
41
+ // Otherwise finds lefmost match at or after offset.
42
+ // * If longest is true, returns the longest match starting
43
+ // at the chosen start point. Otherwise returns the so-called
44
+ // left-biased match, the one traditional backtracking engines
45
+ // (like Perl and PCRE) find.
46
+ // Records submatch boundaries in submatch[1..nsubmatch-1].
47
+ // Submatch[0] is the entire match. When there is a choice in
48
+ // which text matches each subexpression, the submatch boundaries
49
+ // are chosen to match what a backtracking implementation would choose.
50
+ bool Search(const StringPiece& text, const StringPiece& context,
51
+ bool anchored, bool longest,
52
+ StringPiece* submatch, int nsubmatch);
53
+
54
+ static const int Debug = 0;
55
+
56
+ private:
57
+ struct Thread {
58
+ union {
59
+ int id;
60
+ Thread* next; // when on free list
61
+ };
62
+ const char** capture;
63
+ };
64
+
65
+ // State for explicit stack in AddToThreadq.
66
+ struct AddState {
67
+ int id; // Inst to process
68
+ int j;
69
+ const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
70
+
71
+ AddState()
72
+ : id(0), j(-1), cap_j(NULL) {}
73
+ explicit AddState(int id)
74
+ : id(id), j(-1), cap_j(NULL) {}
75
+ AddState(int id, const char* cap_j, int j)
76
+ : id(id), j(j), cap_j(cap_j) {}
77
+ };
78
+
79
+ // Threadq is a list of threads. The list is sorted by the order
80
+ // in which Perl would explore that particular state -- the earlier
81
+ // choices appear earlier in the list.
82
+ typedef SparseArray<Thread*> Threadq;
83
+
84
+ inline Thread* AllocThread();
85
+ inline void FreeThread(Thread*);
86
+
87
+ // Add r (or its children, following unlabeled arrows)
88
+ // to the workqueue q with associated capture info.
89
+ void AddToThreadq(Threadq* q, int id, int flag,
90
+ const char* p, const char** capture);
91
+
92
+ // Run runq on byte c, appending new states to nextq.
93
+ // Updates matched_ and match_ as new, better matches are found.
94
+ // p is position of the next byte (the one after c)
95
+ // in the input string, used when processing capturing parens.
96
+ // flag is the bitwise or of Bol, Eol, etc., specifying whether
97
+ // ^, $ and \b match the current input point (after c).
98
+ inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
99
+
100
+ // Returns text version of capture information, for debugging.
101
+ string FormatCapture(const char** capture);
102
+
103
+ inline void CopyCapture(const char** dst, const char** src);
104
+
105
+ // Computes whether all matches must begin with the same first
106
+ // byte, and if so, returns that byte. If not, returns -1.
107
+ int ComputeFirstByte();
108
+
109
+ Prog* prog_; // underlying program
110
+ int start_; // start instruction in program
111
+ int ncapture_; // number of submatches to track
112
+ bool longest_; // whether searching for longest match
113
+ bool endmatch_; // whether match must end at text.end()
114
+ const char* btext_; // beginning of text being matched (for FormatSubmatch)
115
+ const char* etext_; // end of text being matched (for endmatch_)
116
+ Threadq q0_, q1_; // pre-allocated for Search.
117
+ const char** match_; // best match so far
118
+ bool matched_; // any match so far?
119
+ AddState* astack_; // pre-allocated for AddToThreadq
120
+ int nastack_;
121
+ int first_byte_; // required first byte for match, or -1 if none
122
+
123
+ Thread* free_threads_; // free list
124
+
125
+ DISALLOW_EVIL_CONSTRUCTORS(NFA);
126
+ };
127
+
128
+ NFA::NFA(Prog* prog) {
129
+ prog_ = prog;
130
+ start_ = prog->start();
131
+ ncapture_ = 0;
132
+ longest_ = false;
133
+ endmatch_ = false;
134
+ btext_ = NULL;
135
+ etext_ = NULL;
136
+ q0_.resize(prog_->size());
137
+ q1_.resize(prog_->size());
138
+ nastack_ = 2*prog_->size();
139
+ astack_ = new AddState[nastack_];
140
+ match_ = NULL;
141
+ matched_ = false;
142
+ free_threads_ = NULL;
143
+ first_byte_ = ComputeFirstByte();
144
+ }
145
+
146
+ NFA::~NFA() {
147
+ delete[] match_;
148
+ delete[] astack_;
149
+ Thread* next;
150
+ for (Thread* t = free_threads_; t; t = next) {
151
+ next = t->next;
152
+ delete[] t->capture;
153
+ delete t;
154
+ }
155
+ }
156
+
157
+ void NFA::FreeThread(Thread *t) {
158
+ if (t == NULL)
159
+ return;
160
+ t->next = free_threads_;
161
+ free_threads_ = t;
162
+ }
163
+
164
+ NFA::Thread* NFA::AllocThread() {
165
+ Thread* t = free_threads_;
166
+ if (t == NULL) {
167
+ t = new Thread;
168
+ t->capture = new const char*[ncapture_];
169
+ return t;
170
+ }
171
+ free_threads_ = t->next;
172
+ return t;
173
+ }
174
+
175
+ void NFA::CopyCapture(const char** dst, const char** src) {
176
+ for (int i = 0; i < ncapture_; i+=2) {
177
+ dst[i] = src[i];
178
+ dst[i+1] = src[i+1];
179
+ }
180
+ }
181
+
182
+ // Follows all empty arrows from r and enqueues all the states reached.
183
+ // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
184
+ // The pointer p is the current input position, and m is the
185
+ // current set of match boundaries.
186
+ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
187
+ const char* p, const char** capture) {
188
+ if (id0 == 0)
189
+ return;
190
+
191
+ // Astack_ is pre-allocated to avoid resize operations.
192
+ // It has room for 2*prog_->size() entries, which is enough:
193
+ // Each inst in prog can be processed at most once,
194
+ // pushing at most two entries on stk.
195
+
196
+ int nstk = 0;
197
+ AddState* stk = astack_;
198
+ stk[nstk++] = AddState(id0);
199
+
200
+ while (nstk > 0) {
201
+ DCHECK_LE(nstk, nastack_);
202
+ const AddState& a = stk[--nstk];
203
+ if (a.j >= 0)
204
+ capture[a.j] = a.cap_j;
205
+
206
+ int id = a.id;
207
+ if (id == 0)
208
+ continue;
209
+ if (q->has_index(id)) {
210
+ if (Debug)
211
+ fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
212
+ continue;
213
+ }
214
+
215
+ // Create entry in q no matter what. We might fill it in below,
216
+ // or we might not. Even if not, it is necessary to have it,
217
+ // so that we don't revisit r during the recursion.
218
+ q->set_new(id, NULL);
219
+
220
+ Thread** tp = &q->find(id)->second;
221
+ int j;
222
+ Thread* t;
223
+ Prog::Inst* ip = prog_->inst(id);
224
+ switch (ip->opcode()) {
225
+ default:
226
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
227
+ break;
228
+
229
+ case kInstFail:
230
+ break;
231
+
232
+ case kInstAltMatch:
233
+ // Save state; will pick up at next byte.
234
+ t = AllocThread();
235
+ t->id = id;
236
+ CopyCapture(t->capture, capture);
237
+ *tp = t;
238
+ // fall through
239
+
240
+ case kInstAlt:
241
+ // Explore alternatives.
242
+ stk[nstk++] = AddState(ip->out1());
243
+ stk[nstk++] = AddState(ip->out());
244
+ break;
245
+
246
+ case kInstNop:
247
+ // Continue on.
248
+ stk[nstk++] = AddState(ip->out());
249
+ break;
250
+
251
+ case kInstCapture:
252
+ if ((j=ip->cap()) < ncapture_) {
253
+ // Push a dummy whose only job is to restore capture[j]
254
+ // once we finish exploring this possibility.
255
+ stk[nstk++] = AddState(0, capture[j], j);
256
+
257
+ // Record capture.
258
+ capture[j] = p;
259
+ }
260
+ stk[nstk++] = AddState(ip->out());
261
+ break;
262
+
263
+ case kInstMatch:
264
+ case kInstByteRange:
265
+ // Save state; will pick up at next byte.
266
+ t = AllocThread();
267
+ t->id = id;
268
+ CopyCapture(t->capture, capture);
269
+ *tp = t;
270
+ if (Debug)
271
+ fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
272
+ break;
273
+
274
+ case kInstEmptyWidth:
275
+ // Continue on if we have all the right flag bits.
276
+ if (ip->empty() & ~flag)
277
+ break;
278
+ stk[nstk++] = AddState(ip->out());
279
+ break;
280
+ }
281
+ }
282
+ }
283
+
284
+ // Run runq on byte c, appending new states to nextq.
285
+ // Updates match as new, better matches are found.
286
+ // p is position of the byte c in the input string,
287
+ // used when processing capturing parens.
288
+ // flag is the bitwise or of Bol, Eol, etc., specifying whether
289
+ // ^, $ and \b match the current input point (after c).
290
+ // Frees all the threads on runq.
291
+ // If there is a shortcut to the end, returns that shortcut.
292
+ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
293
+ nextq->clear();
294
+
295
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
296
+ Thread* t = i->second;
297
+ if (t == NULL)
298
+ continue;
299
+
300
+ if (longest_) {
301
+ // Can skip any threads started after our current best match.
302
+ if (matched_ && match_[0] < t->capture[0]) {
303
+ FreeThread(t);
304
+ continue;
305
+ }
306
+ }
307
+
308
+ int id = t->id;
309
+ Prog::Inst* ip = prog_->inst(id);
310
+
311
+ switch (ip->opcode()) {
312
+ default:
313
+ // Should only see the values handled below.
314
+ LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
315
+ break;
316
+
317
+ case kInstByteRange:
318
+ if (ip->Matches(c))
319
+ AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
320
+ break;
321
+
322
+ case kInstAltMatch:
323
+ if (i != runq->begin())
324
+ break;
325
+ // The match is ours if we want it.
326
+ if (ip->greedy(prog_) || longest_) {
327
+ CopyCapture((const char**)match_, t->capture);
328
+ FreeThread(t);
329
+ for (++i; i != runq->end(); ++i)
330
+ FreeThread(i->second);
331
+ runq->clear();
332
+ matched_ = true;
333
+ if (ip->greedy(prog_))
334
+ return ip->out1();
335
+ return ip->out();
336
+ }
337
+ break;
338
+
339
+ case kInstMatch:
340
+ if (endmatch_ && p != etext_)
341
+ break;
342
+
343
+ const char* old = t->capture[1]; // previous end pointer
344
+ t->capture[1] = p;
345
+ if (longest_) {
346
+ // Leftmost-longest mode: save this match only if
347
+ // it is either farther to the left or at the same
348
+ // point but longer than an existing match.
349
+ if (!matched_ || t->capture[0] < match_[0] ||
350
+ (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
351
+ CopyCapture((const char**)match_, t->capture);
352
+ } else {
353
+ // Leftmost-biased mode: this match is by definition
354
+ // better than what we've already found (see next line).
355
+ CopyCapture((const char**)match_, t->capture);
356
+
357
+ // Cut off the threads that can only find matches
358
+ // worse than the one we just found: don't run the
359
+ // rest of the current Threadq.
360
+ t->capture[0] = old;
361
+ FreeThread(t);
362
+ for (++i; i != runq->end(); ++i)
363
+ FreeThread(i->second);
364
+ runq->clear();
365
+ matched_ = true;
366
+ return 0;
367
+ }
368
+ t->capture[0] = old;
369
+ matched_ = true;
370
+ break;
371
+ }
372
+ FreeThread(t);
373
+ }
374
+ runq->clear();
375
+ return 0;
376
+ }
377
+
378
+ string NFA::FormatCapture(const char** capture) {
379
+ string s;
380
+
381
+ for (int i = 0; i < ncapture_; i+=2) {
382
+ if (capture[i] == NULL)
383
+ StringAppendF(&s, "(?,?)");
384
+ else if (capture[i+1] == NULL)
385
+ StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
386
+ else
387
+ StringAppendF(&s, "(%d,%d)",
388
+ (int)(capture[i] - btext_),
389
+ (int)(capture[i+1] - btext_));
390
+ }
391
+ return s;
392
+ }
393
+
394
+ // Returns whether haystack contains needle's memory.
395
+ static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
396
+ return haystack.begin() <= needle.begin() &&
397
+ haystack.end() >= needle.end();
398
+ }
399
+
400
+ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
401
+ bool anchored, bool longest,
402
+ StringPiece* submatch, int nsubmatch) {
403
+ if (start_ == 0)
404
+ return false;
405
+
406
+ StringPiece context = const_context;
407
+ if (context.begin() == NULL)
408
+ context = text;
409
+
410
+ if (!StringPieceContains(context, text)) {
411
+ LOG(FATAL) << "Bad args: context does not contain text "
412
+ << reinterpret_cast<const void*>(context.begin())
413
+ << "+" << context.size() << " "
414
+ << reinterpret_cast<const void*>(text.begin())
415
+ << "+" << text.size();
416
+ return false;
417
+ }
418
+
419
+ if (prog_->anchor_start() && context.begin() != text.begin())
420
+ return false;
421
+ if (prog_->anchor_end() && context.end() != text.end())
422
+ return false;
423
+ anchored |= prog_->anchor_start();
424
+ if (prog_->anchor_end()) {
425
+ longest = true;
426
+ endmatch_ = true;
427
+ etext_ = text.end();
428
+ }
429
+
430
+ if (nsubmatch < 0) {
431
+ LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
432
+ return false;
433
+ }
434
+
435
+ // Save search parameters.
436
+ ncapture_ = 2*nsubmatch;
437
+ longest_ = longest;
438
+
439
+ if (nsubmatch == 0) {
440
+ // We need to maintain match[0], both to distinguish the
441
+ // longest match (if longest is true) and also to tell
442
+ // whether we've seen any matches at all.
443
+ ncapture_ = 2;
444
+ }
445
+
446
+ match_ = new const char*[ncapture_];
447
+ matched_ = false;
448
+ memset(match_, 0, ncapture_*sizeof match_[0]);
449
+
450
+ // For debugging prints.
451
+ btext_ = context.begin();
452
+
453
+ if (Debug) {
454
+ fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
455
+ text.as_string().c_str(), context.as_string().c_str(), anchored,
456
+ longest);
457
+ }
458
+
459
+ // Set up search.
460
+ Threadq* runq = &q0_;
461
+ Threadq* nextq = &q1_;
462
+ runq->clear();
463
+ nextq->clear();
464
+ memset(&match_[0], 0, ncapture_*sizeof match_[0]);
465
+ const char* bp = context.begin();
466
+ int c = -1;
467
+ int wasword = 0;
468
+
469
+ if (text.begin() > context.begin()) {
470
+ c = text.begin()[-1] & 0xFF;
471
+ wasword = Prog::IsWordChar(c);
472
+ }
473
+
474
+ // Loop over the text, stepping the machine.
475
+ for (const char* p = text.begin();; p++) {
476
+ // Check for empty-width specials.
477
+ int flag = 0;
478
+
479
+ // ^ and \A
480
+ if (p == context.begin())
481
+ flag |= kEmptyBeginText | kEmptyBeginLine;
482
+ else if (p <= context.end() && p[-1] == '\n')
483
+ flag |= kEmptyBeginLine;
484
+
485
+ // $ and \z
486
+ if (p == context.end())
487
+ flag |= kEmptyEndText | kEmptyEndLine;
488
+ else if (p < context.end() && p[0] == '\n')
489
+ flag |= kEmptyEndLine;
490
+
491
+ // \b and \B
492
+ int isword = 0;
493
+ if (p < context.end())
494
+ isword = Prog::IsWordChar(p[0] & 0xFF);
495
+
496
+ if (isword != wasword)
497
+ flag |= kEmptyWordBoundary;
498
+ else
499
+ flag |= kEmptyNonWordBoundary;
500
+
501
+ if (Debug) {
502
+ fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
503
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
504
+ Thread* t = i->second;
505
+ if (t == NULL)
506
+ continue;
507
+ fprintf(stderr, " %d%s", t->id,
508
+ FormatCapture((const char**)t->capture).c_str());
509
+ }
510
+ fprintf(stderr, "\n");
511
+ }
512
+
513
+ // Process previous character (waited until now to avoid
514
+ // repeating the flag computation above).
515
+ // This is a no-op the first time around the loop, because
516
+ // runq is empty.
517
+ int id = Step(runq, nextq, c, flag, p-1);
518
+ DCHECK_EQ(runq->size(), 0);
519
+ swap(nextq, runq);
520
+ nextq->clear();
521
+ if (id != 0) {
522
+ // We're done: full match ahead.
523
+ p = text.end();
524
+ for (;;) {
525
+ Prog::Inst* ip = prog_->inst(id);
526
+ switch (ip->opcode()) {
527
+ default:
528
+ LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529
+ break;
530
+
531
+ case kInstCapture:
532
+ match_[ip->cap()] = p;
533
+ id = ip->out();
534
+ continue;
535
+
536
+ case kInstNop:
537
+ id = ip->out();
538
+ continue;
539
+
540
+ case kInstMatch:
541
+ match_[1] = p;
542
+ matched_ = true;
543
+ break;
544
+
545
+ case kInstEmptyWidth:
546
+ if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
547
+ LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
548
+ break;
549
+ }
550
+ id = ip->out();
551
+ continue;
552
+ }
553
+ break;
554
+ }
555
+ break;
556
+ }
557
+
558
+ if (p > text.end())
559
+ break;
560
+
561
+ // Start a new thread if there have not been any matches.
562
+ // (No point in starting a new thread if there have been
563
+ // matches, since it would be to the right of the match
564
+ // we already found.)
565
+ if (!matched_ && (!anchored || p == text.begin())) {
566
+ // If there's a required first byte for an unanchored search
567
+ // and we're not in the middle of any possible matches,
568
+ // use memchr to search for the byte quickly.
569
+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
570
+ p < text.end() && (p[0] & 0xFF) != first_byte_) {
571
+ p = reinterpret_cast<const char*>(memchr(p, first_byte_,
572
+ text.end() - p));
573
+ if (p == NULL) {
574
+ p = text.end();
575
+ isword = 0;
576
+ } else {
577
+ isword = Prog::IsWordChar(p[0] & 0xFF);
578
+ }
579
+ flag = Prog::EmptyFlags(context, p);
580
+ }
581
+
582
+ // Steal match storage (cleared but unused as of yet)
583
+ // temporarily to hold match boundaries for new thread.
584
+ match_[0] = p;
585
+ AddToThreadq(runq, start_, flag, p, match_);
586
+ match_[0] = NULL;
587
+ }
588
+
589
+ // If all the threads have died, stop early.
590
+ if (runq->size() == 0) {
591
+ if (Debug)
592
+ fprintf(stderr, "dead\n");
593
+ break;
594
+ }
595
+
596
+ if (p == text.end())
597
+ c = 0;
598
+ else
599
+ c = *p & 0xFF;
600
+ wasword = isword;
601
+
602
+ // Will run step(runq, nextq, c, ...) on next iteration. See above.
603
+ }
604
+
605
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
606
+ FreeThread(i->second);
607
+
608
+ if (matched_) {
609
+ for (int i = 0; i < nsubmatch; i++)
610
+ submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
611
+ if (Debug)
612
+ fprintf(stderr, "match (%d,%d)\n",
613
+ static_cast<int>(match_[0] - btext_),
614
+ static_cast<int>(match_[1] - btext_));
615
+ return true;
616
+ }
617
+ VLOG(1) << "No matches found";
618
+ return false;
619
+ }
620
+
621
+ // Computes whether all successful matches have a common first byte,
622
+ // and if so, returns that byte. If not, returns -1.
623
+ int NFA::ComputeFirstByte() {
624
+ if (start_ == 0)
625
+ return -1;
626
+
627
+ int b = -1; // first byte, not yet computed
628
+
629
+ typedef SparseSet Workq;
630
+ Workq q(prog_->size());
631
+ q.insert(start_);
632
+ for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
633
+ int id = *it;
634
+ Prog::Inst* ip = prog_->inst(id);
635
+ switch (ip->opcode()) {
636
+ default:
637
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
638
+ break;
639
+
640
+ case kInstMatch:
641
+ // The empty string matches: no first byte.
642
+ return -1;
643
+
644
+ case kInstByteRange:
645
+ // Must match only a single byte
646
+ if (ip->lo() != ip->hi())
647
+ return -1;
648
+ if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
649
+ return -1;
650
+ // If we haven't seen any bytes yet, record it;
651
+ // otherwise must match the one we saw before.
652
+ if (b == -1)
653
+ b = ip->lo();
654
+ else if (b != ip->lo())
655
+ return -1;
656
+ break;
657
+
658
+ case kInstNop:
659
+ case kInstCapture:
660
+ case kInstEmptyWidth:
661
+ // Continue on.
662
+ // Ignore ip->empty() flags for kInstEmptyWidth
663
+ // in order to be as conservative as possible
664
+ // (assume all possible empty-width flags are true).
665
+ if (ip->out())
666
+ q.insert(ip->out());
667
+ break;
668
+
669
+ case kInstAlt:
670
+ case kInstAltMatch:
671
+ // Explore alternatives.
672
+ if (ip->out())
673
+ q.insert(ip->out());
674
+ if (ip->out1())
675
+ q.insert(ip->out1());
676
+ break;
677
+
678
+ case kInstFail:
679
+ break;
680
+ }
681
+ }
682
+ return b;
683
+ }
684
+
685
+ bool
686
+ Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
687
+ Anchor anchor, MatchKind kind,
688
+ StringPiece* match, int nmatch) {
689
+ if (NFA::Debug)
690
+ Dump();
691
+
692
+ NFA nfa(this);
693
+ StringPiece sp;
694
+ if (kind == kFullMatch) {
695
+ anchor = kAnchored;
696
+ if (nmatch == 0) {
697
+ match = &sp;
698
+ nmatch = 1;
699
+ }
700
+ }
701
+ if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
702
+ return false;
703
+ if (kind == kFullMatch && match[0].end() != text.end())
704
+ return false;
705
+ return true;
706
+ }
707
+
708
+ } // namespace re2
709
+