chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,1138 @@
1
+ // Copyright 2007 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Compile regular expression to Prog.
6
+ //
7
+ // Prog and Inst are defined in prog.h.
8
+ // This file's external interface is just Regexp::CompileToProg.
9
+ // The Compiler class defined in this file is private.
10
+
11
+ #include "re2/prog.h"
12
+ #include "re2/re2.h"
13
+ #include "re2/regexp.h"
14
+ #include "re2/walker-inl.h"
15
+
16
+ namespace re2 {
17
+
18
+ // List of pointers to Inst* that need to be filled in (patched).
19
+ // Because the Inst* haven't been filled in yet,
20
+ // we can use the Inst* word to hold the list's "next" pointer.
21
+ // It's kind of sleazy, but it works well in practice.
22
+ // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23
+ //
24
+ // Because the out and out1 fields in Inst are no longer pointers,
25
+ // we can't use pointers directly here either. Instead, p refers
26
+ // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27
+ // p == 0 represents the NULL list. This is okay because instruction #0
28
+ // is always the fail instruction, which never appears on a list.
29
+
30
+ struct PatchList {
31
+ uint32 p;
32
+
33
+ // Returns patch list containing just p.
34
+ static PatchList Mk(uint32 p);
35
+
36
+ // Patches all the entries on l to have value v.
37
+ // Caller must not ever use patch list again.
38
+ static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
+
40
+ // Deref returns the next pointer pointed at by p.
41
+ static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
+
43
+ // Appends two patch lists and returns result.
44
+ static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45
+ };
46
+
47
+ static PatchList nullPatchList = { 0 };
48
+
49
+ // Returns patch list containing just p.
50
+ PatchList PatchList::Mk(uint32 p) {
51
+ PatchList l;
52
+ l.p = p;
53
+ return l;
54
+ }
55
+
56
+ // Returns the next pointer pointed at by l.
57
+ PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58
+ Prog::Inst* ip = &inst0[l.p>>1];
59
+ if (l.p&1)
60
+ l.p = ip->out1();
61
+ else
62
+ l.p = ip->out();
63
+ return l;
64
+ }
65
+
66
+ // Patches all the entries on l to have value v.
67
+ void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68
+ while (l.p != 0) {
69
+ Prog::Inst* ip = &inst0[l.p>>1];
70
+ if (l.p&1) {
71
+ l.p = ip->out1();
72
+ ip->out1_ = val;
73
+ } else {
74
+ l.p = ip->out();
75
+ ip->set_out(val);
76
+ }
77
+ }
78
+ }
79
+
80
+ // Appends two patch lists and returns result.
81
+ PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82
+ if (l1.p == 0)
83
+ return l2;
84
+ if (l2.p == 0)
85
+ return l1;
86
+
87
+ PatchList l = l1;
88
+ for (;;) {
89
+ PatchList next = PatchList::Deref(inst0, l);
90
+ if (next.p == 0)
91
+ break;
92
+ l = next;
93
+ }
94
+
95
+ Prog::Inst* ip = &inst0[l.p>>1];
96
+ if (l.p&1)
97
+ ip->out1_ = l2.p;
98
+ else
99
+ ip->set_out(l2.p);
100
+
101
+ return l1;
102
+ }
103
+
104
+ // Compiled program fragment.
105
+ struct Frag {
106
+ uint32 begin;
107
+ PatchList end;
108
+
109
+ Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
110
+ Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
111
+ };
112
+
113
+ static Frag kNullFrag;
114
+
115
+ // Input encodings.
116
+ enum Encoding {
117
+ kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
118
+ kEncodingLatin1, // Latin1 (0-FF)
119
+ };
120
+
121
+ class Compiler : public Regexp::Walker<Frag> {
122
+ public:
123
+ explicit Compiler();
124
+ ~Compiler();
125
+
126
+ // Compiles Regexp to a new Prog.
127
+ // Caller is responsible for deleting Prog when finished with it.
128
+ // If reversed is true, compiles for walking over the input
129
+ // string backward (reverses all concatenations).
130
+ static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
131
+
132
+ // Compiles alternation of all the re to a new Prog.
133
+ // Each re has a match with an id equal to its index in the vector.
134
+ static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
135
+ Regexp* re);
136
+
137
+ // Interface for Regexp::Walker, which helps traverse the Regexp.
138
+ // The walk is purely post-recursive: given the machines for the
139
+ // children, PostVisit combines them to create the machine for
140
+ // the current node. The child_args are Frags.
141
+ // The Compiler traverses the Regexp parse tree, visiting
142
+ // each node in depth-first order. It invokes PreVisit before
143
+ // visiting the node's children and PostVisit after visiting
144
+ // the children.
145
+ Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
146
+ Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
147
+ int nchild_args);
148
+ Frag ShortVisit(Regexp* re, Frag parent_arg);
149
+ Frag Copy(Frag arg);
150
+
151
+ // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
152
+ Frag Plus(Frag a, bool nongreedy);
153
+ Frag Star(Frag a, bool nongreedy);
154
+ Frag Quest(Frag a, bool nongreedy);
155
+
156
+ // Given fragment a, returns (a) capturing as \n.
157
+ Frag Capture(Frag a, int n);
158
+
159
+ // Given fragments a and b, returns ab; a|b
160
+ Frag Cat(Frag a, Frag b);
161
+ Frag Alt(Frag a, Frag b);
162
+
163
+ // Returns a fragment that can't match anything.
164
+ Frag NoMatch();
165
+
166
+ // Returns a fragment that matches the empty string.
167
+ Frag Match(int32 id);
168
+
169
+ // Returns a no-op fragment.
170
+ Frag Nop();
171
+
172
+ // Returns a fragment matching the byte range lo-hi.
173
+ Frag ByteRange(int lo, int hi, bool foldcase);
174
+
175
+ // Returns a fragment matching an empty-width special op.
176
+ Frag EmptyWidth(EmptyOp op);
177
+
178
+ // Adds n instructions to the program.
179
+ // Returns the index of the first one.
180
+ // Returns -1 if no more instructions are available.
181
+ int AllocInst(int n);
182
+
183
+ // Deletes unused instructions.
184
+ void Trim();
185
+
186
+ // Rune range compiler.
187
+
188
+ // Begins a new alternation.
189
+ void BeginRange();
190
+
191
+ // Adds a fragment matching the rune range lo-hi.
192
+ void AddRuneRange(Rune lo, Rune hi, bool foldcase);
193
+ void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
194
+ void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
195
+ void Add_80_10ffff();
196
+
197
+ // New suffix that matches the byte range lo-hi, then goes to next.
198
+ int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
199
+ int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
200
+
201
+ // Adds a suffix to alternation.
202
+ void AddSuffix(int id);
203
+
204
+ // Returns the alternation of all the added suffixes.
205
+ Frag EndRange();
206
+
207
+ // Single rune.
208
+ Frag Literal(Rune r, bool foldcase);
209
+
210
+ void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
211
+ Prog* Finish();
212
+
213
+ // Returns .* where dot = any byte
214
+ Frag DotStar();
215
+
216
+ private:
217
+ Prog* prog_; // Program being built.
218
+ bool failed_; // Did we give up compiling?
219
+ Encoding encoding_; // Input encoding
220
+ bool reversed_; // Should program run backward over text?
221
+
222
+ int max_inst_; // Maximum number of instructions.
223
+
224
+ Prog::Inst* inst_; // Pointer to first instruction.
225
+ int inst_len_; // Number of instructions used.
226
+ int inst_cap_; // Number of instructions allocated.
227
+
228
+ int64 max_mem_; // Total memory budget.
229
+
230
+ map<uint64, int> rune_cache_;
231
+ Frag rune_range_;
232
+
233
+ RE2::Anchor anchor_; // anchor mode for RE2::Set
234
+
235
+ DISALLOW_EVIL_CONSTRUCTORS(Compiler);
236
+ };
237
+
238
+ Compiler::Compiler() {
239
+ prog_ = new Prog();
240
+ failed_ = false;
241
+ encoding_ = kEncodingUTF8;
242
+ reversed_ = false;
243
+ inst_ = NULL;
244
+ inst_len_ = 0;
245
+ inst_cap_ = 0;
246
+ max_inst_ = 1; // make AllocInst for fail instruction okay
247
+ max_mem_ = 0;
248
+ int fail = AllocInst(1);
249
+ inst_[fail].InitFail();
250
+ max_inst_ = 0; // Caller must change
251
+ }
252
+
253
+ Compiler::~Compiler() {
254
+ delete prog_;
255
+ delete[] inst_;
256
+ }
257
+
258
+ int Compiler::AllocInst(int n) {
259
+ if (failed_ || inst_len_ + n > max_inst_) {
260
+ failed_ = true;
261
+ return -1;
262
+ }
263
+
264
+ if (inst_len_ + n > inst_cap_) {
265
+ if (inst_cap_ == 0)
266
+ inst_cap_ = 8;
267
+ while (inst_len_ + n > inst_cap_)
268
+ inst_cap_ *= 2;
269
+ Prog::Inst* ip = new Prog::Inst[inst_cap_];
270
+ memmove(ip, inst_, inst_len_ * sizeof ip[0]);
271
+ memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
272
+ delete[] inst_;
273
+ inst_ = ip;
274
+ }
275
+ int id = inst_len_;
276
+ inst_len_ += n;
277
+ return id;
278
+ }
279
+
280
+ void Compiler::Trim() {
281
+ if (inst_len_ < inst_cap_) {
282
+ Prog::Inst* ip = new Prog::Inst[inst_len_];
283
+ memmove(ip, inst_, inst_len_ * sizeof ip[0]);
284
+ delete[] inst_;
285
+ inst_ = ip;
286
+ inst_cap_ = inst_len_;
287
+ }
288
+ }
289
+
290
+ // These routines are somewhat hard to visualize in text --
291
+ // see http://swtch.com/~rsc/regexp/regexp1.html for
292
+ // pictures explaining what is going on here.
293
+
294
+ // Returns an unmatchable fragment.
295
+ Frag Compiler::NoMatch() {
296
+ return Frag(0, nullPatchList);
297
+ }
298
+
299
+ // Is a an unmatchable fragment?
300
+ static bool IsNoMatch(Frag a) {
301
+ return a.begin == 0;
302
+ }
303
+
304
+ // Given fragments a and b, returns fragment for ab.
305
+ Frag Compiler::Cat(Frag a, Frag b) {
306
+ if (IsNoMatch(a) || IsNoMatch(b))
307
+ return NoMatch();
308
+
309
+ // Elide no-op.
310
+ Prog::Inst* begin = &inst_[a.begin];
311
+ if (begin->opcode() == kInstNop &&
312
+ a.end.p == (a.begin << 1) &&
313
+ begin->out() == 0) {
314
+ PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere
315
+ return b;
316
+ }
317
+
318
+ // To run backward over string, reverse all concatenations.
319
+ if (reversed_) {
320
+ PatchList::Patch(inst_, b.end, a.begin);
321
+ return Frag(b.begin, a.end);
322
+ }
323
+
324
+ PatchList::Patch(inst_, a.end, b.begin);
325
+ return Frag(a.begin, b.end);
326
+ }
327
+
328
+ // Given fragments for a and b, returns fragment for a|b.
329
+ Frag Compiler::Alt(Frag a, Frag b) {
330
+ // Special case for convenience in loops.
331
+ if (IsNoMatch(a))
332
+ return b;
333
+ if (IsNoMatch(b))
334
+ return a;
335
+
336
+ int id = AllocInst(1);
337
+ if (id < 0)
338
+ return NoMatch();
339
+
340
+ inst_[id].InitAlt(a.begin, b.begin);
341
+ return Frag(id, PatchList::Append(inst_, a.end, b.end));
342
+ }
343
+
344
+ // When capturing submatches in like-Perl mode, a kOpAlt Inst
345
+ // treats out_ as the first choice, out1_ as the second.
346
+ //
347
+ // For *, +, and ?, if out_ causes another repetition,
348
+ // then the operator is greedy. If out1_ is the repetition
349
+ // (and out_ moves forward), then the operator is non-greedy.
350
+
351
+ // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
352
+ Frag Compiler::Star(Frag a, bool nongreedy) {
353
+ int id = AllocInst(1);
354
+ if (id < 0)
355
+ return NoMatch();
356
+ inst_[id].InitAlt(0, 0);
357
+ PatchList::Patch(inst_, a.end, id);
358
+ if (nongreedy) {
359
+ inst_[id].out1_ = a.begin;
360
+ return Frag(id, PatchList::Mk(id << 1));
361
+ } else {
362
+ inst_[id].set_out(a.begin);
363
+ return Frag(id, PatchList::Mk((id << 1) | 1));
364
+ }
365
+ }
366
+
367
+ // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
368
+ Frag Compiler::Plus(Frag a, bool nongreedy) {
369
+ // a+ is just a* with a different entry point.
370
+ Frag f = Star(a, nongreedy);
371
+ return Frag(a.begin, f.end);
372
+ }
373
+
374
+ // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
375
+ Frag Compiler::Quest(Frag a, bool nongreedy) {
376
+ int id = AllocInst(1);
377
+ if (id < 0)
378
+ return NoMatch();
379
+ PatchList pl;
380
+ if (nongreedy) {
381
+ inst_[id].InitAlt(0, a.begin);
382
+ pl = PatchList::Mk(id << 1);
383
+ } else {
384
+ inst_[id].InitAlt(a.begin, 0);
385
+ pl = PatchList::Mk((id << 1) | 1);
386
+ }
387
+ return Frag(id, PatchList::Append(inst_, pl, a.end));
388
+ }
389
+
390
+ // Returns a fragment for the byte range lo-hi.
391
+ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
392
+ int id = AllocInst(1);
393
+ if (id < 0)
394
+ return NoMatch();
395
+ inst_[id].InitByteRange(lo, hi, foldcase, 0);
396
+ prog_->byte_inst_count_++;
397
+ prog_->MarkByteRange(lo, hi);
398
+ if (foldcase && lo <= 'z' && hi >= 'a') {
399
+ if (lo < 'a')
400
+ lo = 'a';
401
+ if (hi > 'z')
402
+ hi = 'z';
403
+ if (lo <= hi)
404
+ prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
405
+ }
406
+ return Frag(id, PatchList::Mk(id << 1));
407
+ }
408
+
409
+ // Returns a no-op fragment. Sometimes unavoidable.
410
+ Frag Compiler::Nop() {
411
+ int id = AllocInst(1);
412
+ if (id < 0)
413
+ return NoMatch();
414
+ inst_[id].InitNop(0);
415
+ return Frag(id, PatchList::Mk(id << 1));
416
+ }
417
+
418
+ // Returns a fragment that signals a match.
419
+ Frag Compiler::Match(int32 match_id) {
420
+ int id = AllocInst(1);
421
+ if (id < 0)
422
+ return NoMatch();
423
+ inst_[id].InitMatch(match_id);
424
+ return Frag(id, nullPatchList);
425
+ }
426
+
427
+ // Returns a fragment matching a particular empty-width op (like ^ or $)
428
+ Frag Compiler::EmptyWidth(EmptyOp empty) {
429
+ int id = AllocInst(1);
430
+ if (id < 0)
431
+ return NoMatch();
432
+ inst_[id].InitEmptyWidth(empty, 0);
433
+ if (empty & (kEmptyBeginLine|kEmptyEndLine))
434
+ prog_->MarkByteRange('\n', '\n');
435
+ if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
436
+ int j;
437
+ for (int i = 0; i < 256; i = j) {
438
+ for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
439
+ ;
440
+ prog_->MarkByteRange(i, j-1);
441
+ }
442
+ }
443
+ return Frag(id, PatchList::Mk(id << 1));
444
+ }
445
+
446
+ // Given a fragment a, returns a fragment with capturing parens around a.
447
+ Frag Compiler::Capture(Frag a, int n) {
448
+ int id = AllocInst(2);
449
+ if (id < 0)
450
+ return NoMatch();
451
+ inst_[id].InitCapture(2*n, a.begin);
452
+ inst_[id+1].InitCapture(2*n+1, 0);
453
+ PatchList::Patch(inst_, a.end, id+1);
454
+
455
+ return Frag(id, PatchList::Mk((id+1) << 1));
456
+ }
457
+
458
+ // A Rune is a name for a Unicode code point.
459
+ // Returns maximum rune encoded by UTF-8 sequence of length len.
460
+ static int MaxRune(int len) {
461
+ int b; // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax)
462
+ if (len == 1)
463
+ b = 7;
464
+ else
465
+ b = 8-(len+1) + 6*(len-1);
466
+ return (1<<b) - 1; // maximum Rune for b bits.
467
+ }
468
+
469
+ // The rune range compiler caches common suffix fragments,
470
+ // which are very common in UTF-8 (e.g., [80-bf]).
471
+ // The fragment suffixes are identified by their start
472
+ // instructions. NULL denotes the eventual end match.
473
+ // The Frag accumulates in rune_range_. Caching common
474
+ // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
475
+ // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
476
+
477
+ void Compiler::BeginRange() {
478
+ rune_cache_.clear();
479
+ rune_range_.begin = 0;
480
+ rune_range_.end = nullPatchList;
481
+ }
482
+
483
+ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
484
+ int next) {
485
+ Frag f = ByteRange(lo, hi, foldcase);
486
+ if (next != 0) {
487
+ PatchList::Patch(inst_, f.end, next);
488
+ } else {
489
+ rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
490
+ }
491
+ return f.begin;
492
+ }
493
+
494
+ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
495
+ // In Latin1 mode, there's no point in caching.
496
+ // In forward UTF-8 mode, only need to cache continuation bytes.
497
+ if (encoding_ == kEncodingLatin1 ||
498
+ (encoding_ == kEncodingUTF8 &&
499
+ !reversed_ &&
500
+ !(0x80 <= lo && hi <= 0xbf))) {
501
+ return UncachedRuneByteSuffix(lo, hi, foldcase, next);
502
+ }
503
+
504
+ uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
505
+ map<uint64, int>::iterator it = rune_cache_.find(key);
506
+ if (it != rune_cache_.end())
507
+ return it->second;
508
+ int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
509
+ rune_cache_[key] = id;
510
+ return id;
511
+ }
512
+
513
+ void Compiler::AddSuffix(int id) {
514
+ if (rune_range_.begin == 0) {
515
+ rune_range_.begin = id;
516
+ return;
517
+ }
518
+
519
+ int alt = AllocInst(1);
520
+ if (alt < 0) {
521
+ rune_range_.begin = 0;
522
+ return;
523
+ }
524
+ inst_[alt].InitAlt(rune_range_.begin, id);
525
+ rune_range_.begin = alt;
526
+ }
527
+
528
+ Frag Compiler::EndRange() {
529
+ return rune_range_;
530
+ }
531
+
532
+ // Converts rune range lo-hi into a fragment that recognizes
533
+ // the bytes that would make up those runes in the current
534
+ // encoding (Latin 1 or UTF-8).
535
+ // This lets the machine work byte-by-byte even when
536
+ // using multibyte encodings.
537
+
538
+ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
539
+ switch (encoding_) {
540
+ default:
541
+ case kEncodingUTF8:
542
+ AddRuneRangeUTF8(lo, hi, foldcase);
543
+ break;
544
+ case kEncodingLatin1:
545
+ AddRuneRangeLatin1(lo, hi, foldcase);
546
+ break;
547
+ }
548
+ }
549
+
550
+ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
551
+ // Latin1 is easy: runes *are* bytes.
552
+ if (lo > hi || lo > 0xFF)
553
+ return;
554
+ if (hi > 0xFF)
555
+ hi = 0xFF;
556
+ AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
557
+ }
558
+
559
+ // Table describing how to make a UTF-8 matching machine
560
+ // for the rune range 80-10FFFF (Runeself-Runemax).
561
+ // This range happens frequently enough (for example /./ and /[^a-z]/)
562
+ // and the rune_cache_ map is slow enough that this is worth
563
+ // special handling. Makes compilation of a small expression
564
+ // with a dot in it about 10% faster.
565
+ // The * in the comments below mark whole sequences.
566
+ static struct ByteRangeProg {
567
+ int next;
568
+ int lo;
569
+ int hi;
570
+ } prog_80_10ffff[] = {
571
+ // Two-byte
572
+ { -1, 0x80, 0xBF, }, // 0: 80-BF
573
+ { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
574
+
575
+ // Three-byte
576
+ { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
577
+ { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
578
+ { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
579
+ { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
580
+
581
+ // Four-byte
582
+ { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
583
+ { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
584
+ { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
585
+ { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
586
+ { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
587
+ { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
588
+ };
589
+
590
+ void Compiler::Add_80_10ffff() {
591
+ int inst[arraysize(prog_80_10ffff)];
592
+ for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
593
+ const ByteRangeProg& p = prog_80_10ffff[i];
594
+ int next = 0;
595
+ if (p.next >= 0)
596
+ next = inst[p.next];
597
+ inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
598
+ if ((p.lo & 0xC0) != 0x80)
599
+ AddSuffix(inst[i]);
600
+ }
601
+ }
602
+
603
+ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
604
+ if (lo > hi)
605
+ return;
606
+
607
+ // Pick off 80-10FFFF as a common special case
608
+ // that can bypass the slow rune_cache_.
609
+ if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
610
+ Add_80_10ffff();
611
+ return;
612
+ }
613
+
614
+ // Split range into same-length sized ranges.
615
+ for (int i = 1; i < UTFmax; i++) {
616
+ Rune max = MaxRune(i);
617
+ if (lo <= max && max < hi) {
618
+ AddRuneRangeUTF8(lo, max, foldcase);
619
+ AddRuneRangeUTF8(max+1, hi, foldcase);
620
+ return;
621
+ }
622
+ }
623
+
624
+ // ASCII range is always a special case.
625
+ if (hi < Runeself) {
626
+ AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
627
+ return;
628
+ }
629
+
630
+ // Split range into sections that agree on leading bytes.
631
+ for (int i = 1; i < UTFmax; i++) {
632
+ uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
633
+ if ((lo & ~m) != (hi & ~m)) {
634
+ if ((lo & m) != 0) {
635
+ AddRuneRangeUTF8(lo, lo|m, foldcase);
636
+ AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
637
+ return;
638
+ }
639
+ if ((hi & m) != m) {
640
+ AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
641
+ AddRuneRangeUTF8(hi&~m, hi, foldcase);
642
+ return;
643
+ }
644
+ }
645
+ }
646
+
647
+ // Finally. Generate byte matching equivalent for lo-hi.
648
+ uint8 ulo[UTFmax], uhi[UTFmax];
649
+ int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
650
+ int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
651
+ (void)m; // USED(m)
652
+ DCHECK_EQ(n, m);
653
+
654
+ int id = 0;
655
+ if (reversed_) {
656
+ for (int i = 0; i < n; i++)
657
+ id = RuneByteSuffix(ulo[i], uhi[i], false, id);
658
+ } else {
659
+ for (int i = n-1; i >= 0; i--)
660
+ id = RuneByteSuffix(ulo[i], uhi[i], false, id);
661
+ }
662
+ AddSuffix(id);
663
+ }
664
+
665
+ // Should not be called.
666
+ Frag Compiler::Copy(Frag arg) {
667
+ // We're using WalkExponential; there should be no copying.
668
+ LOG(DFATAL) << "Compiler::Copy called!";
669
+ failed_ = true;
670
+ return NoMatch();
671
+ }
672
+
673
+ // Visits a node quickly; called once WalkExponential has
674
+ // decided to cut this walk short.
675
+ Frag Compiler::ShortVisit(Regexp* re, Frag) {
676
+ failed_ = true;
677
+ return NoMatch();
678
+ }
679
+
680
+ // Called before traversing a node's children during the walk.
681
+ Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
682
+ // Cut off walk if we've already failed.
683
+ if (failed_)
684
+ *stop = true;
685
+
686
+ return kNullFrag; // not used by caller
687
+ }
688
+
689
+ Frag Compiler::Literal(Rune r, bool foldcase) {
690
+ switch (encoding_) {
691
+ default:
692
+ return kNullFrag;
693
+
694
+ case kEncodingLatin1:
695
+ return ByteRange(r, r, foldcase);
696
+
697
+ case kEncodingUTF8: {
698
+ if (r < Runeself) // Make common case fast.
699
+ return ByteRange(r, r, foldcase);
700
+ uint8 buf[UTFmax];
701
+ int n = runetochar(reinterpret_cast<char*>(buf), &r);
702
+ Frag f = ByteRange((uint8)buf[0], buf[0], false);
703
+ for (int i = 1; i < n; i++)
704
+ f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
705
+ return f;
706
+ }
707
+ }
708
+ }
709
+
710
+ // Called after traversing the node's children during the walk.
711
+ // Given their frags, build and return the frag for this re.
712
+ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
713
+ int nchild_frags) {
714
+ // If a child failed, don't bother going forward, especially
715
+ // since the child_frags might contain Frags with NULLs in them.
716
+ if (failed_)
717
+ return NoMatch();
718
+
719
+ // Given the child fragments, return the fragment for this node.
720
+ switch (re->op()) {
721
+ case kRegexpRepeat:
722
+ // Should not see; code at bottom of function will print error
723
+ break;
724
+
725
+ case kRegexpNoMatch:
726
+ return NoMatch();
727
+
728
+ case kRegexpEmptyMatch:
729
+ return Nop();
730
+
731
+ case kRegexpHaveMatch: {
732
+ Frag f = Match(re->match_id());
733
+ // Remember unanchored match to end of string.
734
+ if (anchor_ != RE2::ANCHOR_BOTH)
735
+ f = Cat(DotStar(), f);
736
+ return f;
737
+ }
738
+
739
+ case kRegexpConcat: {
740
+ Frag f = child_frags[0];
741
+ for (int i = 1; i < nchild_frags; i++)
742
+ f = Cat(f, child_frags[i]);
743
+ return f;
744
+ }
745
+
746
+ case kRegexpAlternate: {
747
+ Frag f = child_frags[0];
748
+ for (int i = 1; i < nchild_frags; i++)
749
+ f = Alt(f, child_frags[i]);
750
+ return f;
751
+ }
752
+
753
+ case kRegexpStar:
754
+ return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
755
+
756
+ case kRegexpPlus:
757
+ return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
758
+
759
+ case kRegexpQuest:
760
+ return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
761
+
762
+ case kRegexpLiteral:
763
+ return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
764
+
765
+ case kRegexpLiteralString: {
766
+ // Concatenation of literals.
767
+ if (re->nrunes() == 0)
768
+ return Nop();
769
+ Frag f;
770
+ for (int i = 0; i < re->nrunes(); i++) {
771
+ Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
772
+ if (i == 0)
773
+ f = f1;
774
+ else
775
+ f = Cat(f, f1);
776
+ }
777
+ return f;
778
+ }
779
+
780
+ case kRegexpAnyChar:
781
+ BeginRange();
782
+ AddRuneRange(0, Runemax, false);
783
+ return EndRange();
784
+
785
+ case kRegexpAnyByte:
786
+ return ByteRange(0x00, 0xFF, false);
787
+
788
+ case kRegexpCharClass: {
789
+ CharClass* cc = re->cc();
790
+ if (cc->empty()) {
791
+ // This can't happen.
792
+ LOG(DFATAL) << "No ranges in char class";
793
+ failed_ = true;
794
+ return NoMatch();
795
+ }
796
+
797
+ // ASCII case-folding optimization: if the char class
798
+ // behaves the same on A-Z as it does on a-z,
799
+ // discard any ranges wholly contained in A-Z
800
+ // and mark the other ranges as foldascii.
801
+ // This reduces the size of a program for
802
+ // (?i)abc from 3 insts per letter to 1 per letter.
803
+ bool foldascii = cc->FoldsASCII();
804
+
805
+ // Character class is just a big OR of the different
806
+ // character ranges in the class.
807
+ BeginRange();
808
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
809
+ // ASCII case-folding optimization (see above).
810
+ if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
811
+ continue;
812
+
813
+ // If this range contains all of A-Za-z or none of it,
814
+ // the fold flag is unnecessary; don't bother.
815
+ bool fold = foldascii;
816
+ if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
817
+ fold = false;
818
+
819
+ AddRuneRange(i->lo, i->hi, fold);
820
+ }
821
+ return EndRange();
822
+ }
823
+
824
+ case kRegexpCapture:
825
+ // If this is a non-capturing parenthesis -- (?:foo) --
826
+ // just use the inner expression.
827
+ if (re->cap() < 0)
828
+ return child_frags[0];
829
+ return Capture(child_frags[0], re->cap());
830
+
831
+ case kRegexpBeginLine:
832
+ return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
833
+
834
+ case kRegexpEndLine:
835
+ return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
836
+
837
+ case kRegexpBeginText:
838
+ return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
839
+
840
+ case kRegexpEndText:
841
+ return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
842
+
843
+ case kRegexpWordBoundary:
844
+ return EmptyWidth(kEmptyWordBoundary);
845
+
846
+ case kRegexpNoWordBoundary:
847
+ return EmptyWidth(kEmptyNonWordBoundary);
848
+ }
849
+ LOG(DFATAL) << "Missing case in Compiler: " << re->op();
850
+ failed_ = true;
851
+ return NoMatch();
852
+ }
853
+
854
+ // Is this regexp required to start at the beginning of the text?
855
+ // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
856
+ // but handles (\A(a|b)). Could use the Walker to write a more exact one.
857
+ static bool IsAnchorStart(Regexp** pre, int depth) {
858
+ Regexp* re = *pre;
859
+ Regexp* sub;
860
+ // The depth limit makes sure that we don't overflow
861
+ // the stack on a deeply nested regexp. As the comment
862
+ // above says, IsAnchorStart is conservative, so returning
863
+ // a false negative is okay. The exact limit is somewhat arbitrary.
864
+ if (re == NULL || depth >= 4)
865
+ return false;
866
+ switch (re->op()) {
867
+ default:
868
+ break;
869
+ case kRegexpConcat:
870
+ if (re->nsub() > 0) {
871
+ sub = re->sub()[0]->Incref();
872
+ if (IsAnchorStart(&sub, depth+1)) {
873
+ Regexp** subcopy = new Regexp*[re->nsub()];
874
+ subcopy[0] = sub; // already have reference
875
+ for (int i = 1; i < re->nsub(); i++)
876
+ subcopy[i] = re->sub()[i]->Incref();
877
+ *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
878
+ delete[] subcopy;
879
+ re->Decref();
880
+ return true;
881
+ }
882
+ sub->Decref();
883
+ }
884
+ break;
885
+ case kRegexpCapture:
886
+ sub = re->sub()[0]->Incref();
887
+ if (IsAnchorStart(&sub, depth+1)) {
888
+ *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
889
+ re->Decref();
890
+ return true;
891
+ }
892
+ sub->Decref();
893
+ break;
894
+ case kRegexpBeginText:
895
+ *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
896
+ re->Decref();
897
+ return true;
898
+ }
899
+ return false;
900
+ }
901
+
902
+ // Is this regexp required to start at the end of the text?
903
+ // Only approximate; can return false for complicated regexps like (a\z|b\z),
904
+ // but handles ((a|b)\z). Could use the Walker to write a more exact one.
905
+ static bool IsAnchorEnd(Regexp** pre, int depth) {
906
+ Regexp* re = *pre;
907
+ Regexp* sub;
908
+ // The depth limit makes sure that we don't overflow
909
+ // the stack on a deeply nested regexp. As the comment
910
+ // above says, IsAnchorEnd is conservative, so returning
911
+ // a false negative is okay. The exact limit is somewhat arbitrary.
912
+ if (re == NULL || depth >= 4)
913
+ return false;
914
+ switch (re->op()) {
915
+ default:
916
+ break;
917
+ case kRegexpConcat:
918
+ if (re->nsub() > 0) {
919
+ sub = re->sub()[re->nsub() - 1]->Incref();
920
+ if (IsAnchorEnd(&sub, depth+1)) {
921
+ Regexp** subcopy = new Regexp*[re->nsub()];
922
+ subcopy[re->nsub() - 1] = sub; // already have reference
923
+ for (int i = 0; i < re->nsub() - 1; i++)
924
+ subcopy[i] = re->sub()[i]->Incref();
925
+ *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
926
+ delete[] subcopy;
927
+ re->Decref();
928
+ return true;
929
+ }
930
+ sub->Decref();
931
+ }
932
+ break;
933
+ case kRegexpCapture:
934
+ sub = re->sub()[0]->Incref();
935
+ if (IsAnchorEnd(&sub, depth+1)) {
936
+ *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
937
+ re->Decref();
938
+ return true;
939
+ }
940
+ sub->Decref();
941
+ break;
942
+ case kRegexpEndText:
943
+ *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
944
+ re->Decref();
945
+ return true;
946
+ }
947
+ return false;
948
+ }
949
+
950
+ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
951
+ RE2::Anchor anchor) {
952
+ prog_->set_flags(flags);
953
+
954
+ if (flags & Regexp::Latin1)
955
+ encoding_ = kEncodingLatin1;
956
+ max_mem_ = max_mem;
957
+ if (max_mem <= 0) {
958
+ max_inst_ = 100000; // more than enough
959
+ } else if (max_mem <= sizeof(Prog)) {
960
+ // No room for anything.
961
+ max_inst_ = 0;
962
+ } else {
963
+ int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
964
+ // Limit instruction count so that inst->id() fits nicely in an int.
965
+ // SparseArray also assumes that the indices (inst->id()) are ints.
966
+ // The call to WalkExponential uses 2*max_inst_ below,
967
+ // and other places in the code use 2 or 3 * prog->size().
968
+ // Limiting to 2^24 should avoid overflow in those places.
969
+ // (The point of allowing more than 32 bits of memory is to
970
+ // have plenty of room for the DFA states, not to use it up
971
+ // on the program.)
972
+ if (m >= 1<<24)
973
+ m = 1<<24;
974
+
975
+ // Inst imposes its own limit (currently bigger than 2^24 but be safe).
976
+ if (m > Prog::Inst::kMaxInst)
977
+ m = Prog::Inst::kMaxInst;
978
+
979
+ max_inst_ = m;
980
+ }
981
+
982
+ anchor_ = anchor;
983
+ }
984
+
985
+ // Compiles re, returning program.
986
+ // Caller is responsible for deleting prog_.
987
+ // If reversed is true, compiles a program that expects
988
+ // to run over the input string backward (reverses all concatenations).
989
+ // The reversed flag is also recorded in the returned program.
990
+ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
991
+ Compiler c;
992
+
993
+ c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
994
+ c.reversed_ = reversed;
995
+
996
+ // Simplify to remove things like counted repetitions
997
+ // and character classes like \d.
998
+ Regexp* sre = re->Simplify();
999
+ if (sre == NULL)
1000
+ return NULL;
1001
+
1002
+ // Record whether prog is anchored, removing the anchors.
1003
+ // (They get in the way of other optimizations.)
1004
+ bool is_anchor_start = IsAnchorStart(&sre, 0);
1005
+ bool is_anchor_end = IsAnchorEnd(&sre, 0);
1006
+
1007
+ // Generate fragment for entire regexp.
1008
+ Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
1009
+ sre->Decref();
1010
+ if (c.failed_)
1011
+ return NULL;
1012
+
1013
+ // Success! Finish by putting Match node at end, and record start.
1014
+ // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1015
+ // to behave normally.
1016
+ c.reversed_ = false;
1017
+ Frag all = c.Cat(f, c.Match(0));
1018
+ c.prog_->set_start(all.begin);
1019
+
1020
+ if (reversed) {
1021
+ c.prog_->set_anchor_start(is_anchor_end);
1022
+ c.prog_->set_anchor_end(is_anchor_start);
1023
+ } else {
1024
+ c.prog_->set_anchor_start(is_anchor_start);
1025
+ c.prog_->set_anchor_end(is_anchor_end);
1026
+ }
1027
+
1028
+ // Also create unanchored version, which starts with a .*? loop.
1029
+ if (c.prog_->anchor_start()) {
1030
+ c.prog_->set_start_unanchored(c.prog_->start());
1031
+ } else {
1032
+ Frag unanchored = c.Cat(c.DotStar(), all);
1033
+ c.prog_->set_start_unanchored(unanchored.begin);
1034
+ }
1035
+
1036
+ c.prog_->set_reversed(reversed);
1037
+
1038
+ // Hand ownership of prog_ to caller.
1039
+ return c.Finish();
1040
+ }
1041
+
1042
+ Prog* Compiler::Finish() {
1043
+ if (failed_)
1044
+ return NULL;
1045
+
1046
+ if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1047
+ // No possible matches; keep Fail instruction only.
1048
+ inst_len_ = 1;
1049
+ }
1050
+
1051
+ // Trim instruction to minimum array and transfer to Prog.
1052
+ Trim();
1053
+ prog_->inst_ = inst_;
1054
+ prog_->size_ = inst_len_;
1055
+ inst_ = NULL;
1056
+
1057
+ // Compute byte map.
1058
+ prog_->ComputeByteMap();
1059
+
1060
+ prog_->Optimize();
1061
+
1062
+ // Record remaining memory for DFA.
1063
+ if (max_mem_ <= 0) {
1064
+ prog_->set_dfa_mem(1<<20);
1065
+ } else {
1066
+ int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1067
+ if (m < 0)
1068
+ m = 0;
1069
+ prog_->set_dfa_mem(m);
1070
+ }
1071
+
1072
+ Prog* p = prog_;
1073
+ prog_ = NULL;
1074
+ return p;
1075
+ }
1076
+
1077
+ // Converts Regexp to Prog.
1078
+ Prog* Regexp::CompileToProg(int64 max_mem) {
1079
+ return Compiler::Compile(this, false, max_mem);
1080
+ }
1081
+
1082
+ Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1083
+ return Compiler::Compile(this, true, max_mem);
1084
+ }
1085
+
1086
+ Frag Compiler::DotStar() {
1087
+ return Star(ByteRange(0x00, 0xff, false), true);
1088
+ }
1089
+
1090
+ // Compiles RE set to Prog.
1091
+ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1092
+ Regexp* re) {
1093
+ Compiler c;
1094
+
1095
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1096
+ c.Setup(pf, options.max_mem(), anchor);
1097
+
1098
+ // Compile alternation of fragments.
1099
+ Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1100
+ re->Decref();
1101
+ if (c.failed_)
1102
+ return NULL;
1103
+
1104
+ if (anchor == RE2::UNANCHORED) {
1105
+ // The trailing .* was added while handling kRegexpHaveMatch.
1106
+ // We just have to add the leading one.
1107
+ all = c.Cat(c.DotStar(), all);
1108
+ }
1109
+
1110
+ c.prog_->set_start(all.begin);
1111
+ c.prog_->set_start_unanchored(all.begin);
1112
+ c.prog_->set_anchor_start(true);
1113
+ c.prog_->set_anchor_end(true);
1114
+
1115
+ Prog* prog = c.Finish();
1116
+ if (prog == NULL)
1117
+ return NULL;
1118
+
1119
+ // Make sure DFA has enough memory to operate,
1120
+ // since we're not going to fall back to the NFA.
1121
+ bool failed;
1122
+ StringPiece sp = "hello, world";
1123
+ prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1124
+ NULL, &failed, NULL);
1125
+ if (failed) {
1126
+ delete prog;
1127
+ return NULL;
1128
+ }
1129
+
1130
+ return prog;
1131
+ }
1132
+
1133
+ Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1134
+ Regexp* re) {
1135
+ return Compiler::CompileSet(options, anchor, re);
1136
+ }
1137
+
1138
+ } // namespace re2