chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,393 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Rewrite POSIX and other features in re
6
+ // to use simple extended regular expression features.
7
+ // Also sort and simplify character classes.
8
+
9
+ #include "util/util.h"
10
+ #include "re2/regexp.h"
11
+ #include "re2/walker-inl.h"
12
+
13
+ namespace re2 {
14
+
15
+ // Parses the regexp src and then simplifies it and sets *dst to the
16
+ // string representation of the simplified form. Returns true on success.
17
+ // Returns false and sets *error (if error != NULL) on error.
18
+ bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
19
+ string* dst,
20
+ RegexpStatus* status) {
21
+ Regexp* re = Parse(src, flags, status);
22
+ if (re == NULL)
23
+ return false;
24
+ Regexp* sre = re->Simplify();
25
+ re->Decref();
26
+ if (sre == NULL) {
27
+ // Should not happen, since Simplify never fails.
28
+ LOG(ERROR) << "Simplify failed on " << src;
29
+ if (status) {
30
+ status->set_code(kRegexpInternalError);
31
+ status->set_error_arg(src);
32
+ }
33
+ return false;
34
+ }
35
+ *dst = sre->ToString();
36
+ sre->Decref();
37
+ return true;
38
+ }
39
+
40
+ // Assuming the simple_ flags on the children are accurate,
41
+ // is this Regexp* simple?
42
+ bool Regexp::ComputeSimple() {
43
+ Regexp** subs;
44
+ switch (op_) {
45
+ case kRegexpNoMatch:
46
+ case kRegexpEmptyMatch:
47
+ case kRegexpLiteral:
48
+ case kRegexpLiteralString:
49
+ case kRegexpBeginLine:
50
+ case kRegexpEndLine:
51
+ case kRegexpBeginText:
52
+ case kRegexpWordBoundary:
53
+ case kRegexpNoWordBoundary:
54
+ case kRegexpEndText:
55
+ case kRegexpAnyChar:
56
+ case kRegexpAnyByte:
57
+ case kRegexpHaveMatch:
58
+ return true;
59
+ case kRegexpConcat:
60
+ case kRegexpAlternate:
61
+ // These are simple as long as the subpieces are simple.
62
+ subs = sub();
63
+ for (int i = 0; i < nsub_; i++)
64
+ if (!subs[i]->simple_)
65
+ return false;
66
+ return true;
67
+ case kRegexpCharClass:
68
+ // Simple as long as the char class is not empty, not full.
69
+ if (ccb_ != NULL)
70
+ return !ccb_->empty() && !ccb_->full();
71
+ return !cc_->empty() && !cc_->full();
72
+ case kRegexpCapture:
73
+ subs = sub();
74
+ return subs[0]->simple_;
75
+ case kRegexpStar:
76
+ case kRegexpPlus:
77
+ case kRegexpQuest:
78
+ subs = sub();
79
+ if (!subs[0]->simple_)
80
+ return false;
81
+ switch (subs[0]->op_) {
82
+ case kRegexpStar:
83
+ case kRegexpPlus:
84
+ case kRegexpQuest:
85
+ case kRegexpEmptyMatch:
86
+ case kRegexpNoMatch:
87
+ return false;
88
+ default:
89
+ break;
90
+ }
91
+ return true;
92
+ case kRegexpRepeat:
93
+ return false;
94
+ }
95
+ LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
96
+ return false;
97
+ }
98
+
99
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
 public:
  SimplifyWalker() {}

  // If re is already marked simple, sets *stop and returns a new reference
  // to re; otherwise returns NULL and leaves *stop untouched.
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);

  // Builds the simplified form of re from its simplified children
  // (child_args). Returns a Regexp* the caller must Decref.
  virtual Regexp* PostVisit(Regexp* re,
                            Regexp* parent_arg,
                            Regexp* pre_arg,
                            Regexp** child_args, int nchild_args);

  // Returns re->Incref().
  virtual Regexp* Copy(Regexp* re);

  // Not expected to be called; logs DFATAL and returns re->Incref().
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

 private:
  // These functions are declared inside SimplifyWalker so that
  // they can edit the private fields of the Regexps they construct.

  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
  // Caller must Decref return value when done with it.
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

  // Simplifies the expression re{min,max} in terms of *, +, and ?.
  // max == -1 means there is no upper bound (re{min,}).
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
  // Caller must Decref return value when done with it.
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
                                Regexp::ParseFlags parse_flags);

  // Simplifies a character class: an empty class becomes kRegexpNoMatch,
  // a full class becomes kRegexpAnyChar, and any other class is returned
  // unchanged with a new reference.
  // Does not edit re.  Does not consume ref to re.
  // Caller must Decref return value when done with it.
  static Regexp* SimplifyCharClass(Regexp* re);

  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
};
135
+
136
+ // Simplifies a regular expression, returning a new regexp.
137
+ // The new regexp uses traditional Unix egrep features only,
138
+ // plus the Perl (?:) non-capturing parentheses.
139
+ // Otherwise, no POSIX or Perl additions. The new regexp
140
+ // captures exactly the same subexpressions (with the same indices)
141
+ // as the original.
142
+ // Does not edit current object.
143
+ // Caller must Decref() return value when done with it.
144
+
145
+ Regexp* Regexp::Simplify() {
146
+ if (simple_)
147
+ return Incref();
148
+ SimplifyWalker w;
149
+ return w.Walk(this, NULL);
150
+ }
151
+
152
+ #define Simplify DontCallSimplify // Avoid accidental recursion
153
+
154
// Returns the result of re->Incref(); no new Regexp node is constructed.
// The caller is responsible for the matching Decref.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  return re->Incref();
}
157
+
158
// Fallback visit that is not expected to run during simplification.
// Logs at DFATAL severity and returns re->Incref() so the caller still
// receives a usable Regexp*. parent_arg is unused.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  // This should never be called, since we use Walk and not
  // WalkExponential.
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  return re->Incref();
}
164
+
165
+ Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
166
+ if (re->simple_) {
167
+ *stop = true;
168
+ return re->Incref();
169
+ }
170
+ return NULL;
171
+ }
172
+
173
// Builds the simplified form of re once its children have been simplified.
//
//   re          - the node being simplified.
//   parent_arg  - unused here.
//   pre_arg     - unused here.
//   child_args  - the simplified children, one per sub-expression of re.
//                 Each entry is either released with Decref() or stored
//                 in / returned as the result.
//   nchild_args - unused here; the loops below iterate over re->nsub_.
//
// Returns a Regexp* that the caller must Decref. When no child changed,
// the result is re itself (with a new reference) and re->simple_ is set.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      // All these are always simple.
      re->simple_ = true;
      return re->Incref();

    case kRegexpConcat:
    case kRegexpAlternate: {
      // These are simple as long as the subpieces are simple.
      // Two passes to avoid allocation in the common case.
      // Pass 1: detect whether any child was replaced by a different node.
      bool changed = false;
      Regexp** subs = re->sub();
      for (int i = 0; i < re->nsub_; i++) {
        Regexp* sub = subs[i];
        Regexp* newsub = child_args[i];
        if (newsub != sub) {
          changed = true;
          break;
        }
      }
      if (!changed) {
        // Nothing changed: drop the child references and reuse re itself.
        for (int i = 0; i < re->nsub_; i++) {
          Regexp* newsub = child_args[i];
          newsub->Decref();
        }
        re->simple_ = true;
        return re->Incref();
      }
      // Pass 2: build a new node of the same op; the child_args pointers
      // are stored directly in it (no extra Incref/Decref).
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
      nre->AllocSub(re->nsub_);
      Regexp** nre_subs = nre->sub();
      for (int i = 0; i <re->nsub_; i++)
        nre_subs[i] = child_args[i];
      nre->simple_ = true;
      return nre;
    }

    case kRegexpCapture: {
      Regexp* newsub = child_args[0];
      if (newsub == re->sub()[0]) {
        // Child unchanged: release it and reuse re.
        newsub->Decref();
        re->simple_ = true;
        return re->Incref();
      }
      // Child changed: wrap it in a new capture with the same index.
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
      nre->AllocSub(1);
      nre->sub()[0] = newsub;
      nre->cap_ = re->cap_;
      nre->simple_ = true;
      return nre;
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* newsub = child_args[0];
      // Special case: repeat the empty string as much as
      // you want, but it's still the empty string.
      if (newsub->op() == kRegexpEmptyMatch)
        return newsub;

      // These are simple as long as the subpiece is simple.
      if (newsub == re->sub()[0]) {
        newsub->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // These are also idempotent if flags are constant.
      // (Same operator applied twice with the same flags: return the
      // child as is.)
      if (re->op() == newsub->op() &&
          re->parse_flags() == newsub->parse_flags())
        return newsub;

      Regexp* nre = new Regexp(re->op(), re->parse_flags());
      nre->AllocSub(1);
      nre->sub()[0] = newsub;
      nre->simple_ = true;
      return nre;
    }

    case kRegexpRepeat: {
      Regexp* newsub = child_args[0];
      // Special case: repeat the empty string as much as
      // you want, but it's still the empty string.
      if (newsub->op() == kRegexpEmptyMatch)
        return newsub;

      // SimplifyRepeat does not consume newsub, so it is released here
      // after the expansion has been built.
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
                                   re->parse_flags());
      newsub->Decref();
      nre->simple_ = true;
      return nre;
    }

    case kRegexpCharClass: {
      Regexp* nre = SimplifyCharClass(re);
      nre->simple_ = true;
      return nre;
    }
  }

  // Reached only if re->op() matched none of the cases above.
  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
294
+
295
+ // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
296
+ // Returns a new Regexp, handing the ref to the caller.
297
+ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
298
+ Regexp::ParseFlags parse_flags) {
299
+ Regexp* re = new Regexp(kRegexpConcat, parse_flags);
300
+ re->AllocSub(2);
301
+ Regexp** subs = re->sub();
302
+ subs[0] = re1;
303
+ subs[1] = re2;
304
+ return re;
305
+ }
306
+
307
+ // Simplifies the expression re{min,max} in terms of *, +, and ?.
308
+ // Returns a new regexp. Does not edit re. Does not consume reference to re.
309
+ // Caller must Decref return value when done with it.
310
+ // The result will *not* necessarily have the right capturing parens
311
+ // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
312
+ // but in the Regexp* representation, both (x) are marked as $1.
313
+ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
314
+ Regexp::ParseFlags f) {
315
+ // x{n,} means at least n matches of x.
316
+ if (max == -1) {
317
+ // Special case: x{0,} is x*
318
+ if (min == 0)
319
+ return Regexp::Star(re->Incref(), f);
320
+
321
+ // Special case: x{1,} is x+
322
+ if (min == 1)
323
+ return Regexp::Plus(re->Incref(), f);
324
+
325
+ // General case: x{4,} is xxxx+
326
+ Regexp* nre = new Regexp(kRegexpConcat, f);
327
+ nre->AllocSub(min);
328
+ VLOG(1) << "Simplify " << min;
329
+ Regexp** nre_subs = nre->sub();
330
+ for (int i = 0; i < min-1; i++)
331
+ nre_subs[i] = re->Incref();
332
+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
333
+ return nre;
334
+ }
335
+
336
+ // Special case: (x){0} matches only empty string.
337
+ if (min == 0 && max == 0)
338
+ return new Regexp(kRegexpEmptyMatch, f);
339
+
340
+ // Special case: x{1} is just x.
341
+ if (min == 1 && max == 1)
342
+ return re->Incref();
343
+
344
+ // General case: x{n,m} means n copies of x and m copies of x?.
345
+ // The machine will do less work if we nest the final m copies,
346
+ // so that x{2,5} = xx(x(x(x)?)?)?
347
+
348
+ // Build leading prefix: xx. Capturing only on the last one.
349
+ Regexp* nre = NULL;
350
+ if (min > 0) {
351
+ nre = new Regexp(kRegexpConcat, f);
352
+ nre->AllocSub(min);
353
+ Regexp** nre_subs = nre->sub();
354
+ for (int i = 0; i < min; i++)
355
+ nre_subs[i] = re->Incref();
356
+ }
357
+
358
+ // Build and attach suffix: (x(x(x)?)?)?
359
+ if (max > min) {
360
+ Regexp* suf = Regexp::Quest(re->Incref(), f);
361
+ for (int i = min+1; i < max; i++)
362
+ suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
363
+ if (nre == NULL)
364
+ nre = suf;
365
+ else
366
+ nre = Concat2(nre, suf, f);
367
+ }
368
+
369
+ if (nre == NULL) {
370
+ // Some degenerate case, like min > max, or min < max < 0.
371
+ // This shouldn't happen, because the parser rejects such regexps.
372
+ LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
373
+ return new Regexp(kRegexpNoMatch, f);
374
+ }
375
+
376
+ return nre;
377
+ }
378
+
379
+ // Simplifies a character class.
380
+ // Caller must Decref return value when done with it.
381
+ Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
382
+ CharClass* cc = re->cc();
383
+
384
+ // Special cases
385
+ if (cc->empty())
386
+ return new Regexp(kRegexpNoMatch, re->parse_flags());
387
+ if (cc->full())
388
+ return new Regexp(kRegexpAnyChar, re->parse_flags());
389
+
390
+ return re->Incref();
391
+ }
392
+
393
+ } // namespace re2
@@ -0,0 +1,87 @@
1
+ // Copyright 2004 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include "re2/stringpiece.h"
6
+ #include "util/util.h"
7
+
8
+ using re2::StringPiece;
9
+
10
// Writes the bytes of piece to the stream o and returns o.
// Uses ostream::write with an explicit length rather than relying on a
// terminating NUL in the data.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  o.write(piece.data(), piece.size());
  return o;
}
14
+
15
+ bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
16
+ int len = x.size();
17
+ if (len != y.size()) {
18
+ return false;
19
+ }
20
+ const char* p = x.data();
21
+ const char* p2 = y.data();
22
+ // Test last byte in case strings share large common prefix
23
+ if ((len > 0) && (p[len-1] != p2[len-1])) return false;
24
+ const char* p_limit = p + len;
25
+ for (; p < p_limit; p++, p2++) {
26
+ if (*p != *p2)
27
+ return false;
28
+ }
29
+ return true;
30
+ }
31
+
32
// Replaces the contents of *target with a copy of this piece's bytes
// (length_ bytes starting at ptr_).
void StringPiece::CopyToString(string* target) const {
  target->assign(ptr_, length_);
}
35
+
36
+ int StringPiece::copy(char* buf, size_type n, size_type pos) const {
37
+ int ret = min(length_ - pos, n);
38
+ memcpy(buf, ptr_ + pos, ret);
39
+ return ret;
40
+ }
41
+
42
+ int StringPiece::find(const StringPiece& s, size_type pos) const {
43
+ if (length_ < 0 || pos > static_cast<size_type>(length_))
44
+ return npos;
45
+
46
+ const char* result = std::search(ptr_ + pos, ptr_ + length_,
47
+ s.ptr_, s.ptr_ + s.length_);
48
+ const size_type xpos = result - ptr_;
49
+ return xpos + s.length_ <= length_ ? xpos : npos;
50
+ }
51
+
52
+ int StringPiece::find(char c, size_type pos) const {
53
+ if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
54
+ return npos;
55
+ }
56
+ const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
57
+ return result != ptr_ + length_ ? result - ptr_ : npos;
58
+ }
59
+
60
+ int StringPiece::rfind(const StringPiece& s, size_type pos) const {
61
+ if (length_ < s.length_) return npos;
62
+ const size_t ulen = length_;
63
+ if (s.length_ == 0) return min(ulen, pos);
64
+
65
+ const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
66
+ const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
67
+ return result != last ? result - ptr_ : npos;
68
+ }
69
+
70
+ int StringPiece::rfind(char c, size_type pos) const {
71
+ if (length_ <= 0) return npos;
72
+ for (int i = min(pos, static_cast<size_type>(length_ - 1));
73
+ i >= 0; --i) {
74
+ if (ptr_[i] == c) {
75
+ return i;
76
+ }
77
+ }
78
+ return npos;
79
+ }
80
+
81
+ StringPiece StringPiece::substr(size_type pos, size_type n) const {
82
+ if (pos > length_) pos = length_;
83
+ if (n > length_ - pos) n = length_ - pos;
84
+ return StringPiece(ptr_ + pos, n);
85
+ }
86
+
87
+ const StringPiece::size_type StringPiece::npos = size_type(-1);
@@ -0,0 +1,182 @@
1
+ // Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // A string-like object that points to a sized piece of memory.
6
+ //
7
+ // Functions or methods may use const StringPiece& parameters to accept either
8
+ // a "const char*" or a "string" value that will be implicitly converted to
9
+ // a StringPiece. The implicit conversion means that it is often appropriate
10
+ // to include this .h file in other files rather than forward-declaring
11
+ // StringPiece as would be appropriate for most other Google classes.
12
+ //
13
+ // Systematic usage of StringPiece is encouraged as it will reduce unnecessary
14
+ // conversions from "const char*" to "string" and back again.
15
+ //
16
+ //
17
+ // Arghh! I wish C++ literals were "string".
18
+
19
+ #ifndef STRINGS_STRINGPIECE_H__
20
+ #define STRINGS_STRINGPIECE_H__
21
+
22
+ #include <string.h>
23
+ #include <cstddef>
24
+ #include <iosfwd>
25
+ #include <string>
26
+
27
+ namespace re2 {
28
+
29
// A non-owning view of a run of bytes: a pointer plus an int length.
// The bytes are not copied and need not be NUL-terminated.
//
// NOTE(review): this class uses std::min and std::reverse_iterator, but the
// header only includes <string.h>, <cstddef>, <iosfwd> and <string>;
// presumably <algorithm>/<iterator> arrive transitively — confirm.
class StringPiece {
 private:
  const char*   ptr_;     // start of the viewed bytes (may be NULL)
  int           length_;  // number of bytes viewed

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  // A NULL str yields an empty piece; otherwise the length is strlen(str).
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets to the empty piece (NULL pointer, zero length).
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points the piece at len bytes starting at data.
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  // Re-points the piece at a C string; NULL gives length 0.
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  // Same as set(const char*, int) for untyped memory.
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked element access.
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes from the view. No bounds check is performed.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view. No bounds check is performed.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way comparison: memcmp over the common prefix, then the shorter
  // piece orders first. Returns <0, 0 or >0.
  int compare(const StringPiece& x) const {
    int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  // NOTE(review): no definition of AppendToString appears in the
  // stringpiece.cc that accompanies this header — confirm it is defined
  // elsewhere before calling it.
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  // Copies up to n bytes starting at pos into buf; returns the count.
  int copy(char* buf, size_type n, size_type pos = 0) const;

  // Search functions. They are declared to return int while npos is a
  // size_type, so a "not found" result reaches the caller as npos
  // converted to int.
  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  // Returns the sub-piece [pos, pos + n), clamped to the end of the piece.
  StringPiece substr(size_type pos, size_type n = npos) const;

  // Byte-wise equality; used by operator==.
  static bool _equal(const StringPiece&, const StringPiece&);
};
150
+
151
// Equality: same length and same bytes (delegates to StringPiece::_equal).
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  return StringPiece::_equal(x, y);
}

inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  return !(x == y);
}

// Ordering: memcmp over the common prefix; if the prefixes are equal,
// the shorter piece orders first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int r = memcmp(x.data(), y.data(),
                       std::min(x.size(), y.size()));
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}

// The remaining comparisons are all expressed in terms of operator<.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  return y < x;
}

inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  return !(x > y);
}

inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  return !(x < y);
}
176
+
177
+ } // namespace re2
178
+
179
+ // allow StringPiece to be logged
180
+ extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
181
+
182
+ #endif // STRINGS_STRINGPIECE_H__