chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/re2.cc ADDED
@@ -0,0 +1,1180 @@
1
+ // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression interface RE2.
6
+ //
7
+ // Originally the PCRE C++ wrapper, but adapted to use
8
+ // the new automata-based regular expression engines.
9
+
10
+ #include "re2/re2.h"
11
+
12
+ #include <stdio.h>
13
+ #include <string>
14
+ #include <pthread.h>
15
+ #include <errno.h>
16
+ #include "util/util.h"
17
+ #include "util/flags.h"
18
+ #include "re2/prog.h"
19
+ #include "re2/regexp.h"
20
+
21
+ DEFINE_bool(trace_re2, false, "trace RE2 execution"); // Off by default; RE2::Match() checks FLAGS_trace_re2 and, when set, logs via LOG(INFO) which engine handled each search.
22
+
23
+ namespace re2 {
24
+
25
+ // Maximum number of args we can set without leaving the fixed-size vectors.
26
+ static const int kMaxArgs = 16;
27
+ static const int kVecSize = 1+kMaxArgs; // one extra slot: index 0 holds the overall match
28
+
29
+ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch; // Out-of-line definitions of RE2's static functor members; presumably each forwards to the *N function named in its last template argument (see variadic_function.h).
30
+ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
31
+ const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
32
+ const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
33
+
34
+ const int RE2::Options::kDefaultMaxMem; // out-of-line definition only; the value is initialized in re2.h
35
+
36
+ // Commonly-used option sets.  The constructor arguments are, in order:
37
+ //   encoding (UTF-8 or Latin-1 input)
38
+ //   posix syntax
39
+ //   longest match
40
+ //   log errors
41
+ const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true
42
+ const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);
43
+ const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
44
+ const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
45
+
46
+ // Sentinel: if a regular expression has no error, its error_ field points here, so "error_ == &empty_string" means "no error" and the destructor must not delete it.
47
+ static const string empty_string;
48
+
49
+ // Converts from Regexp error code to RE2 error code.
50
+ // Maybe some day they will diverge. In any event, this
51
+ // hides the existence of Regexp from RE2 users.
52
+ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
53
+ switch (code) {
54
+ case re2::kRegexpSuccess:
55
+ return RE2::NoError;
56
+ case re2::kRegexpInternalError:
57
+ return RE2::ErrorInternal;
58
+ case re2::kRegexpBadEscape:
59
+ return RE2::ErrorBadEscape;
60
+ case re2::kRegexpBadCharClass:
61
+ return RE2::ErrorBadCharClass;
62
+ case re2::kRegexpBadCharRange:
63
+ return RE2::ErrorBadCharRange;
64
+ case re2::kRegexpMissingBracket:
65
+ return RE2::ErrorMissingBracket;
66
+ case re2::kRegexpMissingParen:
67
+ return RE2::ErrorMissingParen;
68
+ case re2::kRegexpTrailingBackslash:
69
+ return RE2::ErrorTrailingBackslash;
70
+ case re2::kRegexpRepeatArgument:
71
+ return RE2::ErrorRepeatArgument;
72
+ case re2::kRegexpRepeatSize:
73
+ return RE2::ErrorRepeatSize;
74
+ case re2::kRegexpRepeatOp:
75
+ return RE2::ErrorRepeatOp;
76
+ case re2::kRegexpBadPerlOp:
77
+ return RE2::ErrorBadPerlOp;
78
+ case re2::kRegexpBadUTF8:
79
+ return RE2::ErrorBadUTF8;
80
+ case re2::kRegexpBadNamedCapture:
81
+ return RE2::ErrorBadNamedCapture;
82
+ }
83
+ return RE2::ErrorInternal;
84
+ }
85
+
86
+ static string trunc(const StringPiece& pattern) {
87
+ if (pattern.size() < 100)
88
+ return pattern.as_string();
89
+ return pattern.substr(0, 100).as_string() + "...";
90
+ }
91
+
92
+
93
+ RE2::RE2(const char* pattern) {
94
+ Init(pattern, DefaultOptions);
95
+ }
96
+
97
+ RE2::RE2(const string& pattern) {
98
+ Init(pattern, DefaultOptions);
99
+ }
100
+
101
+ RE2::RE2(const StringPiece& pattern) {
102
+ Init(pattern, DefaultOptions);
103
+ }
104
+
105
+ RE2::RE2(const StringPiece& pattern, const Options& options) {
106
+ Init(pattern, options);
107
+ }
108
+
109
+ int RE2::Options::ParseFlags() const {
110
+ int flags = Regexp::ClassNL;
111
+ switch (encoding()) {
112
+ default:
113
+ LOG(ERROR) << "Unknown encoding " << encoding();
114
+ break;
115
+ case RE2::Options::EncodingUTF8:
116
+ break;
117
+ case RE2::Options::EncodingLatin1:
118
+ flags |= Regexp::Latin1;
119
+ break;
120
+ }
121
+
122
+ if (!posix_syntax())
123
+ flags |= Regexp::LikePerl;
124
+
125
+ if (literal())
126
+ flags |= Regexp::Literal;
127
+
128
+ if (never_nl())
129
+ flags |= Regexp::NeverNL;
130
+
131
+ if (!case_sensitive())
132
+ flags |= Regexp::FoldCase;
133
+
134
+ if (perl_classes())
135
+ flags |= Regexp::PerlClasses;
136
+
137
+ if (word_boundary())
138
+ flags |= Regexp::PerlB;
139
+
140
+ if (one_line())
141
+ flags |= Regexp::OneLine;
142
+
143
+ return flags;
144
+ }
145
+
146
+ void RE2::Init(const StringPiece& pattern, const Options& options) {
147
+ mutex_ = new Mutex;
148
+ pattern_ = pattern.as_string();
149
+ options_.Copy(options);
150
+ error_ = &empty_string;
151
+ error_code_ = NoError;
152
+ suffix_regexp_ = NULL;
153
+ entire_regexp_ = NULL;
154
+ prog_ = NULL;
155
+ rprog_ = NULL;
156
+ named_groups_ = NULL;
157
+ group_names_ = NULL;
158
+ num_captures_ = -1;
159
+
160
+ RegexpStatus status;
161
+ entire_regexp_ = Regexp::Parse(
162
+ pattern_,
163
+ static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
164
+ &status);
165
+ if (entire_regexp_ == NULL) {
166
+ if (error_ == &empty_string)
167
+ error_ = new string(status.Text());
168
+ if (options_.log_errors()) {
169
+ LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
170
+ << status.Text();
171
+ }
172
+ error_arg_ = status.error_arg().as_string();
173
+ error_code_ = RegexpErrorToRE2(status.code());
174
+ return;
175
+ }
176
+
177
+ prefix_.clear();
178
+ prefix_foldcase_ = false;
179
+ re2::Regexp* suffix;
180
+ if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
181
+ suffix_regexp_ = suffix;
182
+ else
183
+ suffix_regexp_ = entire_regexp_->Incref();
184
+
185
+ // Two thirds of the memory goes to the forward Prog,
186
+ // one third to the reverse prog, because the forward
187
+ // Prog has two DFAs but the reverse prog has one.
188
+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
189
+ if (prog_ == NULL) {
190
+ if (options_.log_errors())
191
+ LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
192
+ error_ = new string("pattern too large - compile failed");
193
+ error_code_ = RE2::ErrorPatternTooLarge;
194
+ return;
195
+ }
196
+
197
+ // Could delay this until the first match call that
198
+ // cares about submatch information, but the one-pass
199
+ // machine's memory gets cut from the DFA memory budget,
200
+ // and that is harder to do if the DFA has already
201
+ // been built.
202
+ is_one_pass_ = prog_->IsOnePass();
203
+ }
204
+
205
+ // Returns rprog_, computing it if needed.
206
+ re2::Prog* RE2::ReverseProg() const {
207
+ MutexLock l(mutex_);
208
+ if (rprog_ == NULL && error_ == &empty_string) {
209
+ rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
210
+ if (rprog_ == NULL) {
211
+ if (options_.log_errors())
212
+ LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
213
+ error_ = new string("pattern too large - reverse compile failed");
214
+ error_code_ = RE2::ErrorPatternTooLarge;
215
+ return NULL;
216
+ }
217
+ }
218
+ return rprog_;
219
+ }
220
+
221
+ static const map<string, int> empty_named_groups; // shared empty result for NamedCapturingGroups(); never deleted by ~RE2
222
+ static const map<int, string> empty_group_names; // shared empty result for CapturingGroupNames(); never deleted by ~RE2
223
+
224
+ RE2::~RE2() {
225
+ if (suffix_regexp_)
226
+ suffix_regexp_->Decref();
227
+ if (entire_regexp_)
228
+ entire_regexp_->Decref();
229
+ delete mutex_;
230
+ delete prog_;
231
+ delete rprog_;
232
+ if (error_ != &empty_string)
233
+ delete error_;
234
+ if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
235
+ delete named_groups_;
236
+ if (group_names_ != NULL && group_names_ != &empty_group_names)
237
+ delete group_names_;
238
+ }
239
+
240
+ int RE2::ProgramSize() const {
241
+ if (prog_ == NULL)
242
+ return -1;
243
+ return prog_->size();
244
+ }
245
+
246
+ // Returns named_groups_, computing it if needed.
247
+ const map<string, int>& RE2::NamedCapturingGroups() const {
248
+ MutexLock l(mutex_);
249
+ if (!ok())
250
+ return empty_named_groups;
251
+ if (named_groups_ == NULL) {
252
+ named_groups_ = suffix_regexp_->NamedCaptures();
253
+ if (named_groups_ == NULL)
254
+ named_groups_ = &empty_named_groups;
255
+ }
256
+ return *named_groups_;
257
+ }
258
+
259
+ // Returns group_names_, computing it if needed.
260
+ const map<int, string>& RE2::CapturingGroupNames() const {
261
+ MutexLock l(mutex_);
262
+ if (!ok())
263
+ return empty_group_names;
264
+ if (group_names_ == NULL) {
265
+ group_names_ = suffix_regexp_->CaptureNames();
266
+ if (group_names_ == NULL)
267
+ group_names_ = &empty_group_names;
268
+ }
269
+ return *group_names_;
270
+ }
271
+
272
+ /***** Convenience interfaces *****/
273
+
274
+ bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
275
+ const Arg* const args[], int n) {
276
+ return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
277
+ }
278
+
279
+ bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
280
+ const Arg* const args[], int n) {
281
+ return re.DoMatch(text, UNANCHORED, NULL, args, n);
282
+ }
283
+
284
+ bool RE2::ConsumeN(StringPiece* input, const RE2& re,
285
+ const Arg* const args[], int n) {
286
+ int consumed;
287
+ if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
288
+ input->remove_prefix(consumed);
289
+ return true;
290
+ } else {
291
+ return false;
292
+ }
293
+ }
294
+
295
+ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
296
+ const Arg* const args[], int n) {
297
+ int consumed;
298
+ if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
299
+ input->remove_prefix(consumed);
300
+ return true;
301
+ } else {
302
+ return false;
303
+ }
304
+ }
305
+
306
+ // Returns the maximum submatch needed for the rewrite to be done by Replace().
307
+ // E.g. if rewrite == "foo \\2,\\1", returns 2.
308
+ static int MaxSubmatch(const StringPiece& rewrite) {
309
+ int max = 0;
310
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
311
+ s < end; s++) {
312
+ if (*s == '\\') {
313
+ s++;
314
+ int c = (s < end) ? *s : -1;
315
+ if (isdigit(c)) {
316
+ int n = (c - '0');
317
+ if (n > max)
318
+ max = n;
319
+ }
320
+ }
321
+ }
322
+ return max;
323
+ }
324
+
325
+ bool RE2::Replace(string *str,
326
+ const RE2& re,
327
+ const StringPiece& rewrite) {
328
+ StringPiece vec[kVecSize];
329
+ int nvec = 1 + MaxSubmatch(rewrite);
330
+ if (nvec > arraysize(vec))
331
+ return false;
332
+ if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
333
+ return false;
334
+
335
+ string s;
336
+ if (!re.Rewrite(&s, rewrite, vec, nvec))
337
+ return false;
338
+
339
+ assert(vec[0].begin() >= str->data());
340
+ assert(vec[0].end() <= str->data()+str->size());
341
+ str->replace(vec[0].data() - str->data(), vec[0].size(), s);
342
+ return true;
343
+ }
344
+
345
+ int RE2::GlobalReplace(string *str,
346
+ const RE2& re,
347
+ const StringPiece& rewrite) {
348
+ StringPiece vec[kVecSize];
349
+ int nvec = 1 + MaxSubmatch(rewrite);
350
+ if (nvec > arraysize(vec))
351
+ return false;
352
+
353
+ const char* p = str->data();
354
+ const char* ep = p + str->size();
355
+ const char* lastend = NULL;
356
+ string out;
357
+ int count = 0;
358
+ while (p <= ep) {
359
+ if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
360
+ break;
361
+ if (p < vec[0].begin())
362
+ out.append(p, vec[0].begin() - p);
363
+ if (vec[0].begin() == lastend && vec[0].size() == 0) {
364
+ // Disallow empty match at end of last match: skip ahead.
365
+ if (p < ep)
366
+ out.append(p, 1);
367
+ p++;
368
+ continue;
369
+ }
370
+ re.Rewrite(&out, rewrite, vec, nvec);
371
+ p = vec[0].end();
372
+ lastend = p;
373
+ count++;
374
+ }
375
+
376
+ if (count == 0)
377
+ return 0;
378
+
379
+ if (p < ep)
380
+ out.append(p, ep - p);
381
+ swap(out, *str);
382
+ return count;
383
+ }
384
+
385
+ bool RE2::Extract(const StringPiece &text,
386
+ const RE2& re,
387
+ const StringPiece &rewrite,
388
+ string *out) {
389
+ StringPiece vec[kVecSize];
390
+ int nvec = 1 + MaxSubmatch(rewrite);
391
+ if (nvec > arraysize(vec))
392
+ return false;
393
+
394
+ if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
395
+ return false;
396
+
397
+ out->clear();
398
+ return re.Rewrite(out, rewrite, vec, nvec);
399
+ }
400
+
401
+ string RE2::QuoteMeta(const StringPiece& unquoted) {
402
+ string result;
403
+ result.reserve(unquoted.size() << 1);
404
+
405
+ // Escape any ascii character not in [A-Za-z_0-9].
406
+ //
407
+ // Note that it's legal to escape a character even if it has no
408
+ // special meaning in a regular expression -- so this function does
409
+ // that. (This also makes it identical to the perl function of the
410
+ // same name except for the null-character special case;
411
+ // see `perldoc -f quotemeta`.)
412
+ for (int ii = 0; ii < unquoted.length(); ++ii) {
413
+ // Note that using 'isalnum' here raises the benchmark time from
414
+ // 32ns to 58ns:
415
+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
416
+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
417
+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&
418
+ unquoted[ii] != '_' &&
419
+ // If this is the part of a UTF8 or Latin1 character, we need
420
+ // to copy this byte without escaping. Experimentally this is
421
+ // what works correctly with the regexp library.
422
+ !(unquoted[ii] & 128)) {
423
+ if (unquoted[ii] == '\0') { // Special handling for null chars.
424
+ // Note that this special handling is not strictly required for RE2,
425
+ // but this quoting is required for other regexp libraries such as
426
+ // PCRE.
427
+ // Can't use "\\0" since the next character might be a digit.
428
+ result += "\\x00";
429
+ continue;
430
+ }
431
+ result += '\\';
432
+ }
433
+ result += unquoted[ii];
434
+ }
435
+
436
+ return result;
437
+ }
438
+
439
+ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
440
+ if (prog_ == NULL)
441
+ return false;
442
+
443
+ int n = prefix_.size();
444
+ if (n > maxlen)
445
+ n = maxlen;
446
+
447
+ // Determine initial min max from prefix_ literal.
448
+ string pmin, pmax;
449
+ pmin = prefix_.substr(0, n);
450
+ pmax = prefix_.substr(0, n);
451
+ if (prefix_foldcase_) {
452
+ // prefix is ASCII lowercase; change pmin to uppercase.
453
+ for (int i = 0; i < n; i++) {
454
+ if ('a' <= pmin[i] && pmin[i] <= 'z')
455
+ pmin[i] += 'A' - 'a';
456
+ }
457
+ }
458
+
459
+ // Add to prefix min max using PossibleMatchRange on regexp.
460
+ string dmin, dmax;
461
+ maxlen -= n;
462
+ if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
463
+ pmin += dmin;
464
+ pmax += dmax;
465
+ } else if (pmax.size() > 0) {
466
+ // prog_->PossibleMatchRange has failed us,
467
+ // but we still have useful information from prefix_.
468
+ // Round up pmax to allow any possible suffix.
469
+ pmax = PrefixSuccessor(pmax);
470
+ } else {
471
+ // Nothing useful.
472
+ *min = "";
473
+ *max = "";
474
+ return false;
475
+ }
476
+
477
+ *min = pmin;
478
+ *max = pmax;
479
+ return true;
480
+ }
481
+
482
// Locale-independent, ASCII-only case-insensitive comparison of the first
// "len" bytes of a and b, avoiding possible locale nonsense in the standard
// strcasecmp.  The string a is known to be all lowercase, so only bytes of b
// are folded ('A'..'Z' -> 'a'..'z').  Returns 0 when the ranges are equal,
// otherwise the difference of the first mismatching bytes (as unsigned
// values).
static int ascii_strcasecmp(const char* a, const char* b, int len) {
  for (int i = 0; i < len; i++) {
    const unsigned char lhs = a[i];
    unsigned char rhs = b[i];
    if (rhs >= 'A' && rhs <= 'Z')
      rhs += 'a' - 'A';
    if (lhs != rhs)
      return lhs - rhs;
  }
  return 0;
}
497
+
498
+
499
+ /***** Actual matching and rewriting code *****/
500
+
501
+ bool RE2::Match(const StringPiece& text,
502
+ int startpos,
503
+ int endpos,
504
+ Anchor re_anchor,
505
+ StringPiece* submatch,
506
+ int nsubmatch) const {
507
+ if (!ok() || suffix_regexp_ == NULL) {
508
+ if (options_.log_errors())
509
+ LOG(ERROR) << "Invalid RE2: " << *error_;
510
+ return false;
511
+ }
512
+
513
+ if (startpos < 0 || startpos > endpos || endpos > text.size()) {
514
+ LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
515
+ return false;
516
+ }
517
+
518
+ StringPiece subtext = text;
519
+ subtext.remove_prefix(startpos);
520
+ subtext.remove_suffix(text.size() - endpos);
521
+
522
+ // Use DFAs to find exact location of match, filter out non-matches.
523
+
524
+ // Don't ask for the location if we won't use it.
525
+ // SearchDFA can do extra optimizations in that case.
526
+ StringPiece match;
527
+ StringPiece* matchp = &match;
528
+ if (nsubmatch == 0)
529
+ matchp = NULL;
530
+
531
+ int ncap = 1 + NumberOfCapturingGroups();
532
+ if (ncap > nsubmatch)
533
+ ncap = nsubmatch;
534
+
535
+ // If the regexp is anchored explicitly, must not be in middle of text.
536
+ if (prog_->anchor_start() && startpos != 0)
537
+ return false;
538
+
539
+ // If the regexp is anchored explicitly, update re_anchor
540
+ // so that we can potentially fall into a faster case below.
541
+ if (prog_->anchor_start() && prog_->anchor_end())
542
+ re_anchor = ANCHOR_BOTH;
543
+ else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
544
+ re_anchor = ANCHOR_START;
545
+
546
+ // Check for the required prefix, if any.
547
+ int prefixlen = 0;
548
+ if (!prefix_.empty()) {
549
+ if (startpos != 0)
550
+ return false;
551
+ prefixlen = prefix_.size();
552
+ if (prefixlen > subtext.size())
553
+ return false;
554
+ if (prefix_foldcase_) {
555
+ if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
556
+ return false;
557
+ } else {
558
+ if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
559
+ return false;
560
+ }
561
+ subtext.remove_prefix(prefixlen);
562
+ // If there is a required prefix, the anchor must be at least ANCHOR_START.
563
+ if (re_anchor != ANCHOR_BOTH)
564
+ re_anchor = ANCHOR_START;
565
+ }
566
+
567
+ Prog::Anchor anchor = Prog::kUnanchored;
568
+ Prog::MatchKind kind = Prog::kFirstMatch;
569
+ if (options_.longest_match())
570
+ kind = Prog::kLongestMatch;
571
+ bool skipped_test = false;
572
+
573
+ bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
574
+
575
+ // SearchBitState allocates a bit vector of size prog_->size() * text.size().
576
+ // It also allocates a stack of 3-word structures which could potentially
577
+ // grow as large as prog_->size() * text.size() but in practice is much
578
+ // smaller.
579
+ // Conditions for using SearchBitState:
580
+ const int MaxBitStateProg = 500; // prog_->size() <= Max.
581
+ const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
582
+ bool can_bit_state = prog_->size() <= MaxBitStateProg;
583
+ int bit_state_text_max = MaxBitStateVector / prog_->size();
584
+
585
+ bool dfa_failed = false;
586
+ switch (re_anchor) {
587
+ default:
588
+ case UNANCHORED: {
589
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
590
+ matchp, &dfa_failed, NULL)) {
591
+ if (dfa_failed) {
592
+ // Fall back to NFA below.
593
+ skipped_test = true;
594
+ if (FLAGS_trace_re2)
595
+ LOG(INFO) << "Match " << trunc(pattern_)
596
+ << " [" << CEscape(subtext) << "]"
597
+ << " DFA failed.";
598
+ break;
599
+ }
600
+ if (FLAGS_trace_re2)
601
+ LOG(INFO) << "Match " << trunc(pattern_)
602
+ << " [" << CEscape(subtext) << "]"
603
+ << " used DFA - no match.";
604
+ return false;
605
+ }
606
+ if (FLAGS_trace_re2)
607
+ LOG(INFO) << "Match " << trunc(pattern_)
608
+ << " [" << CEscape(subtext) << "]"
609
+ << " used DFA - match";
610
+ if (matchp == NULL) // Matched. Don't care where
611
+ return true;
612
+ // SearchDFA set match[0].end() but didn't know where the
613
+ // match started. Run the regexp backward from match[0].end()
614
+ // to find the longest possible match -- that's where it started.
615
+ Prog* prog = ReverseProg();
616
+ if (prog == NULL)
617
+ return false;
618
+ if (!prog->SearchDFA(match, text, Prog::kAnchored,
619
+ Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
620
+ if (dfa_failed) {
621
+ // Fall back to NFA below.
622
+ skipped_test = true;
623
+ if (FLAGS_trace_re2)
624
+ LOG(INFO) << "Match " << trunc(pattern_)
625
+ << " [" << CEscape(subtext) << "]"
626
+ << " reverse DFA failed.";
627
+ break;
628
+ }
629
+ if (FLAGS_trace_re2)
630
+ LOG(INFO) << "Match " << trunc(pattern_)
631
+ << " [" << CEscape(subtext) << "]"
632
+ << " DFA inconsistency.";
633
+ LOG(ERROR) << "DFA inconsistency";
634
+ return false;
635
+ }
636
+ if (FLAGS_trace_re2)
637
+ LOG(INFO) << "Match " << trunc(pattern_)
638
+ << " [" << CEscape(subtext) << "]"
639
+ << " used reverse DFA.";
640
+ break;
641
+ }
642
+
643
+ case ANCHOR_BOTH:
644
+ case ANCHOR_START:
645
+ if (re_anchor == ANCHOR_BOTH)
646
+ kind = Prog::kFullMatch;
647
+ anchor = Prog::kAnchored;
648
+
649
+ // If only a small amount of text and need submatch
650
+ // information anyway and we're going to use OnePass or BitState
651
+ // to get it, we might as well not even bother with the DFA:
652
+ // OnePass or BitState will be fast enough.
653
+ // On tiny texts, OnePass outruns even the DFA, and
654
+ // it doesn't have the shared state and occasional mutex that
655
+ // the DFA does.
656
+ if (can_one_pass && text.size() <= 4096 &&
657
+ (ncap > 1 || text.size() <= 8)) {
658
+ if (FLAGS_trace_re2)
659
+ LOG(INFO) << "Match " << trunc(pattern_)
660
+ << " [" << CEscape(subtext) << "]"
661
+ << " skipping DFA for OnePass.";
662
+ skipped_test = true;
663
+ break;
664
+ }
665
+ if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
666
+ if (FLAGS_trace_re2)
667
+ LOG(INFO) << "Match " << trunc(pattern_)
668
+ << " [" << CEscape(subtext) << "]"
669
+ << " skipping DFA for BitState.";
670
+ skipped_test = true;
671
+ break;
672
+ }
673
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
674
+ &match, &dfa_failed, NULL)) {
675
+ if (dfa_failed) {
676
+ if (FLAGS_trace_re2)
677
+ LOG(INFO) << "Match " << trunc(pattern_)
678
+ << " [" << CEscape(subtext) << "]"
679
+ << " DFA failed.";
680
+ skipped_test = true;
681
+ break;
682
+ }
683
+ if (FLAGS_trace_re2)
684
+ LOG(INFO) << "Match " << trunc(pattern_)
685
+ << " [" << CEscape(subtext) << "]"
686
+ << " used DFA - no match.";
687
+ return false;
688
+ }
689
+ break;
690
+ }
691
+
692
+ if (!skipped_test && ncap <= 1) {
693
+ // We know exactly where it matches. That's enough.
694
+ if (ncap == 1)
695
+ submatch[0] = match;
696
+ } else {
697
+ StringPiece subtext1;
698
+ if (skipped_test) {
699
+ // DFA ran out of memory or was skipped:
700
+ // need to search in entire original text.
701
+ subtext1 = subtext;
702
+ } else {
703
+ // DFA found the exact match location:
704
+ // let NFA run an anchored, full match search
705
+ // to find submatch locations.
706
+ subtext1 = match;
707
+ anchor = Prog::kAnchored;
708
+ kind = Prog::kFullMatch;
709
+ }
710
+
711
+ if (can_one_pass && anchor != Prog::kUnanchored) {
712
+ if (FLAGS_trace_re2)
713
+ LOG(INFO) << "Match " << trunc(pattern_)
714
+ << " [" << CEscape(subtext) << "]"
715
+ << " using OnePass.";
716
+ if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
717
+ if (!skipped_test)
718
+ LOG(ERROR) << "SearchOnePass inconsistency";
719
+ return false;
720
+ }
721
+ } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
722
+ if (FLAGS_trace_re2)
723
+ LOG(INFO) << "Match " << trunc(pattern_)
724
+ << " [" << CEscape(subtext) << "]"
725
+ << " using BitState.";
726
+ if (!prog_->SearchBitState(subtext1, text, anchor,
727
+ kind, submatch, ncap)) {
728
+ if (!skipped_test)
729
+ LOG(ERROR) << "SearchBitState inconsistency";
730
+ return false;
731
+ }
732
+ } else {
733
+ if (FLAGS_trace_re2)
734
+ LOG(INFO) << "Match " << trunc(pattern_)
735
+ << " [" << CEscape(subtext) << "]"
736
+ << " using NFA.";
737
+ if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
738
+ if (!skipped_test)
739
+ LOG(ERROR) << "SearchNFA inconsistency";
740
+ return false;
741
+ }
742
+ }
743
+ }
744
+
745
+ // Adjust overall match for required prefix that we stripped off.
746
+ if (prefixlen > 0 && nsubmatch > 0)
747
+ submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
748
+ submatch[0].size() + prefixlen);
749
+
750
+ // Zero submatches that don't exist in the regexp.
751
+ for (int i = ncap; i < nsubmatch; i++)
752
+ submatch[i] = NULL;
753
+ return true;
754
+ }
755
+
756
+ // Internal matcher - like Match() but takes Args not StringPieces.
757
+ bool RE2::DoMatch(const StringPiece& text,
758
+ Anchor anchor,
759
+ int* consumed,
760
+ const Arg* const* args,
761
+ int n) const {
762
+ if (!ok()) {
763
+ if (options_.log_errors())
764
+ LOG(ERROR) << "Invalid RE2: " << *error_;
765
+ return false;
766
+ }
767
+
768
+ // Count number of capture groups needed.
769
+ int nvec;
770
+ if (n == 0 && consumed == NULL)
771
+ nvec = 0;
772
+ else
773
+ nvec = n+1;
774
+
775
+ StringPiece* vec;
776
+ StringPiece stkvec[kVecSize];
777
+ StringPiece* heapvec = NULL;
778
+
779
+ if (nvec <= arraysize(stkvec)) {
780
+ vec = stkvec;
781
+ } else {
782
+ vec = new StringPiece[nvec];
783
+ heapvec = vec;
784
+ }
785
+
786
+ if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
787
+ delete[] heapvec;
788
+ return false;
789
+ }
790
+
791
+ if(consumed != NULL)
792
+ *consumed = vec[0].end() - text.begin();
793
+
794
+ if (n == 0 || args == NULL) {
795
+ // We are not interested in results
796
+ delete[] heapvec;
797
+ return true;
798
+ }
799
+
800
+ int ncap = NumberOfCapturingGroups();
801
+ if (ncap < n) {
802
+ // RE has fewer capturing groups than number of arg pointers passed in
803
+ VLOG(1) << "Asked for " << n << " but only have " << ncap;
804
+ delete[] heapvec;
805
+ return false;
806
+ }
807
+
808
+ // If we got here, we must have matched the whole pattern.
809
+ for (int i = 0; i < n; i++) {
810
+ const StringPiece& s = vec[i+1];
811
+ if (!args[i]->Parse(s.data(), s.size())) {
812
+ // TODO: Should we indicate what the error was?
813
+ VLOG(1) << "Parse error on #" << i << " " << s << " "
814
+ << (void*)s.data() << "/" << s.size();
815
+ delete[] heapvec;
816
+ return false;
817
+ }
818
+ }
819
+
820
+ delete[] heapvec;
821
+ return true;
822
+ }
823
+
824
+ // Append the "rewrite" string, with backslash subsitutions from "vec",
825
+ // to string "out".
826
+ bool RE2::Rewrite(string *out, const StringPiece &rewrite,
827
+ const StringPiece *vec, int veclen) const {
828
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
829
+ s < end; s++) {
830
+ int c = *s;
831
+ if (c == '\\') {
832
+ s++;
833
+ c = (s < end) ? *s : -1;
834
+ if (isdigit(c)) {
835
+ int n = (c - '0');
836
+ if (n >= veclen) {
837
+ LOG(ERROR) << "requested group " << n
838
+ << " in regexp " << rewrite.data();
839
+ return false;
840
+ }
841
+ StringPiece snip = vec[n];
842
+ if (snip.size() > 0)
843
+ out->append(snip.data(), snip.size());
844
+ } else if (c == '\\') {
845
+ out->push_back('\\');
846
+ } else {
847
+ LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
848
+ return false;
849
+ }
850
+ } else {
851
+ out->push_back(c);
852
+ }
853
+ }
854
+ return true;
855
+ }
856
+
857
+ // Return the number of capturing subpatterns, or -1 if the
858
+ // regexp wasn't valid on construction.
859
+ int RE2::NumberOfCapturingGroups() const {
860
+ if (suffix_regexp_ == NULL)
861
+ return -1;
862
+ ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
863
+ " multiple threads end up doing the same work in parallel.");
864
+ if (num_captures_ == -1)
865
+ num_captures_ = suffix_regexp_->NumCaptures();
866
+ return num_captures_;
867
+ }
868
+
869
+ // Checks that the rewrite string is well-formed with respect to this
870
+ // regular expression.
871
+ bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
872
+ int max_token = -1;
873
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
874
+ s < end; s++) {
875
+ int c = *s;
876
+ if (c != '\\') {
877
+ continue;
878
+ }
879
+ if (++s == end) {
880
+ *error = "Rewrite schema error: '\\' not allowed at end.";
881
+ return false;
882
+ }
883
+ c = *s;
884
+ if (c == '\\') {
885
+ continue;
886
+ }
887
+ if (!isdigit(c)) {
888
+ *error = "Rewrite schema error: "
889
+ "'\\' must be followed by a digit or '\\'.";
890
+ return false;
891
+ }
892
+ int n = (c - '0');
893
+ if (max_token < n) {
894
+ max_token = n;
895
+ }
896
+ }
897
+
898
+ if (max_token > NumberOfCapturingGroups()) {
899
+ SStringPrintf(error, "Rewrite schema requests %d matches, "
900
+ "but the regexp only has %d parenthesized subexpressions.",
901
+ max_token, NumberOfCapturingGroups());
902
+ return false;
903
+ }
904
+ return true;
905
+ }
906
+
907
+ /***** Parsers for various types *****/
908
+
909
+ bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
910
+ // We fail if somebody asked us to store into a non-NULL void* pointer
911
+ return (dest == NULL);
912
+ }
913
+
914
+ bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
915
+ if (dest == NULL) return true;
916
+ reinterpret_cast<string*>(dest)->assign(str, n);
917
+ return true;
918
+ }
919
+
920
+ bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
921
+ if (dest == NULL) return true;
922
+ reinterpret_cast<StringPiece*>(dest)->set(str, n);
923
+ return true;
924
+ }
925
+
926
+ bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
927
+ if (n != 1) return false;
928
+ if (dest == NULL) return true;
929
+ *(reinterpret_cast<char*>(dest)) = str[0];
930
+ return true;
931
+ }
932
+
933
+ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
934
+ if (n != 1) return false;
935
+ if (dest == NULL) return true;
936
+ *(reinterpret_cast<unsigned char*>(dest)) = str[0];
937
+ return true;
938
+ }
939
+
940
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// REQUIRES "buf" must have length at least kMaxNumberLength+1
// Copies "str" into "buf" and null-terminates.
// Overwrites *np with the new length.
//
// Returns "buf" on success.  Returns "" (and leaves *np unchanged) when the
// input is empty, begins with whitespace, or is still longer than
// kMaxNumberLength after redundant leading zeros have been dropped.
static const char* TerminateNumber(char* buf, const char* str, int* np) {
  int n = *np;
  if (n <= 0) return "";
  // We are less forgiving than the strtoxxx() routines and do not
  // allow leading spaces.
  // isspace() is only defined for unsigned char values and EOF, so the
  // byte must be converted before the call; a plain char may be negative.
  if (isspace(static_cast<unsigned char>(*str))) {
    return "";
  }

  // Although buf has a fixed maximum size, we can still handle
  // arbitrarily large integers correctly by omitting leading zeros.
  // (Numbers that are still too long will be out of range.)
  // Before deciding whether str is too long,
  // remove leading zeros with s/000+/00/.
  // Leaving the leading two zeros in place means that
  // we don't change 0000x123 (invalid) into 0x123 (valid).
  // Skip over leading - before replacing.
  bool neg = false;
  if (str[0] == '-') {
    neg = true;
    n--;
    str++;
  }

  if (n >= 3 && str[0] == '0' && str[1] == '0') {
    while (n >= 3 && str[2] == '0') {
      n--;
      str++;
    }
  }

  if (neg) {  // make room in buf for -
    // Stepping back lands on the '-' itself or on a skipped '0'; either
    // way buf[0] is overwritten with '-' after the copy below.
    n++;
    str--;
  }

  if (n > kMaxNumberLength) return "";

  memmove(buf, str, n);
  if (neg) {
    buf[0] = '-';
  }
  buf[n] = '\0';
  *np = n;
  return buf;
}
992
+
993
+ bool RE2::Arg::parse_long_radix(const char* str,
994
+ int n,
995
+ void* dest,
996
+ int radix) {
997
+ if (n == 0) return false;
998
+ char buf[kMaxNumberLength+1];
999
+ str = TerminateNumber(buf, str, &n);
1000
+ char* end;
1001
+ errno = 0;
1002
+ long r = strtol(str, &end, radix);
1003
+ if (end != str + n) return false; // Leftover junk
1004
+ if (errno) return false;
1005
+ if (dest == NULL) return true;
1006
+ *(reinterpret_cast<long*>(dest)) = r;
1007
+ return true;
1008
+ }
1009
+
1010
+ bool RE2::Arg::parse_ulong_radix(const char* str,
1011
+ int n,
1012
+ void* dest,
1013
+ int radix) {
1014
+ if (n == 0) return false;
1015
+ char buf[kMaxNumberLength+1];
1016
+ str = TerminateNumber(buf, str, &n);
1017
+ if (str[0] == '-') {
1018
+ // strtoul() will silently accept negative numbers and parse
1019
+ // them. This module is more strict and treats them as errors.
1020
+ return false;
1021
+ }
1022
+
1023
+ char* end;
1024
+ errno = 0;
1025
+ unsigned long r = strtoul(str, &end, radix);
1026
+ if (end != str + n) return false; // Leftover junk
1027
+ if (errno) return false;
1028
+ if (dest == NULL) return true;
1029
+ *(reinterpret_cast<unsigned long*>(dest)) = r;
1030
+ return true;
1031
+ }
1032
+
1033
+ bool RE2::Arg::parse_short_radix(const char* str,
1034
+ int n,
1035
+ void* dest,
1036
+ int radix) {
1037
+ long r;
1038
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1039
+ if ((short)r != r) return false; // Out of range
1040
+ if (dest == NULL) return true;
1041
+ *(reinterpret_cast<short*>(dest)) = r;
1042
+ return true;
1043
+ }
1044
+
1045
+ bool RE2::Arg::parse_ushort_radix(const char* str,
1046
+ int n,
1047
+ void* dest,
1048
+ int radix) {
1049
+ unsigned long r;
1050
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1051
+ if ((ushort)r != r) return false; // Out of range
1052
+ if (dest == NULL) return true;
1053
+ *(reinterpret_cast<unsigned short*>(dest)) = r;
1054
+ return true;
1055
+ }
1056
+
1057
+ bool RE2::Arg::parse_int_radix(const char* str,
1058
+ int n,
1059
+ void* dest,
1060
+ int radix) {
1061
+ long r;
1062
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1063
+ if ((int)r != r) return false; // Out of range
1064
+ if (dest == NULL) return true;
1065
+ *(reinterpret_cast<int*>(dest)) = r;
1066
+ return true;
1067
+ }
1068
+
1069
+ bool RE2::Arg::parse_uint_radix(const char* str,
1070
+ int n,
1071
+ void* dest,
1072
+ int radix) {
1073
+ unsigned long r;
1074
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1075
+ if ((uint)r != r) return false; // Out of range
1076
+ if (dest == NULL) return true;
1077
+ *(reinterpret_cast<unsigned int*>(dest)) = r;
1078
+ return true;
1079
+ }
1080
+
1081
+ bool RE2::Arg::parse_longlong_radix(const char* str,
1082
+ int n,
1083
+ void* dest,
1084
+ int radix) {
1085
+ if (n == 0) return false;
1086
+ char buf[kMaxNumberLength+1];
1087
+ str = TerminateNumber(buf, str, &n);
1088
+ char* end;
1089
+ errno = 0;
1090
+ int64 r = strtoll(str, &end, radix);
1091
+ if (end != str + n) return false; // Leftover junk
1092
+ if (errno) return false;
1093
+ if (dest == NULL) return true;
1094
+ *(reinterpret_cast<int64*>(dest)) = r;
1095
+ return true;
1096
+ }
1097
+
1098
+ bool RE2::Arg::parse_ulonglong_radix(const char* str,
1099
+ int n,
1100
+ void* dest,
1101
+ int radix) {
1102
+ if (n == 0) return false;
1103
+ char buf[kMaxNumberLength+1];
1104
+ str = TerminateNumber(buf, str, &n);
1105
+ if (str[0] == '-') {
1106
+ // strtoull() will silently accept negative numbers and parse
1107
+ // them. This module is more strict and treats them as errors.
1108
+ return false;
1109
+ }
1110
+ char* end;
1111
+ errno = 0;
1112
+ uint64 r = strtoull(str, &end, radix);
1113
+ if (end != str + n) return false; // Leftover junk
1114
+ if (errno) return false;
1115
+ if (dest == NULL) return true;
1116
+ *(reinterpret_cast<uint64*>(dest)) = r;
1117
+ return true;
1118
+ }
1119
+
1120
// Shared implementation of the float and double parsers.
//
// Copies the n bytes at str into a NUL-terminated scratch buffer and
// converts them with strtof() (isfloat == true) or strtod().
// Returns false when n is not positive, when n is 200 or more, when the
// conversion does not consume every byte, or when it sets errno.
// On success the value is stored through dest as a float or a double
// according to isfloat; a NULL dest validates without storing.
static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
  // Reject negative lengths as well as empty input: a negative n would
  // pass the upper-bound check below and then be converted to a huge
  // size_t by memcpy().
  if (n <= 0) return false;
  static const int kMaxLength = 200;
  char buf[kMaxLength];
  if (n >= kMaxLength) return false;  // leave room for the terminator
  memcpy(buf, str, n);
  buf[n] = '\0';
  errno = 0;
  char* end;
  double r;
  if (isfloat) {
    r = strtof(buf, &end);
  } else {
    r = strtod(buf, &end);
  }
  if (end != buf + n) return false;   // Leftover junk
  if (errno) return false;
  if (dest == NULL) return true;
  if (isfloat) {
    *(reinterpret_cast<float*>(dest)) = r;
  } else {
    *(reinterpret_cast<double*>(dest)) = r;
  }
  return true;
}
1145
+
1146
+ bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1147
+ return parse_double_float(str, n, false, dest);
1148
+ }
1149
+
1150
+ bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1151
+ return parse_double_float(str, n, true, dest);
1152
+ }
1153
+
1154
+
1155
+ #define DEFINE_INTEGER_PARSERS(name) \
1156
+ bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
1157
+ return parse_##name##_radix(str, n, dest, 10); \
1158
+ } \
1159
+ bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
1160
+ return parse_##name##_radix(str, n, dest, 16); \
1161
+ } \
1162
+ bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
1163
+ return parse_##name##_radix(str, n, dest, 8); \
1164
+ } \
1165
+ bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1166
+ return parse_##name##_radix(str, n, dest, 0); \
1167
+ }
1168
+
1169
+ DEFINE_INTEGER_PARSERS(short);
1170
+ DEFINE_INTEGER_PARSERS(ushort);
1171
+ DEFINE_INTEGER_PARSERS(int);
1172
+ DEFINE_INTEGER_PARSERS(uint);
1173
+ DEFINE_INTEGER_PARSERS(long);
1174
+ DEFINE_INTEGER_PARSERS(ulong);
1175
+ DEFINE_INTEGER_PARSERS(longlong);
1176
+ DEFINE_INTEGER_PARSERS(ulonglong);
1177
+
1178
+ #undef DEFINE_INTEGER_PARSERS
1179
+
1180
+ } // namespace re2