chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/re2.cc ADDED
@@ -0,0 +1,1180 @@
1
+ // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Regular expression interface RE2.
6
+ //
7
+ // Originally the PCRE C++ wrapper, but adapted to use
8
+ // the new automata-based regular expression engines.
9
+
10
+ #include "re2/re2.h"
11
+
12
+ #include <stdio.h>
13
+ #include <string>
14
+ #include <pthread.h>
15
+ #include <errno.h>
16
+ #include "util/util.h"
17
+ #include "util/flags.h"
18
+ #include "re2/prog.h"
19
+ #include "re2/regexp.h"
20
+
21
// Defines the boolean flag trace_re2 (default false). When FLAGS_trace_re2 is
// set, RE2::Match logs which matching engine handled each call.
+ DEFINE_bool(trace_re2, false, "trace RE2 execution");
22
+
23
+ namespace re2 {
24
+
25
+ // Maximum number of args we can set
26
+ static const int kMaxArgs = 16;
27
+ static const int kVecSize = 1+kMaxArgs; // kMaxArgs groups plus one slot for the overall match (index 0)
28
+
29
+ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch; // static function object parameterized on FullMatchN
30
+ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch; // static function object parameterized on PartialMatchN
31
+ const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume; // static function object parameterized on ConsumeN
32
+ const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume; // static function object parameterized on FindAndConsumeN
33
+
34
+ const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
35
+
36
+ // Commonly-used option sets; arguments to constructor are:
37
+ // utf8 input
38
+ // posix syntax
39
+ // longest match
40
+ // log errors
41
+ const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true
42
+ const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true); // same as the default set but with Latin-1 encoding
43
+ const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true); // posix syntax and longest match enabled
44
+ const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false); // same as the default set but with error logging off
45
+
46
+ // If a regular expression has no error, its error_ field points here
47
+ static const string empty_string; // compared by address; the destructor never deletes it
48
+
49
+ // Converts from Regexp error code to RE2 error code.
50
+ // Maybe some day they will diverge. In any event, this
51
+ // hides the existence of Regexp from RE2 users.
52
+ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
53
+ switch (code) {
54
+ case re2::kRegexpSuccess:
55
+ return RE2::NoError;
56
+ case re2::kRegexpInternalError:
57
+ return RE2::ErrorInternal;
58
+ case re2::kRegexpBadEscape:
59
+ return RE2::ErrorBadEscape;
60
+ case re2::kRegexpBadCharClass:
61
+ return RE2::ErrorBadCharClass;
62
+ case re2::kRegexpBadCharRange:
63
+ return RE2::ErrorBadCharRange;
64
+ case re2::kRegexpMissingBracket:
65
+ return RE2::ErrorMissingBracket;
66
+ case re2::kRegexpMissingParen:
67
+ return RE2::ErrorMissingParen;
68
+ case re2::kRegexpTrailingBackslash:
69
+ return RE2::ErrorTrailingBackslash;
70
+ case re2::kRegexpRepeatArgument:
71
+ return RE2::ErrorRepeatArgument;
72
+ case re2::kRegexpRepeatSize:
73
+ return RE2::ErrorRepeatSize;
74
+ case re2::kRegexpRepeatOp:
75
+ return RE2::ErrorRepeatOp;
76
+ case re2::kRegexpBadPerlOp:
77
+ return RE2::ErrorBadPerlOp;
78
+ case re2::kRegexpBadUTF8:
79
+ return RE2::ErrorBadUTF8;
80
+ case re2::kRegexpBadNamedCapture:
81
+ return RE2::ErrorBadNamedCapture;
82
+ }
83
+ return RE2::ErrorInternal;
84
+ }
85
+
86
+ static string trunc(const StringPiece& pattern) {
87
+ if (pattern.size() < 100)
88
+ return pattern.as_string();
89
+ return pattern.substr(0, 100).as_string() + "...";
90
+ }
91
+
92
+
93
+ RE2::RE2(const char* pattern) {
94
+ Init(pattern, DefaultOptions);
95
+ }
96
+
97
+ RE2::RE2(const string& pattern) {
98
+ Init(pattern, DefaultOptions);
99
+ }
100
+
101
+ RE2::RE2(const StringPiece& pattern) {
102
+ Init(pattern, DefaultOptions);
103
+ }
104
+
105
+ RE2::RE2(const StringPiece& pattern, const Options& options) {
106
+ Init(pattern, options);
107
+ }
108
+
109
+ int RE2::Options::ParseFlags() const {
110
+ int flags = Regexp::ClassNL;
111
+ switch (encoding()) {
112
+ default:
113
+ LOG(ERROR) << "Unknown encoding " << encoding();
114
+ break;
115
+ case RE2::Options::EncodingUTF8:
116
+ break;
117
+ case RE2::Options::EncodingLatin1:
118
+ flags |= Regexp::Latin1;
119
+ break;
120
+ }
121
+
122
+ if (!posix_syntax())
123
+ flags |= Regexp::LikePerl;
124
+
125
+ if (literal())
126
+ flags |= Regexp::Literal;
127
+
128
+ if (never_nl())
129
+ flags |= Regexp::NeverNL;
130
+
131
+ if (!case_sensitive())
132
+ flags |= Regexp::FoldCase;
133
+
134
+ if (perl_classes())
135
+ flags |= Regexp::PerlClasses;
136
+
137
+ if (word_boundary())
138
+ flags |= Regexp::PerlB;
139
+
140
+ if (one_line())
141
+ flags |= Regexp::OneLine;
142
+
143
+ return flags;
144
+ }
145
+
146
+ void RE2::Init(const StringPiece& pattern, const Options& options) {
147
+ mutex_ = new Mutex;
148
+ pattern_ = pattern.as_string();
149
+ options_.Copy(options);
150
+ error_ = &empty_string;
151
+ error_code_ = NoError;
152
+ suffix_regexp_ = NULL;
153
+ entire_regexp_ = NULL;
154
+ prog_ = NULL;
155
+ rprog_ = NULL;
156
+ named_groups_ = NULL;
157
+ group_names_ = NULL;
158
+ num_captures_ = -1;
159
+
160
+ RegexpStatus status;
161
+ entire_regexp_ = Regexp::Parse(
162
+ pattern_,
163
+ static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
164
+ &status);
165
+ if (entire_regexp_ == NULL) {
166
+ if (error_ == &empty_string)
167
+ error_ = new string(status.Text());
168
+ if (options_.log_errors()) {
169
+ LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
170
+ << status.Text();
171
+ }
172
+ error_arg_ = status.error_arg().as_string();
173
+ error_code_ = RegexpErrorToRE2(status.code());
174
+ return;
175
+ }
176
+
177
+ prefix_.clear();
178
+ prefix_foldcase_ = false;
179
+ re2::Regexp* suffix;
180
+ if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
181
+ suffix_regexp_ = suffix;
182
+ else
183
+ suffix_regexp_ = entire_regexp_->Incref();
184
+
185
+ // Two thirds of the memory goes to the forward Prog,
186
+ // one third to the reverse prog, because the forward
187
+ // Prog has two DFAs but the reverse prog has one.
188
+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
189
+ if (prog_ == NULL) {
190
+ if (options_.log_errors())
191
+ LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
192
+ error_ = new string("pattern too large - compile failed");
193
+ error_code_ = RE2::ErrorPatternTooLarge;
194
+ return;
195
+ }
196
+
197
+ // Could delay this until the first match call that
198
+ // cares about submatch information, but the one-pass
199
+ // machine's memory gets cut from the DFA memory budget,
200
+ // and that is harder to do if the DFA has already
201
+ // been built.
202
+ is_one_pass_ = prog_->IsOnePass();
203
+ }
204
+
205
+ // Returns rprog_, computing it if needed.
206
+ re2::Prog* RE2::ReverseProg() const {
207
+ MutexLock l(mutex_);
208
+ if (rprog_ == NULL && error_ == &empty_string) {
209
+ rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
210
+ if (rprog_ == NULL) {
211
+ if (options_.log_errors())
212
+ LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
213
+ error_ = new string("pattern too large - reverse compile failed");
214
+ error_code_ = RE2::ErrorPatternTooLarge;
215
+ return NULL;
216
+ }
217
+ }
218
+ return rprog_;
219
+ }
220
+
221
// Shared empty maps.  NamedCapturingGroups() / CapturingGroupNames() return
// these when the RE2 is not ok() or the regexp yields no map; ~RE2 compares
// against their addresses so it never deletes them.
+ static const map<string, int> empty_named_groups;
222
+ static const map<int, string> empty_group_names;
223
+
224
+ RE2::~RE2() {
225
+ if (suffix_regexp_)
226
+ suffix_regexp_->Decref();
227
+ if (entire_regexp_)
228
+ entire_regexp_->Decref();
229
+ delete mutex_;
230
+ delete prog_;
231
+ delete rprog_;
232
+ if (error_ != &empty_string)
233
+ delete error_;
234
+ if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
235
+ delete named_groups_;
236
+ if (group_names_ != NULL && group_names_ != &empty_group_names)
237
+ delete group_names_;
238
+ }
239
+
240
+ int RE2::ProgramSize() const {
241
+ if (prog_ == NULL)
242
+ return -1;
243
+ return prog_->size();
244
+ }
245
+
246
+ // Returns named_groups_, computing it if needed.
247
+ const map<string, int>& RE2::NamedCapturingGroups() const {
248
+ MutexLock l(mutex_);
249
+ if (!ok())
250
+ return empty_named_groups;
251
+ if (named_groups_ == NULL) {
252
+ named_groups_ = suffix_regexp_->NamedCaptures();
253
+ if (named_groups_ == NULL)
254
+ named_groups_ = &empty_named_groups;
255
+ }
256
+ return *named_groups_;
257
+ }
258
+
259
+ // Returns group_names_, computing it if needed.
260
+ const map<int, string>& RE2::CapturingGroupNames() const {
261
+ MutexLock l(mutex_);
262
+ if (!ok())
263
+ return empty_group_names;
264
+ if (group_names_ == NULL) {
265
+ group_names_ = suffix_regexp_->CaptureNames();
266
+ if (group_names_ == NULL)
267
+ group_names_ = &empty_group_names;
268
+ }
269
+ return *group_names_;
270
+ }
271
+
272
+ /***** Convenience interfaces *****/
273
+
274
+ bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
275
+ const Arg* const args[], int n) {
276
+ return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
277
+ }
278
+
279
+ bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
280
+ const Arg* const args[], int n) {
281
+ return re.DoMatch(text, UNANCHORED, NULL, args, n);
282
+ }
283
+
284
+ bool RE2::ConsumeN(StringPiece* input, const RE2& re,
285
+ const Arg* const args[], int n) {
286
+ int consumed;
287
+ if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
288
+ input->remove_prefix(consumed);
289
+ return true;
290
+ } else {
291
+ return false;
292
+ }
293
+ }
294
+
295
+ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
296
+ const Arg* const args[], int n) {
297
+ int consumed;
298
+ if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
299
+ input->remove_prefix(consumed);
300
+ return true;
301
+ } else {
302
+ return false;
303
+ }
304
+ }
305
+
306
+ // Returns the maximum submatch needed for the rewrite to be done by Replace().
307
+ // E.g. if rewrite == "foo \\2,\\1", returns 2.
308
+ static int MaxSubmatch(const StringPiece& rewrite) {
309
+ int max = 0;
310
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
311
+ s < end; s++) {
312
+ if (*s == '\\') {
313
+ s++;
314
+ int c = (s < end) ? *s : -1;
315
+ if (isdigit(c)) {
316
+ int n = (c - '0');
317
+ if (n > max)
318
+ max = n;
319
+ }
320
+ }
321
+ }
322
+ return max;
323
+ }
324
+
325
+ bool RE2::Replace(string *str,
326
+ const RE2& re,
327
+ const StringPiece& rewrite) {
328
+ StringPiece vec[kVecSize];
329
+ int nvec = 1 + MaxSubmatch(rewrite);
330
+ if (nvec > arraysize(vec))
331
+ return false;
332
+ if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
333
+ return false;
334
+
335
+ string s;
336
+ if (!re.Rewrite(&s, rewrite, vec, nvec))
337
+ return false;
338
+
339
+ assert(vec[0].begin() >= str->data());
340
+ assert(vec[0].end() <= str->data()+str->size());
341
+ str->replace(vec[0].data() - str->data(), vec[0].size(), s);
342
+ return true;
343
+ }
344
+
345
+ int RE2::GlobalReplace(string *str,
346
+ const RE2& re,
347
+ const StringPiece& rewrite) {
348
+ StringPiece vec[kVecSize];
349
+ int nvec = 1 + MaxSubmatch(rewrite);
350
+ if (nvec > arraysize(vec))
351
+ return false;
352
+
353
+ const char* p = str->data();
354
+ const char* ep = p + str->size();
355
+ const char* lastend = NULL;
356
+ string out;
357
+ int count = 0;
358
+ while (p <= ep) {
359
+ if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
360
+ break;
361
+ if (p < vec[0].begin())
362
+ out.append(p, vec[0].begin() - p);
363
+ if (vec[0].begin() == lastend && vec[0].size() == 0) {
364
+ // Disallow empty match at end of last match: skip ahead.
365
+ if (p < ep)
366
+ out.append(p, 1);
367
+ p++;
368
+ continue;
369
+ }
370
+ re.Rewrite(&out, rewrite, vec, nvec);
371
+ p = vec[0].end();
372
+ lastend = p;
373
+ count++;
374
+ }
375
+
376
+ if (count == 0)
377
+ return 0;
378
+
379
+ if (p < ep)
380
+ out.append(p, ep - p);
381
+ swap(out, *str);
382
+ return count;
383
+ }
384
+
385
+ bool RE2::Extract(const StringPiece &text,
386
+ const RE2& re,
387
+ const StringPiece &rewrite,
388
+ string *out) {
389
+ StringPiece vec[kVecSize];
390
+ int nvec = 1 + MaxSubmatch(rewrite);
391
+ if (nvec > arraysize(vec))
392
+ return false;
393
+
394
+ if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
395
+ return false;
396
+
397
+ out->clear();
398
+ return re.Rewrite(out, rewrite, vec, nvec);
399
+ }
400
+
401
+ string RE2::QuoteMeta(const StringPiece& unquoted) {
402
+ string result;
403
+ result.reserve(unquoted.size() << 1);
404
+
405
+ // Escape any ascii character not in [A-Za-z_0-9].
406
+ //
407
+ // Note that it's legal to escape a character even if it has no
408
+ // special meaning in a regular expression -- so this function does
409
+ // that. (This also makes it identical to the perl function of the
410
+ // same name except for the null-character special case;
411
+ // see `perldoc -f quotemeta`.)
412
+ for (int ii = 0; ii < unquoted.length(); ++ii) {
413
+ // Note that using 'isalnum' here raises the benchmark time from
414
+ // 32ns to 58ns:
415
+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
416
+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
417
+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&
418
+ unquoted[ii] != '_' &&
419
+ // If this is the part of a UTF8 or Latin1 character, we need
420
+ // to copy this byte without escaping. Experimentally this is
421
+ // what works correctly with the regexp library.
422
+ !(unquoted[ii] & 128)) {
423
+ if (unquoted[ii] == '\0') { // Special handling for null chars.
424
+ // Note that this special handling is not strictly required for RE2,
425
+ // but this quoting is required for other regexp libraries such as
426
+ // PCRE.
427
+ // Can't use "\\0" since the next character might be a digit.
428
+ result += "\\x00";
429
+ continue;
430
+ }
431
+ result += '\\';
432
+ }
433
+ result += unquoted[ii];
434
+ }
435
+
436
+ return result;
437
+ }
438
+
439
+ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
440
+ if (prog_ == NULL)
441
+ return false;
442
+
443
+ int n = prefix_.size();
444
+ if (n > maxlen)
445
+ n = maxlen;
446
+
447
+ // Determine initial min max from prefix_ literal.
448
+ string pmin, pmax;
449
+ pmin = prefix_.substr(0, n);
450
+ pmax = prefix_.substr(0, n);
451
+ if (prefix_foldcase_) {
452
+ // prefix is ASCII lowercase; change pmin to uppercase.
453
+ for (int i = 0; i < n; i++) {
454
+ if ('a' <= pmin[i] && pmin[i] <= 'z')
455
+ pmin[i] += 'A' - 'a';
456
+ }
457
+ }
458
+
459
+ // Add to prefix min max using PossibleMatchRange on regexp.
460
+ string dmin, dmax;
461
+ maxlen -= n;
462
+ if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
463
+ pmin += dmin;
464
+ pmax += dmax;
465
+ } else if (pmax.size() > 0) {
466
+ // prog_->PossibleMatchRange has failed us,
467
+ // but we still have useful information from prefix_.
468
+ // Round up pmax to allow any possible suffix.
469
+ pmax = PrefixSuccessor(pmax);
470
+ } else {
471
+ // Nothing useful.
472
+ *min = "";
473
+ *max = "";
474
+ return false;
475
+ }
476
+
477
+ *min = pmin;
478
+ *max = pmax;
479
+ return true;
480
+ }
481
+
482
+ // Avoid possible locale nonsense in standard strcasecmp.
483
+ // The string a is known to be all lowercase.
484
+ static int ascii_strcasecmp(const char* a, const char* b, int len) {
485
+ const char *ae = a + len;
486
+
487
+ for (; a < ae; a++, b++) {
488
+ uint8 x = *a;
489
+ uint8 y = *b;
490
+ if ('A' <= y && y <= 'Z')
491
+ y += 'a' - 'A';
492
+ if (x != y)
493
+ return x - y;
494
+ }
495
+ return 0;
496
+ }
497
+
498
+
499
+ /***** Actual matching and rewriting code *****/
500
+
501
// Core matching routine.  Searches text[startpos, endpos) for this regexp
// under the anchoring mode re_anchor.  On a match it returns true and fills
// submatch[0..nsubmatch): submatch[0] is the overall match, and entries past
// the regexp's own group count are reset.  It returns false when the RE2 is
// not ok(), the position pair is invalid, nothing matches, or an engine
// reports a failure.
//
// Strategy: a DFA search decides whether and where the text matches; then,
// if capture groups were requested (or the DFA was skipped or failed), one
// of OnePass, BitState or NFA is run to locate the submatches.
+ bool RE2::Match(const StringPiece& text,
502
+ int startpos,
503
+ int endpos,
504
+ Anchor re_anchor,
505
+ StringPiece* submatch,
506
+ int nsubmatch) const {
507
+ if (!ok() || suffix_regexp_ == NULL) { // unusable regexp: optionally log, then fail
508
+ if (options_.log_errors())
509
+ LOG(ERROR) << "Invalid RE2: " << *error_;
510
+ return false;
511
+ }
512
+
513
+ if (startpos < 0 || startpos > endpos || endpos > text.size()) { // range must lie inside text
514
+ LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
515
+ return false;
516
+ }
517
+
518
+ StringPiece subtext = text; // subtext becomes the window [startpos, endpos) of text
519
+ subtext.remove_prefix(startpos);
520
+ subtext.remove_suffix(text.size() - endpos);
521
+
522
+ // Use DFAs to find exact location of match, filter out non-matches.
523
+
524
+ // Don't ask for the location if we won't use it.
525
+ // SearchDFA can do extra optimizations in that case.
526
+ StringPiece match;
527
+ StringPiece* matchp = &match;
528
+ if (nsubmatch == 0)
529
+ matchp = NULL;
530
+
531
+ int ncap = 1 + NumberOfCapturingGroups(); // slots to fill: overall match plus groups, capped at nsubmatch
532
+ if (ncap > nsubmatch)
533
+ ncap = nsubmatch;
534
+
535
+ // If the regexp is anchored explicitly, must not be in middle of text.
536
+ if (prog_->anchor_start() && startpos != 0)
537
+ return false;
538
+
539
+ // If the regexp is anchored explicitly, update re_anchor
540
+ // so that we can potentially fall into a faster case below.
541
+ if (prog_->anchor_start() && prog_->anchor_end())
542
+ re_anchor = ANCHOR_BOTH;
543
+ else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
544
+ re_anchor = ANCHOR_START;
545
+
546
+ // Check for the required prefix, if any.
547
+ int prefixlen = 0;
548
+ if (!prefix_.empty()) {
549
+ if (startpos != 0)
550
+ return false;
551
+ prefixlen = prefix_.size();
552
+ if (prefixlen > subtext.size())
553
+ return false;
554
+ if (prefix_foldcase_) {
555
+ if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
556
+ return false;
557
+ } else {
558
+ if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
559
+ return false;
560
+ }
561
+ subtext.remove_prefix(prefixlen); // the engines below only see the text after the prefix
562
+ // If there is a required prefix, the anchor must be at least ANCHOR_START.
563
+ if (re_anchor != ANCHOR_BOTH)
564
+ re_anchor = ANCHOR_START;
565
+ }
566
+
567
+ Prog::Anchor anchor = Prog::kUnanchored;
568
+ Prog::MatchKind kind = Prog::kFirstMatch;
569
+ if (options_.longest_match())
570
+ kind = Prog::kLongestMatch;
571
+ bool skipped_test = false; // set when the DFA result is unavailable (DFA failed or was deliberately skipped)
572
+
573
+ bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
574
+
575
+ // SearchBitState allocates a bit vector of size prog_->size() * text.size().
576
+ // It also allocates a stack of 3-word structures which could potentially
577
+ // grow as large as prog_->size() * text.size() but in practice is much
578
+ // smaller.
579
+ // Conditions for using SearchBitState:
580
+ const int MaxBitStateProg = 500; // prog_->size() <= Max.
581
+ const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
582
+ bool can_bit_state = prog_->size() <= MaxBitStateProg;
583
+ int bit_state_text_max = MaxBitStateVector / prog_->size(); // longest text BitState is allowed for this program size
584
+
585
+ bool dfa_failed = false;
586
+ switch (re_anchor) {
587
+ default: // any other anchor value is handled as UNANCHORED
588
+ case UNANCHORED: {
589
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
590
+ matchp, &dfa_failed, NULL)) {
591
+ if (dfa_failed) {
592
+ // Fall back to NFA below.
593
+ skipped_test = true;
594
+ if (FLAGS_trace_re2)
595
+ LOG(INFO) << "Match " << trunc(pattern_)
596
+ << " [" << CEscape(subtext) << "]"
597
+ << " DFA failed.";
598
+ break;
599
+ }
600
+ if (FLAGS_trace_re2)
601
+ LOG(INFO) << "Match " << trunc(pattern_)
602
+ << " [" << CEscape(subtext) << "]"
603
+ << " used DFA - no match.";
604
+ return false;
605
+ }
606
+ if (FLAGS_trace_re2)
607
+ LOG(INFO) << "Match " << trunc(pattern_)
608
+ << " [" << CEscape(subtext) << "]"
609
+ << " used DFA - match";
610
+ if (matchp == NULL) // Matched. Don't care where
611
+ return true;
612
+ // SearchDFA set match[0].end() but didn't know where the
613
+ // match started. Run the regexp backward from match[0].end()
614
+ // to find the longest possible match -- that's where it started.
615
+ Prog* prog = ReverseProg();
616
+ if (prog == NULL)
617
+ return false;
618
+ if (!prog->SearchDFA(match, text, Prog::kAnchored,
619
+ Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
620
+ if (dfa_failed) {
621
+ // Fall back to NFA below.
622
+ skipped_test = true;
623
+ if (FLAGS_trace_re2)
624
+ LOG(INFO) << "Match " << trunc(pattern_)
625
+ << " [" << CEscape(subtext) << "]"
626
+ << " reverse DFA failed.";
627
+ break;
628
+ }
629
+ if (FLAGS_trace_re2)
630
+ LOG(INFO) << "Match " << trunc(pattern_)
631
+ << " [" << CEscape(subtext) << "]"
632
+ << " DFA inconsistency.";
633
+ LOG(ERROR) << "DFA inconsistency";
634
+ return false;
635
+ }
636
+ if (FLAGS_trace_re2)
637
+ LOG(INFO) << "Match " << trunc(pattern_)
638
+ << " [" << CEscape(subtext) << "]"
639
+ << " used reverse DFA.";
640
+ break;
641
+ } // every path through the UNANCHORED case breaks or returns; no fallthrough
642
+
643
+ case ANCHOR_BOTH:
644
+ case ANCHOR_START:
645
+ if (re_anchor == ANCHOR_BOTH)
646
+ kind = Prog::kFullMatch;
647
+ anchor = Prog::kAnchored;
648
+
649
+ // If only a small amount of text and need submatch
650
+ // information anyway and we're going to use OnePass or BitState
651
+ // to get it, we might as well not even bother with the DFA:
652
+ // OnePass or BitState will be fast enough.
653
+ // On tiny texts, OnePass outruns even the DFA, and
654
+ // it doesn't have the shared state and occasional mutex that
655
+ // the DFA does.
656
+ if (can_one_pass && text.size() <= 4096 &&
657
+ (ncap > 1 || text.size() <= 8)) {
658
+ if (FLAGS_trace_re2)
659
+ LOG(INFO) << "Match " << trunc(pattern_)
660
+ << " [" << CEscape(subtext) << "]"
661
+ << " skipping DFA for OnePass.";
662
+ skipped_test = true;
663
+ break;
664
+ }
665
+ if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
666
+ if (FLAGS_trace_re2)
667
+ LOG(INFO) << "Match " << trunc(pattern_)
668
+ << " [" << CEscape(subtext) << "]"
669
+ << " skipping DFA for BitState.";
670
+ skipped_test = true;
671
+ break;
672
+ }
673
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
674
+ &match, &dfa_failed, NULL)) {
675
+ if (dfa_failed) {
676
+ if (FLAGS_trace_re2)
677
+ LOG(INFO) << "Match " << trunc(pattern_)
678
+ << " [" << CEscape(subtext) << "]"
679
+ << " DFA failed.";
680
+ skipped_test = true;
681
+ break;
682
+ }
683
+ if (FLAGS_trace_re2)
684
+ LOG(INFO) << "Match " << trunc(pattern_)
685
+ << " [" << CEscape(subtext) << "]"
686
+ << " used DFA - no match.";
687
+ return false;
688
+ }
689
+ break;
690
+ }
691
+
692
+ if (!skipped_test && ncap <= 1) { // the DFA result is all that was asked for
693
+ // We know exactly where it matches. That's enough.
694
+ if (ncap == 1)
695
+ submatch[0] = match;
696
+ } else {
697
+ StringPiece subtext1; // the text handed to the submatch-capable engine
698
+ if (skipped_test) {
699
+ // DFA ran out of memory or was skipped:
700
+ // need to search in entire original text.
701
+ subtext1 = subtext;
702
+ } else {
703
+ // DFA found the exact match location:
704
+ // let NFA run an anchored, full match search
705
+ // to find submatch locations.
706
+ subtext1 = match;
707
+ anchor = Prog::kAnchored;
708
+ kind = Prog::kFullMatch;
709
+ }
710
+
711
+ if (can_one_pass && anchor != Prog::kUnanchored) { // engine preference: OnePass, then BitState, then NFA
712
+ if (FLAGS_trace_re2)
713
+ LOG(INFO) << "Match " << trunc(pattern_)
714
+ << " [" << CEscape(subtext) << "]"
715
+ << " using OnePass.";
716
+ if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
717
+ if (!skipped_test) // the DFA had already reported a match, so a miss here is a disagreement
718
+ LOG(ERROR) << "SearchOnePass inconsistency";
719
+ return false;
720
+ }
721
+ } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
722
+ if (FLAGS_trace_re2)
723
+ LOG(INFO) << "Match " << trunc(pattern_)
724
+ << " [" << CEscape(subtext) << "]"
725
+ << " using BitState.";
726
+ if (!prog_->SearchBitState(subtext1, text, anchor,
727
+ kind, submatch, ncap)) {
728
+ if (!skipped_test)
729
+ LOG(ERROR) << "SearchBitState inconsistency";
730
+ return false;
731
+ }
732
+ } else {
733
+ if (FLAGS_trace_re2)
734
+ LOG(INFO) << "Match " << trunc(pattern_)
735
+ << " [" << CEscape(subtext) << "]"
736
+ << " using NFA.";
737
+ if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
738
+ if (!skipped_test)
739
+ LOG(ERROR) << "SearchNFA inconsistency";
740
+ return false;
741
+ }
742
+ }
743
+ }
744
+
745
+ // Adjust overall match for required prefix that we stripped off.
746
+ if (prefixlen > 0 && nsubmatch > 0)
747
+ submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
748
+ submatch[0].size() + prefixlen);
749
+
750
+ // Zero submatches that don't exist in the regexp.
751
+ for (int i = ncap; i < nsubmatch; i++)
752
+ submatch[i] = NULL;
753
+ return true;
754
+ }
755
+
756
+ // Internal matcher - like Match() but takes Args not StringPieces.
757
+ bool RE2::DoMatch(const StringPiece& text,
758
+ Anchor anchor,
759
+ int* consumed,
760
+ const Arg* const* args,
761
+ int n) const {
762
+ if (!ok()) {
763
+ if (options_.log_errors())
764
+ LOG(ERROR) << "Invalid RE2: " << *error_;
765
+ return false;
766
+ }
767
+
768
+ // Count number of capture groups needed.
769
+ int nvec;
770
+ if (n == 0 && consumed == NULL)
771
+ nvec = 0;
772
+ else
773
+ nvec = n+1;
774
+
775
+ StringPiece* vec;
776
+ StringPiece stkvec[kVecSize];
777
+ StringPiece* heapvec = NULL;
778
+
779
+ if (nvec <= arraysize(stkvec)) {
780
+ vec = stkvec;
781
+ } else {
782
+ vec = new StringPiece[nvec];
783
+ heapvec = vec;
784
+ }
785
+
786
+ if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
787
+ delete[] heapvec;
788
+ return false;
789
+ }
790
+
791
+ if(consumed != NULL)
792
+ *consumed = vec[0].end() - text.begin();
793
+
794
+ if (n == 0 || args == NULL) {
795
+ // We are not interested in results
796
+ delete[] heapvec;
797
+ return true;
798
+ }
799
+
800
+ int ncap = NumberOfCapturingGroups();
801
+ if (ncap < n) {
802
+ // RE has fewer capturing groups than number of arg pointers passed in
803
+ VLOG(1) << "Asked for " << n << " but only have " << ncap;
804
+ delete[] heapvec;
805
+ return false;
806
+ }
807
+
808
+ // If we got here, we must have matched the whole pattern.
809
+ for (int i = 0; i < n; i++) {
810
+ const StringPiece& s = vec[i+1];
811
+ if (!args[i]->Parse(s.data(), s.size())) {
812
+ // TODO: Should we indicate what the error was?
813
+ VLOG(1) << "Parse error on #" << i << " " << s << " "
814
+ << (void*)s.data() << "/" << s.size();
815
+ delete[] heapvec;
816
+ return false;
817
+ }
818
+ }
819
+
820
+ delete[] heapvec;
821
+ return true;
822
+ }
823
+
824
+ // Append the "rewrite" string, with backslash subsitutions from "vec",
825
+ // to string "out".
826
+ bool RE2::Rewrite(string *out, const StringPiece &rewrite,
827
+ const StringPiece *vec, int veclen) const {
828
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
829
+ s < end; s++) {
830
+ int c = *s;
831
+ if (c == '\\') {
832
+ s++;
833
+ c = (s < end) ? *s : -1;
834
+ if (isdigit(c)) {
835
+ int n = (c - '0');
836
+ if (n >= veclen) {
837
+ LOG(ERROR) << "requested group " << n
838
+ << " in regexp " << rewrite.data();
839
+ return false;
840
+ }
841
+ StringPiece snip = vec[n];
842
+ if (snip.size() > 0)
843
+ out->append(snip.data(), snip.size());
844
+ } else if (c == '\\') {
845
+ out->push_back('\\');
846
+ } else {
847
+ LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
848
+ return false;
849
+ }
850
+ } else {
851
+ out->push_back(c);
852
+ }
853
+ }
854
+ return true;
855
+ }
856
+
857
+ // Return the number of capturing subpatterns, or -1 if the
858
+ // regexp wasn't valid on construction.
859
+ int RE2::NumberOfCapturingGroups() const {
860
+ if (suffix_regexp_ == NULL)
861
+ return -1;
862
+ ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
863
+ " multiple threads end up doing the same work in parallel.");
864
+ if (num_captures_ == -1)
865
+ num_captures_ = suffix_regexp_->NumCaptures();
866
+ return num_captures_;
867
+ }
868
+
869
+ // Checks that the rewrite string is well-formed with respect to this
870
+ // regular expression.
871
+ bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
872
+ int max_token = -1;
873
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
874
+ s < end; s++) {
875
+ int c = *s;
876
+ if (c != '\\') {
877
+ continue;
878
+ }
879
+ if (++s == end) {
880
+ *error = "Rewrite schema error: '\\' not allowed at end.";
881
+ return false;
882
+ }
883
+ c = *s;
884
+ if (c == '\\') {
885
+ continue;
886
+ }
887
+ if (!isdigit(c)) {
888
+ *error = "Rewrite schema error: "
889
+ "'\\' must be followed by a digit or '\\'.";
890
+ return false;
891
+ }
892
+ int n = (c - '0');
893
+ if (max_token < n) {
894
+ max_token = n;
895
+ }
896
+ }
897
+
898
+ if (max_token > NumberOfCapturingGroups()) {
899
+ SStringPrintf(error, "Rewrite schema requests %d matches, "
900
+ "but the regexp only has %d parenthesized subexpressions.",
901
+ max_token, NumberOfCapturingGroups());
902
+ return false;
903
+ }
904
+ return true;
905
+ }
906
+
907
+ /***** Parsers for various types *****/
908
+
909
+ bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
910
+ // We fail if somebody asked us to store into a non-NULL void* pointer
911
+ return (dest == NULL);
912
+ }
913
+
914
+ bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
915
+ if (dest == NULL) return true;
916
+ reinterpret_cast<string*>(dest)->assign(str, n);
917
+ return true;
918
+ }
919
+
920
+ bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
921
+ if (dest == NULL) return true;
922
+ reinterpret_cast<StringPiece*>(dest)->set(str, n);
923
+ return true;
924
+ }
925
+
926
+ bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
927
+ if (n != 1) return false;
928
+ if (dest == NULL) return true;
929
+ *(reinterpret_cast<char*>(dest)) = str[0];
930
+ return true;
931
+ }
932
+
933
+ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
934
+ if (n != 1) return false;
935
+ if (dest == NULL) return true;
936
+ *(reinterpret_cast<unsigned char*>(dest)) = str[0];
937
+ return true;
938
+ }
939
+
940
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// REQUIRES "buf" must have length at least kMaxNumberLength+1
// Copies the *np bytes at "str" into "buf" and null-terminates them so
// that the strtoxxx() routines can be applied.
// On success returns "buf" and overwrites *np with the new length.
// Returns "" (leaving *np untouched) when *np <= 0, when the text starts
// with whitespace, or when it is still longer than kMaxNumberLength
// after leading zeros have been collapsed.
static const char* TerminateNumber(char* buf, const char* str, int* np) {
  int n = *np;
  if (n <= 0) return "";
  // isspace() is only defined for unsigned char values and EOF, so
  // convert before classifying; *str may be a negative plain char.
  if (isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // Although buf has a fixed maximum size, we can still handle
  // arbitrarily large integers correctly by omitting leading zeros.
  // (Numbers that are still too long will be out of range.)
  // Before deciding whether str is too long,
  // remove leading zeros with s/000+/00/.
  // Leaving the leading two zeros in place means that
  // we don't change 0000x123 (invalid) into 0x123 (valid).
  // Skip over leading - before replacing.
  bool neg = false;
  if (str[0] == '-') {
    neg = true;
    n--;
    str++;
  }

  if (n >= 3 && str[0] == '0' && str[1] == '0') {
    while (n >= 3 && str[2] == '0') {
      n--;
      str++;
    }
  }

  if (neg) {  // make room in buf for -
    n++;
    str--;
  }

  if (n > kMaxNumberLength) return "";

  memmove(buf, str, n);
  if (neg) {
    // The byte just before the kept digits may be a dropped '0' rather
    // than the original sign, so write the sign explicitly.
    buf[0] = '-';
  }
  buf[n] = '\0';
  *np = n;
  return buf;
}
992
+
993
+ bool RE2::Arg::parse_long_radix(const char* str,
994
+ int n,
995
+ void* dest,
996
+ int radix) {
997
+ if (n == 0) return false;
998
+ char buf[kMaxNumberLength+1];
999
+ str = TerminateNumber(buf, str, &n);
1000
+ char* end;
1001
+ errno = 0;
1002
+ long r = strtol(str, &end, radix);
1003
+ if (end != str + n) return false; // Leftover junk
1004
+ if (errno) return false;
1005
+ if (dest == NULL) return true;
1006
+ *(reinterpret_cast<long*>(dest)) = r;
1007
+ return true;
1008
+ }
1009
+
1010
+ bool RE2::Arg::parse_ulong_radix(const char* str,
1011
+ int n,
1012
+ void* dest,
1013
+ int radix) {
1014
+ if (n == 0) return false;
1015
+ char buf[kMaxNumberLength+1];
1016
+ str = TerminateNumber(buf, str, &n);
1017
+ if (str[0] == '-') {
1018
+ // strtoul() will silently accept negative numbers and parse
1019
+ // them. This module is more strict and treats them as errors.
1020
+ return false;
1021
+ }
1022
+
1023
+ char* end;
1024
+ errno = 0;
1025
+ unsigned long r = strtoul(str, &end, radix);
1026
+ if (end != str + n) return false; // Leftover junk
1027
+ if (errno) return false;
1028
+ if (dest == NULL) return true;
1029
+ *(reinterpret_cast<unsigned long*>(dest)) = r;
1030
+ return true;
1031
+ }
1032
+
1033
+ bool RE2::Arg::parse_short_radix(const char* str,
1034
+ int n,
1035
+ void* dest,
1036
+ int radix) {
1037
+ long r;
1038
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1039
+ if ((short)r != r) return false; // Out of range
1040
+ if (dest == NULL) return true;
1041
+ *(reinterpret_cast<short*>(dest)) = r;
1042
+ return true;
1043
+ }
1044
+
1045
+ bool RE2::Arg::parse_ushort_radix(const char* str,
1046
+ int n,
1047
+ void* dest,
1048
+ int radix) {
1049
+ unsigned long r;
1050
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1051
+ if ((ushort)r != r) return false; // Out of range
1052
+ if (dest == NULL) return true;
1053
+ *(reinterpret_cast<unsigned short*>(dest)) = r;
1054
+ return true;
1055
+ }
1056
+
1057
+ bool RE2::Arg::parse_int_radix(const char* str,
1058
+ int n,
1059
+ void* dest,
1060
+ int radix) {
1061
+ long r;
1062
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1063
+ if ((int)r != r) return false; // Out of range
1064
+ if (dest == NULL) return true;
1065
+ *(reinterpret_cast<int*>(dest)) = r;
1066
+ return true;
1067
+ }
1068
+
1069
+ bool RE2::Arg::parse_uint_radix(const char* str,
1070
+ int n,
1071
+ void* dest,
1072
+ int radix) {
1073
+ unsigned long r;
1074
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1075
+ if ((uint)r != r) return false; // Out of range
1076
+ if (dest == NULL) return true;
1077
+ *(reinterpret_cast<unsigned int*>(dest)) = r;
1078
+ return true;
1079
+ }
1080
+
1081
+ bool RE2::Arg::parse_longlong_radix(const char* str,
1082
+ int n,
1083
+ void* dest,
1084
+ int radix) {
1085
+ if (n == 0) return false;
1086
+ char buf[kMaxNumberLength+1];
1087
+ str = TerminateNumber(buf, str, &n);
1088
+ char* end;
1089
+ errno = 0;
1090
+ int64 r = strtoll(str, &end, radix);
1091
+ if (end != str + n) return false; // Leftover junk
1092
+ if (errno) return false;
1093
+ if (dest == NULL) return true;
1094
+ *(reinterpret_cast<int64*>(dest)) = r;
1095
+ return true;
1096
+ }
1097
+
1098
+ bool RE2::Arg::parse_ulonglong_radix(const char* str,
1099
+ int n,
1100
+ void* dest,
1101
+ int radix) {
1102
+ if (n == 0) return false;
1103
+ char buf[kMaxNumberLength+1];
1104
+ str = TerminateNumber(buf, str, &n);
1105
+ if (str[0] == '-') {
1106
+ // strtoull() will silently accept negative numbers and parse
1107
+ // them. This module is more strict and treats them as errors.
1108
+ return false;
1109
+ }
1110
+ char* end;
1111
+ errno = 0;
1112
+ uint64 r = strtoull(str, &end, radix);
1113
+ if (end != str + n) return false; // Leftover junk
1114
+ if (errno) return false;
1115
+ if (dest == NULL) return true;
1116
+ *(reinterpret_cast<uint64*>(dest)) = r;
1117
+ return true;
1118
+ }
1119
+
1120
// Shared implementation of the float and double parsers.
// Copies the n bytes at "str" into a local NUL-terminated buffer and
// converts them with strtof() (isfloat) or strtod() (otherwise).
// Returns false for empty input, for input of kMaxLength bytes or more,
// for text the conversion does not consume completely, and when the
// conversion sets errno.  On success the value is stored through "dest"
// as a float or a double, matching isfloat, unless "dest" is NULL.
static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
  static const int kMaxLength = 200;
  if (n == 0 || n >= kMaxLength) return false;

  char text[kMaxLength];
  memcpy(text, str, n);
  text[n] = '\0';

  char* stop;
  errno = 0;
  const double value = isfloat ? strtof(text, &stop) : strtod(text, &stop);
  if (stop != text + n || errno)
    return false;  // Leftover junk, or the conversion reported an error

  if (dest == NULL) return true;
  if (isfloat)
    *static_cast<float*>(dest) = static_cast<float>(value);
  else
    *static_cast<double*>(dest) = value;
  return true;
}
1145
+
1146
+ bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1147
+ return parse_double_float(str, n, false, dest);
1148
+ }
1149
+
1150
+ bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1151
+ return parse_double_float(str, n, true, dest);
1152
+ }
1153
+
1154
+
1155
+ #define DEFINE_INTEGER_PARSERS(name) \
1156
+ bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
1157
+ return parse_##name##_radix(str, n, dest, 10); \
1158
+ } \
1159
+ bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
1160
+ return parse_##name##_radix(str, n, dest, 16); \
1161
+ } \
1162
+ bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
1163
+ return parse_##name##_radix(str, n, dest, 8); \
1164
+ } \
1165
+ bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1166
+ return parse_##name##_radix(str, n, dest, 0); \
1167
+ }
1168
+
1169
+ DEFINE_INTEGER_PARSERS(short);
1170
+ DEFINE_INTEGER_PARSERS(ushort);
1171
+ DEFINE_INTEGER_PARSERS(int);
1172
+ DEFINE_INTEGER_PARSERS(uint);
1173
+ DEFINE_INTEGER_PARSERS(long);
1174
+ DEFINE_INTEGER_PARSERS(ulong);
1175
+ DEFINE_INTEGER_PARSERS(longlong);
1176
+ DEFINE_INTEGER_PARSERS(ulonglong);
1177
+
1178
+ #undef DEFINE_INTEGER_PARSERS
1179
+
1180
+ } // namespace re2