chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,100 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include <string>
6
+ #include "util/util.h"
7
+ #include "re2/filtered_re2.h"
8
+ #include "re2/prefilter.h"
9
+ #include "re2/prefilter_tree.h"
10
+
11
+ namespace re2 {
12
+
13
+ FilteredRE2::FilteredRE2()
14
+ : compiled_(false),
15
+ prefilter_tree_(new PrefilterTree()) {
16
+ }
17
+
18
+ FilteredRE2::~FilteredRE2() {
19
+ for (int i = 0; i < re2_vec_.size(); i++)
20
+ delete re2_vec_[i];
21
+ delete prefilter_tree_;
22
+ }
23
+
24
+ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
25
+ const RE2::Options& options, int* id) {
26
+ RE2* re = new RE2(pattern, options);
27
+ RE2::ErrorCode code = re->error_code();
28
+
29
+ if (!re->ok()) {
30
+ LOG(ERROR) << "Couldn't compile regular expression, skipping: "
31
+ << re << " due to error " << re->error();
32
+ delete re;
33
+ } else {
34
+ *id = re2_vec_.size();
35
+ re2_vec_.push_back(re);
36
+ }
37
+
38
+ return code;
39
+ }
40
+
41
+ void FilteredRE2::Compile(vector<string>* atoms) {
42
+ if (compiled_ || re2_vec_.size() == 0) {
43
+ LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
44
+ return;
45
+ }
46
+
47
+ for (int i = 0; i < re2_vec_.size(); i++) {
48
+ Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
49
+ prefilter_tree_->Add(prefilter);
50
+ }
51
+ atoms->clear();
52
+ prefilter_tree_->Compile(atoms);
53
+ compiled_ = true;
54
+ }
55
+
56
+ int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
57
+ for (int i = 0; i < re2_vec_.size(); i++)
58
+ if (RE2::PartialMatch(text, *re2_vec_[i]))
59
+ return i;
60
+ return -1;
61
+ }
62
+
63
+ int FilteredRE2::FirstMatch(const StringPiece& text,
64
+ const vector<int>& atoms) const {
65
+ if (!compiled_) {
66
+ LOG(DFATAL) << "FirstMatch called before Compile";
67
+ return -1;
68
+ }
69
+ vector<int> regexps;
70
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
71
+ for (int i = 0; i < regexps.size(); i++)
72
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
73
+ return regexps[i];
74
+ return -1;
75
+ }
76
+
77
+ bool FilteredRE2::AllMatches(
78
+ const StringPiece& text,
79
+ const vector<int>& atoms,
80
+ vector<int>* matching_regexps) const {
81
+ matching_regexps->clear();
82
+ vector<int> regexps;
83
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
84
+ for (int i = 0; i < regexps.size(); i++)
85
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
86
+ matching_regexps->push_back(regexps[i]);
87
+ return !matching_regexps->empty();
88
+ }
89
+
90
+ void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
91
+ vector<int>* passed_regexps) {
92
+ prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
93
+ }
94
+
95
+
96
+ void FilteredRE2::PrintPrefilter(int regexpid) {
97
+ prefilter_tree_->PrintPrefilter(regexpid);
98
+ }
99
+
100
+ } // namespace re2
@@ -0,0 +1,99 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
6
+ // It provides a prefilter mechanism that helps in cutting down the
7
+ // number of regexps that need to be actually searched.
8
+ //
9
+ // By design, it does not include a string matching engine. This is to
10
+ // allow the user of the class to use their favorite string match
11
+ // engine. The overall flow is: Add all the regexps using Add, then
12
+ // Compile the FilteredRE2. The compile returns strings that need to
13
+ // be matched. Note that all returned strings are lowercase. For
14
+ // applying regexps to a search text, the caller does the string
15
+ // matching using the strings returned. When doing the string match,
16
+ // note that the caller has to do that on lower cased version of the
17
+ // search text. Then call FirstMatch or AllMatches with a vector of
18
+ // indices of strings that were found in the text to get the actual
19
+ // regexp matches.
20
+
21
+ #ifndef RE2_FILTERED_RE2_H_
22
+ #define RE2_FILTERED_RE2_H_
23
+
24
+ #include <vector>
25
+ #include "re2/re2.h"
26
+
27
+ namespace re2 {
28
+ using std::vector;
29
+
30
+ class PrefilterTree;
31
+
32
+ class FilteredRE2 {
33
+ public:
34
+ FilteredRE2();
35
+ ~FilteredRE2();
36
+
37
+ // Uses RE2 constructor to create a RE2 object (re). Returns
38
+ // re->error_code(). If error_code is other than NoError, then re is
39
+ // deleted and not added to re2_vec_.
40
+ RE2::ErrorCode Add(const StringPiece& pattern,
41
+ const RE2::Options& options,
42
+ int *id);
43
+
44
+ // Prepares the regexps added by Add for filtering. Returns a set
45
+ // of strings that the caller should check for in candidate texts.
46
+ // The returned strings are lowercased. When doing string matching,
47
+ // the search text should be lowercased first to find matching
48
+ // strings from the set of strings returned by Compile. Call after
49
+ // all Add calls are done.
50
+ void Compile(vector<string>* strings_to_match);
51
+
52
+ // Returns the index of the first matching regexp.
53
+ // Returns -1 on no match. Can be called prior to Compile.
54
+ // Does not do any filtering: simply tries to Match the
55
+ // regexps in a loop.
56
+ int SlowFirstMatch(const StringPiece& text) const;
57
+
58
+ // Returns the index of the first matching regexp.
59
+ // Returns -1 on no match. Compile has to be called before
60
+ // calling this.
61
+ int FirstMatch(const StringPiece& text,
62
+ const vector<int>& atoms) const;
63
+
64
+ // Returns the indices of all matching regexps, after first clearing
65
+ // matched_regexps.
66
+ bool AllMatches(const StringPiece& text,
67
+ const vector<int>& atoms,
68
+ vector<int>* matching_regexps) const;
69
+
70
+ // The number of regexps added.
71
+ int NumRegexps() const { return re2_vec_.size(); }
72
+
73
+ private:
74
+
75
+ // Get the individual RE2 objects. Useful for testing.
76
+ RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
77
+
78
+ // Print prefilter.
79
+ void PrintPrefilter(int regexpid);
80
+
81
+ // Useful for testing and debugging.
82
+ void RegexpsGivenStrings(const vector<int>& matched_atoms,
83
+ vector<int>* passed_regexps);
84
+
85
+ // All the regexps in the FilteredRE2.
86
+ vector<RE2*> re2_vec_;
87
+
88
+ // Has the FilteredRE2 been compiled using Compile()
89
+ bool compiled_;
90
+
91
+ // An AND-OR tree of string atoms used for filtering regexps.
92
+ PrefilterTree* prefilter_tree_;
93
+
94
+ DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
95
+ };
96
+
97
+ } // namespace re2
98
+
99
+ #endif // RE2_FILTERED_RE2_H_
data/ext/re2/hash.cc ADDED
@@ -0,0 +1,231 @@
1
+ // Modified by Russ Cox to add "namespace re2".
2
+ // Also threw away all but hashword and hashword2.
3
+ // http://burtleburtle.net/bob/c/lookup3.c
4
+
5
+ /*
6
+ -------------------------------------------------------------------------------
7
+ lookup3.c, by Bob Jenkins, May 2006, Public Domain.
8
+
9
+ These are functions for producing 32-bit hashes for hash table lookup.
10
+ hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
11
+ are externally useful functions. Routines to test the hash are included
12
+ if SELF_TEST is defined. You can use this free for any purpose. It's in
13
+ the public domain. It has no warranty.
14
+
15
+ You probably want to use hashlittle(). hashlittle() and hashbig()
16
+ hash byte arrays. hashlittle() is faster than hashbig() on
17
+ little-endian machines. Intel and AMD are little-endian machines.
18
+ On second thought, you probably want hashlittle2(), which is identical to
19
+ hashlittle() except it returns two 32-bit hashes for the price of one.
20
+ You could implement hashbig2() if you wanted but I haven't bothered here.
21
+
22
+ If you want to find a hash of, say, exactly 7 integers, do
23
+ a = i1; b = i2; c = i3;
24
+ mix(a,b,c);
25
+ a += i4; b += i5; c += i6;
26
+ mix(a,b,c);
27
+ a += i7;
28
+ final(a,b,c);
29
+ then use c as the hash value. If you have a variable length array of
30
+ 4-byte integers to hash, use hashword(). If you have a byte array (like
31
+ a character string), use hashlittle(). If you have several byte arrays, or
32
+ a mix of things, see the comments above hashlittle().
33
+
34
+ Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
35
+ then mix those integers. This is fast (you can do a lot more thorough
36
+ mixing with 12*3 instructions on 3 integers than you can with 3 instructions
37
+ on 1 byte), but shoehorning those bytes into integers efficiently is messy.
38
+ -------------------------------------------------------------------------------
39
+ */
40
+
41
+ #include "util/util.h"
42
+
43
// Rotates the 32-bit value x left by k bits. k must be in 1..31: a shift
// by 0 or 32 would make one of the two shifts a full-width shift.
#define rot(x, k) ( ((x) << (k)) | ((x) >> (32 - (k))) )
44
+
45
+ /*
46
+ -------------------------------------------------------------------------------
47
+ mix -- mix 3 32-bit values reversibly.
48
+
49
+ This is reversible, so any information in (a,b,c) before mix() is
50
+ still in (a,b,c) after mix().
51
+
52
+ If four pairs of (a,b,c) inputs are run through mix(), or through
53
+ mix() in reverse, there are at least 32 bits of the output that
54
+ are sometimes the same for one pair and different for another pair.
55
+ This was tested for:
56
+ * pairs that differed by one bit, by two bits, in any combination
57
+ of top bits of (a,b,c), or in any combination of bottom bits of
58
+ (a,b,c).
59
+ * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
60
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
61
+ is commonly produced by subtraction) look like a single 1-bit
62
+ difference.
63
+ * the base values were pseudorandom, all zero but one bit set, or
64
+ all zero plus a counter that starts at zero.
65
+
66
+ Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
67
+ satisfy this are
68
+ 4 6 8 16 19 4
69
+ 9 15 3 18 27 15
70
+ 14 9 3 7 17 3
71
+ Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
72
+ for "differ" defined as + with a one-bit base and a two-bit delta. I
73
+ used http://burtleburtle.net/bob/hash/avalanche.html to choose
74
+ the operations, constants, and arrangements of the variables.
75
+
76
+ This does not achieve avalanche. There are input bits of (a,b,c)
77
+ that fail to affect some output bits of (a,b,c), especially of a. The
78
+ most thoroughly mixed value is c, but it doesn't really even achieve
79
+ avalanche in c.
80
+
81
+ This allows some parallelism. Read-after-writes are good at doubling
82
+ the number of bits affected, so the goal of mixing pulls in the opposite
83
+ direction as the goal of parallelism. I did what I could. Rotates
84
+ seem to cost as much as shifts on every machine I could lay my hands
85
+ on, and rotates are much kinder to the top and bottom bits, so I used
86
+ rotates.
87
+ -------------------------------------------------------------------------------
88
+ */
89
// Mixes the three 32-bit lanes a, b, c in place. Six rounds, each of the
// form "x -= z; x ^= rot(z, k); z += y", with rotation amounts
// 4, 6, 8, 16, 19, 4. Statement order is significant.
#define mix(a,b,c) \
{ \
  a -= c; \
  a ^= rot(c, 4); \
  c += b; \
  \
  b -= a; \
  b ^= rot(a, 6); \
  a += c; \
  \
  c -= b; \
  c ^= rot(b, 8); \
  b += a; \
  \
  a -= c; \
  a ^= rot(c,16); \
  c += b; \
  \
  b -= a; \
  b ^= rot(a,19); \
  a += c; \
  \
  c -= b; \
  c ^= rot(b, 4); \
  b += a; \
}
98
+
99
+ /*
100
+ -------------------------------------------------------------------------------
101
+ final -- final mixing of 3 32-bit values (a,b,c) into c
102
+
103
+ Pairs of (a,b,c) values differing in only a few bits will usually
104
+ produce values of c that look totally different. This was tested for
105
+ * pairs that differed by one bit, by two bits, in any combination
106
+ of top bits of (a,b,c), or in any combination of bottom bits of
107
+ (a,b,c).
108
+ * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
109
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
110
+ is commonly produced by subtraction) look like a single 1-bit
111
+ difference.
112
+ * the base values were pseudorandom, all zero but one bit set, or
113
+ all zero plus a counter that starts at zero.
114
+
115
+ These constants passed:
116
+ 14 11 25 16 4 14 24
117
+ 12 14 25 16 4 14 24
118
+ and these came close:
119
+ 4 8 15 26 3 22 24
120
+ 10 8 15 26 3 22 24
121
+ 11 8 15 26 3 22 24
122
+ -------------------------------------------------------------------------------
123
+ */
124
// Final mixing of the three 32-bit lanes a, b, c; the result of interest
// ends up in c. Seven rounds of "x ^= y; x -= rot(y, k)" with rotation
// amounts 14, 11, 25, 16, 4, 14, 24. Statement order is significant.
#define final(a,b,c) \
{ \
  c ^= b; \
  c -= rot(b,14); \
  \
  a ^= c; \
  a -= rot(c,11); \
  \
  b ^= a; \
  b -= rot(a,25); \
  \
  c ^= b; \
  c -= rot(b,16); \
  \
  a ^= c; \
  a -= rot(c,4); \
  \
  b ^= a; \
  b -= rot(a,14); \
  \
  c ^= b; \
  c -= rot(b,24); \
}
134
+
135
+ namespace re2 {
136
+
137
+ /*
138
+ --------------------------------------------------------------------
139
+ This works on all machines. To be useful, it requires
140
+ -- that the key be an array of uint32_t's, and
141
+ -- that the length be the number of uint32_t's in the key
142
+
143
+ The function hashword() is identical to hashlittle() on little-endian
144
+ machines, and identical to hashbig() on big-endian machines,
145
+ except that the length has to be measured in uint32_ts rather than in
146
+ bytes. hashlittle() is more complicated than hashword() only because
147
+ hashlittle() has to dance around fitting the key bytes into registers.
148
+ --------------------------------------------------------------------
149
+ */
150
+ uint32 hashword(
151
+ const uint32 *k, /* the key, an array of uint32_t values */
152
+ size_t length, /* the length of the key, in uint32_ts */
153
+ uint32 initval) /* the previous hash, or an arbitrary value */
154
+ {
155
+ uint32_t a,b,c;
156
+
157
+ /* Set up the internal state */
158
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
159
+
160
+ /*------------------------------------------------- handle most of the key */
161
+ while (length > 3)
162
+ {
163
+ a += k[0];
164
+ b += k[1];
165
+ c += k[2];
166
+ mix(a,b,c);
167
+ length -= 3;
168
+ k += 3;
169
+ }
170
+
171
+ /*------------------------------------------- handle the last 3 uint32_t's */
172
+ switch(length) /* all the case statements fall through */
173
+ {
174
+ case 3 : c+=k[2];
175
+ case 2 : b+=k[1];
176
+ case 1 : a+=k[0];
177
+ final(a,b,c);
178
+ case 0: /* case 0: nothing left to add */
179
+ break;
180
+ }
181
+ /*------------------------------------------------------ report the result */
182
+ return c;
183
+ }
184
+
185
+
186
+ /*
187
+ --------------------------------------------------------------------
188
+ hashword2() -- same as hashword(), but take two seeds and return two
189
+ 32-bit values. pc and pb must both be nonnull, and *pc and *pb must
190
+ both be initialized with seeds. If you pass in (*pb)==0, the output
191
+ (*pc) will be the same as the return value from hashword().
192
+ --------------------------------------------------------------------
193
+ */
194
+ void hashword2 (
195
+ const uint32 *k, /* the key, an array of uint32_t values */
196
+ size_t length, /* the length of the key, in uint32_ts */
197
+ uint32 *pc, /* IN: seed OUT: primary hash value */
198
+ uint32 *pb) /* IN: more seed OUT: secondary hash value */
199
+ {
200
+ uint32_t a,b,c;
201
+
202
+ /* Set up the internal state */
203
+ a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
204
+ c += *pb;
205
+
206
+ /*------------------------------------------------- handle most of the key */
207
+ while (length > 3)
208
+ {
209
+ a += k[0];
210
+ b += k[1];
211
+ c += k[2];
212
+ mix(a,b,c);
213
+ length -= 3;
214
+ k += 3;
215
+ }
216
+
217
+ /*------------------------------------------- handle the last 3 uint32_t's */
218
+ switch(length) /* all the case statements fall through */
219
+ {
220
+ case 3 : c+=k[2];
221
+ case 2 : b+=k[1];
222
+ case 1 : a+=k[0];
223
+ final(a,b,c);
224
+ case 0: /* case 0: nothing left to add */
225
+ break;
226
+ }
227
+ /*------------------------------------------------------ report the result */
228
+ *pc=c; *pb=b;
229
+ }
230
+
231
+ } // namespace re2
@@ -0,0 +1,185 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Determine whether this library should match PCRE exactly
6
+ // for a particular Regexp. (If so, the testing framework can
7
+ // check that it does.)
8
+ //
9
+ // This library matches PCRE except in these cases:
10
+ // * the regexp contains a repetition of an empty string,
11
+ // like (a*)* or (a*)+. In this case, PCRE will treat
12
+ // the repetition sequence as ending with an empty string,
13
+ // while this library does not.
14
+ // * Perl and PCRE differ on whether \v matches \n.
15
+ // For historical reasons, this library implements the Perl behavior.
16
+ // * Perl and PCRE allow $ in one-line mode to match either the very
17
+ // end of the text or just before a \n at the end of the text.
18
+ // This library requires it to match only the end of the text.
19
+ // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20
+ // match the end of the text if the last character is a \n.
21
+ // This library does allow it.
22
+ //
23
+ // Regexp::MimicsPCRE checks for any of these conditions.
24
+
25
+ #include "util/util.h"
26
+ #include "re2/regexp.h"
27
+ #include "re2/walker-inl.h"
28
+
29
+ namespace re2 {
30
+
31
+ // Returns whether re might match an empty string.
32
+ static bool CanBeEmptyString(Regexp *re);
33
+
34
+ // Walker class to compute whether library handles a regexp
35
+ // exactly as PCRE would. See comment at top for conditions.
36
+
37
+ class PCREWalker : public Regexp::Walker<bool> {
38
+ public:
39
+ PCREWalker() {}
40
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
41
+ int nchild_args);
42
+
43
+ bool ShortVisit(Regexp* re, bool a) {
44
+ // Should never be called: we use Walk not WalkExponential.
45
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
46
+ return a;
47
+ }
48
+ };
49
+
50
+ // Called after visiting each of re's children and accumulating
51
+ // the return values in child_args. So child_args contains whether
52
+ // this library mimics PCRE for those subexpressions.
53
+ bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
54
+ bool* child_args, int nchild_args) {
55
+ // If children failed, so do we.
56
+ for (int i = 0; i < nchild_args; i++)
57
+ if (!child_args[i])
58
+ return false;
59
+
60
+ // Otherwise look for other reasons to fail.
61
+ switch (re->op()) {
62
+ // Look for repeated empty string.
63
+ case kRegexpStar:
64
+ case kRegexpPlus:
65
+ case kRegexpQuest:
66
+ if (CanBeEmptyString(re->sub()[0]))
67
+ return false;
68
+ break;
69
+ case kRegexpRepeat:
70
+ if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
71
+ return false;
72
+ break;
73
+
74
+ // Look for \v
75
+ case kRegexpLiteral:
76
+ if (re->rune() == '\v')
77
+ return false;
78
+ break;
79
+
80
+ // Look for $ in single-line mode.
81
+ case kRegexpEndText:
82
+ case kRegexpEmptyMatch:
83
+ if (re->parse_flags() & Regexp::WasDollar)
84
+ return false;
85
+ break;
86
+
87
+ // Look for ^ in multi-line mode.
88
+ case kRegexpBeginLine:
89
+ // No condition: in single-line mode ^ becomes kRegexpBeginText.
90
+ return false;
91
+
92
+ default:
93
+ break;
94
+ }
95
+
96
+ // Not proven guilty.
97
+ return true;
98
+ }
99
+
100
+ // Returns whether this regexp's behavior will mimic PCRE's exactly.
101
+ bool Regexp::MimicsPCRE() {
102
+ PCREWalker w;
103
+ return w.Walk(this, true);
104
+ }
105
+
106
+
107
+ // Walker class to compute whether a Regexp can match an empty string.
108
+ // It is okay to overestimate. For example, \b\B cannot match an empty
109
+ // string, because \b and \B are mutually exclusive, but this isn't
110
+ // that smart and will say it can. Spurious empty strings
111
+ // will reduce the number of regexps we sanity check against PCRE,
112
+ // but they won't break anything.
113
+
114
+ class EmptyStringWalker : public Regexp::Walker<bool> {
115
+ public:
116
+ EmptyStringWalker() { }
117
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
118
+ bool* child_args, int nchild_args);
119
+
120
+ bool ShortVisit(Regexp* re, bool a) {
121
+ // Should never be called: we use Walk not WalkExponential.
122
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
123
+ return a;
124
+ }
125
+
126
+ private:
127
+ DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
128
+ };
129
+
130
+ // Called after visiting re's children. child_args contains the return
131
+ // value from each of the children's PostVisits (i.e., whether each child
132
+ // can match an empty string). Returns whether this clause can match an
133
+ // empty string.
134
+ bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
135
+ bool* child_args, int nchild_args) {
136
+ switch (re->op()) {
137
+ case kRegexpNoMatch: // never empty
138
+ case kRegexpLiteral:
139
+ case kRegexpAnyChar:
140
+ case kRegexpAnyByte:
141
+ case kRegexpCharClass:
142
+ case kRegexpLiteralString:
143
+ return false;
144
+
145
+ case kRegexpEmptyMatch: // always empty
146
+ case kRegexpBeginLine: // always empty, when they match
147
+ case kRegexpEndLine:
148
+ case kRegexpNoWordBoundary:
149
+ case kRegexpWordBoundary:
150
+ case kRegexpBeginText:
151
+ case kRegexpEndText:
152
+ case kRegexpStar: // can always be empty
153
+ case kRegexpQuest:
154
+ case kRegexpHaveMatch:
155
+ return true;
156
+
157
+ case kRegexpConcat: // can be empty if all children can
158
+ for (int i = 0; i < nchild_args; i++)
159
+ if (!child_args[i])
160
+ return false;
161
+ return true;
162
+
163
+ case kRegexpAlternate: // can be empty if any child can
164
+ for (int i = 0; i < nchild_args; i++)
165
+ if (child_args[i])
166
+ return true;
167
+ return false;
168
+
169
+ case kRegexpPlus: // can be empty if the child can
170
+ case kRegexpCapture:
171
+ return child_args[0];
172
+
173
+ case kRegexpRepeat: // can be empty if child can or is x{0}
174
+ return child_args[0] || re->min() == 0;
175
+ }
176
+ return false;
177
+ }
178
+
179
+ // Returns whether re can match an empty string.
180
+ static bool CanBeEmptyString(Regexp* re) {
181
+ EmptyStringWalker w;
182
+ return w.Walk(re, true);
183
+ }
184
+
185
+ } // namespace re2