chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,100 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ #include <string>
6
+ #include "util/util.h"
7
+ #include "re2/filtered_re2.h"
8
+ #include "re2/prefilter.h"
9
+ #include "re2/prefilter_tree.h"
10
+
11
+ namespace re2 {
12
+
13
+ FilteredRE2::FilteredRE2()
14
+ : compiled_(false),
15
+ prefilter_tree_(new PrefilterTree()) {
16
+ }
17
+
18
+ FilteredRE2::~FilteredRE2() {
19
+ for (int i = 0; i < re2_vec_.size(); i++)
20
+ delete re2_vec_[i];
21
+ delete prefilter_tree_;
22
+ }
23
+
24
+ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
25
+ const RE2::Options& options, int* id) {
26
+ RE2* re = new RE2(pattern, options);
27
+ RE2::ErrorCode code = re->error_code();
28
+
29
+ if (!re->ok()) {
30
+ LOG(ERROR) << "Couldn't compile regular expression, skipping: "
31
+ << re << " due to error " << re->error();
32
+ delete re;
33
+ } else {
34
+ *id = re2_vec_.size();
35
+ re2_vec_.push_back(re);
36
+ }
37
+
38
+ return code;
39
+ }
40
+
41
+ void FilteredRE2::Compile(vector<string>* atoms) {
42
+ if (compiled_ || re2_vec_.size() == 0) {
43
+ LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
44
+ return;
45
+ }
46
+
47
+ for (int i = 0; i < re2_vec_.size(); i++) {
48
+ Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
49
+ prefilter_tree_->Add(prefilter);
50
+ }
51
+ atoms->clear();
52
+ prefilter_tree_->Compile(atoms);
53
+ compiled_ = true;
54
+ }
55
+
56
+ int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
57
+ for (int i = 0; i < re2_vec_.size(); i++)
58
+ if (RE2::PartialMatch(text, *re2_vec_[i]))
59
+ return i;
60
+ return -1;
61
+ }
62
+
63
+ int FilteredRE2::FirstMatch(const StringPiece& text,
64
+ const vector<int>& atoms) const {
65
+ if (!compiled_) {
66
+ LOG(DFATAL) << "FirstMatch called before Compile";
67
+ return -1;
68
+ }
69
+ vector<int> regexps;
70
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
71
+ for (int i = 0; i < regexps.size(); i++)
72
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
73
+ return regexps[i];
74
+ return -1;
75
+ }
76
+
77
+ bool FilteredRE2::AllMatches(
78
+ const StringPiece& text,
79
+ const vector<int>& atoms,
80
+ vector<int>* matching_regexps) const {
81
+ matching_regexps->clear();
82
+ vector<int> regexps;
83
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
84
+ for (int i = 0; i < regexps.size(); i++)
85
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
86
+ matching_regexps->push_back(regexps[i]);
87
+ return !matching_regexps->empty();
88
+ }
89
+
90
+ void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
91
+ vector<int>* passed_regexps) {
92
+ prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
93
+ }
94
+
95
+
96
+ void FilteredRE2::PrintPrefilter(int regexpid) {
97
+ prefilter_tree_->PrintPrefilter(regexpid);
98
+ }
99
+
100
+ } // namespace re2
@@ -0,0 +1,99 @@
1
+ // Copyright 2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
6
+ // It provides a prefilter mechanism that helps in cutting down the
7
+ // number of regexps that need to be actually searched.
8
+ //
9
+ // By design, it does not include a string matching engine. This is to
10
+ // allow the user of the class to use their favorite string match
11
+ // engine. The overall flow is: Add all the regexps using Add, then
12
+ // Compile the FilteredRE2. The compile returns strings that need to
13
+ // be matched. Note that all returned strings are lowercase. For
14
+ // applying regexps to a search text, the caller does the string
15
+ // matching using the strings returned. When doing the string match,
16
+ // note that the caller has to do that on lower cased version of the
17
+ // search text. Then call FirstMatch or AllMatches with a vector of
18
+ // indices of strings that were found in the text to get the actual
19
+ // regexp matches.
20
+
21
+ #ifndef RE2_FILTERED_RE2_H_
22
+ #define RE2_FILTERED_RE2_H_
23
+
24
+ #include <vector>
25
+ #include "re2/re2.h"
26
+
27
+ namespace re2 {
28
+ using std::vector;
29
+
30
+ class PrefilterTree;
31
+
32
+ class FilteredRE2 {
33
+ public:
34
+ FilteredRE2();
35
+ ~FilteredRE2();
36
+
37
+ // Uses RE2 constructor to create a RE2 object (re). Returns
38
+ // re->error_code(). If error_code is other than NoError, then re is
39
+ // deleted and not added to re2_vec_.
40
+ RE2::ErrorCode Add(const StringPiece& pattern,
41
+ const RE2::Options& options,
42
+ int *id);
43
+
44
+ // Prepares the regexps added by Add for filtering. Returns a set
45
+ // of strings that the caller should check for in candidate texts.
46
+ // The returned strings are lowercased. When doing string matching,
47
+ // the search text should be lowercased first to find matching
48
+ // strings from the set of strings returned by Compile. Call after
49
+ // all Add calls are done.
50
+ void Compile(vector<string>* strings_to_match);
51
+
52
+ // Returns the index of the first matching regexp.
53
+ // Returns -1 on no match. Can be called prior to Compile.
54
+ // Does not do any filtering: simply tries to Match the
55
+ // regexps in a loop.
56
+ int SlowFirstMatch(const StringPiece& text) const;
57
+
58
+ // Returns the index of the first matching regexp.
59
+ // Returns -1 on no match. Compile has to be called before
60
+ // calling this.
61
+ int FirstMatch(const StringPiece& text,
62
+ const vector<int>& atoms) const;
63
+
64
+ // Returns the indices of all matching regexps, after first clearing
65
+ // matched_regexps.
66
+ bool AllMatches(const StringPiece& text,
67
+ const vector<int>& atoms,
68
+ vector<int>* matching_regexps) const;
69
+
70
+ // The number of regexps added.
71
+ int NumRegexps() const { return re2_vec_.size(); }
72
+
73
+ private:
74
+
75
+ // Get the individual RE2 objects. Useful for testing.
76
+ RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
77
+
78
+ // Print prefilter.
79
+ void PrintPrefilter(int regexpid);
80
+
81
+ // Useful for testing and debugging.
82
+ void RegexpsGivenStrings(const vector<int>& matched_atoms,
83
+ vector<int>* passed_regexps);
84
+
85
+ // All the regexps in the FilteredRE2.
86
+ vector<RE2*> re2_vec_;
87
+
88
+ // Has the FilteredRE2 been compiled using Compile()
89
+ bool compiled_;
90
+
91
+ // An AND-OR tree of string atoms used for filtering regexps.
92
+ PrefilterTree* prefilter_tree_;
93
+
94
+ DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
95
+ };
96
+
97
+ } // namespace re2
98
+
99
+ #endif // RE2_FILTERED_RE2_H_
data/ext/re2/hash.cc ADDED
@@ -0,0 +1,231 @@
1
+ // Modified by Russ Cox to add "namespace re2".
2
+ // Also threw away all but hashword and hashword2.
3
+ // http://burtleburtle.net/bob/c/lookup3.c
4
+
5
+ /*
6
+ -------------------------------------------------------------------------------
7
+ lookup3.c, by Bob Jenkins, May 2006, Public Domain.
8
+
9
+ These are functions for producing 32-bit hashes for hash table lookup.
10
+ hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
11
+ are externally useful functions. Routines to test the hash are included
12
+ if SELF_TEST is defined. You can use this free for any purpose. It's in
13
+ the public domain. It has no warranty.
14
+
15
+ You probably want to use hashlittle(). hashlittle() and hashbig()
16
+ hash byte arrays. hashlittle() is faster than hashbig() on
17
+ little-endian machines. Intel and AMD are little-endian machines.
18
+ On second thought, you probably want hashlittle2(), which is identical to
19
+ hashlittle() except it returns two 32-bit hashes for the price of one.
20
+ You could implement hashbig2() if you wanted but I haven't bothered here.
21
+
22
+ If you want to find a hash of, say, exactly 7 integers, do
23
+ a = i1; b = i2; c = i3;
24
+ mix(a,b,c);
25
+ a += i4; b += i5; c += i6;
26
+ mix(a,b,c);
27
+ a += i7;
28
+ final(a,b,c);
29
+ then use c as the hash value. If you have a variable length array of
30
+ 4-byte integers to hash, use hashword(). If you have a byte array (like
31
+ a character string), use hashlittle(). If you have several byte arrays, or
32
+ a mix of things, see the comments above hashlittle().
33
+
34
+ Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
35
+ then mix those integers. This is fast (you can do a lot more thorough
36
+ mixing with 12*3 instructions on 3 integers than you can with 3 instructions
37
+ on 1 byte), but shoehorning those bytes into integers efficiently is messy.
38
+ -------------------------------------------------------------------------------
39
+ */
40
+
41
+ #include "util/util.h"
42
+
43
// Rotates `word` left by `bits` bit positions, treating it as a 32-bit
// quantity.  `bits` must be in 1..31 so that both shifts are well defined.
#define rot(word, bits) (((word) << (bits)) | ((word) >> (32 - (bits))))
44
+
45
+ /*
46
+ -------------------------------------------------------------------------------
47
+ mix -- mix 3 32-bit values reversibly.
48
+
49
+ This is reversible, so any information in (a,b,c) before mix() is
50
+ still in (a,b,c) after mix().
51
+
52
+ If four pairs of (a,b,c) inputs are run through mix(), or through
53
+ mix() in reverse, there are at least 32 bits of the output that
54
+ are sometimes the same for one pair and different for another pair.
55
+ This was tested for:
56
+ * pairs that differed by one bit, by two bits, in any combination
57
+ of top bits of (a,b,c), or in any combination of bottom bits of
58
+ (a,b,c).
59
+ * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
60
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
61
+ is commonly produced by subtraction) look like a single 1-bit
62
+ difference.
63
+ * the base values were pseudorandom, all zero but one bit set, or
64
+ all zero plus a counter that starts at zero.
65
+
66
+ Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
67
+ satisfy this are
68
+ 4 6 8 16 19 4
69
+ 9 15 3 18 27 15
70
+ 14 9 3 7 17 3
71
+ Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
72
+ for "differ" defined as + with a one-bit base and a two-bit delta. I
73
+ used http://burtleburtle.net/bob/hash/avalanche.html to choose
74
+ the operations, constants, and arrangements of the variables.
75
+
76
+ This does not achieve avalanche. There are input bits of (a,b,c)
77
+ that fail to affect some output bits of (a,b,c), especially of a. The
78
+ most thoroughly mixed value is c, but it doesn't really even achieve
79
+ avalanche in c.
80
+
81
+ This allows some parallelism. Read-after-writes are good at doubling
82
+ the number of bits affected, so the goal of mixing pulls in the opposite
83
+ direction as the goal of parallelism. I did what I could. Rotates
84
+ seem to cost as much as shifts on every machine I could lay my hands
85
+ on, and rotates are much kinder to the top and bottom bits, so I used
86
+ rotates.
87
+ -------------------------------------------------------------------------------
88
+ */
89
// Mixes the three 32-bit lvalues a, b and c in place.  The statement order
// and rotation constants must not be changed.
//
// Wrapped in do { ... } while (0) so that `mix(a,b,c);` expands to exactly
// one statement; a bare `{ ... }` followed by `;` would expand to two and
// break an unbraced if/else around the call.
#define mix(a,b,c) \
  do { \
    a -= c;  a ^= rot(c, 4);  c += b; \
    b -= a;  b ^= rot(a, 6);  a += c; \
    c -= b;  c ^= rot(b, 8);  b += a; \
    a -= c;  a ^= rot(c,16);  c += b; \
    b -= a;  b ^= rot(a,19);  a += c; \
    c -= b;  c ^= rot(b, 4);  b += a; \
  } while (0)
98
+
99
+ /*
100
+ -------------------------------------------------------------------------------
101
+ final -- final mixing of 3 32-bit values (a,b,c) into c
102
+
103
+ Pairs of (a,b,c) values differing in only a few bits will usually
104
+ produce values of c that look totally different. This was tested for
105
+ * pairs that differed by one bit, by two bits, in any combination
106
+ of top bits of (a,b,c), or in any combination of bottom bits of
107
+ (a,b,c).
108
+ * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
109
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
110
+ is commonly produced by subtraction) look like a single 1-bit
111
+ difference.
112
+ * the base values were pseudorandom, all zero but one bit set, or
113
+ all zero plus a counter that starts at zero.
114
+
115
+ These constants passed:
116
+ 14 11 25 16 4 14 24
117
+ 12 14 25 16 4 14 24
118
+ and these came close:
119
+ 4 8 15 26 3 22 24
120
+ 10 8 15 26 3 22 24
121
+ 11 8 15 26 3 22 24
122
+ -------------------------------------------------------------------------------
123
+ */
124
// Final mixing of the three 32-bit lvalues a, b and c; callers in this file
// use c (and, in hashword2, b) as the result.  The statement order and
// rotation constants must not be changed.
//
// Wrapped in do { ... } while (0) so that `final(a,b,c);` expands to exactly
// one statement; a bare `{ ... }` followed by `;` would expand to two and
// break an unbraced if/else around the call.
#define final(a,b,c) \
  do { \
    c ^= b; c -= rot(b,14); \
    a ^= c; a -= rot(c,11); \
    b ^= a; b -= rot(a,25); \
    c ^= b; c -= rot(b,16); \
    a ^= c; a -= rot(c,4);  \
    b ^= a; b -= rot(a,14); \
    c ^= b; c -= rot(b,24); \
  } while (0)
134
+
135
+ namespace re2 {
136
+
137
+ /*
138
+ --------------------------------------------------------------------
139
+ This works on all machines. To be useful, it requires
140
+ -- that the key be an array of uint32_t's, and
141
+ -- that the length be the number of uint32_t's in the key
142
+
143
+ The function hashword() is identical to hashlittle() on little-endian
144
+ machines, and identical to hashbig() on big-endian machines,
145
+ except that the length has to be measured in uint32_ts rather than in
146
+ bytes. hashlittle() is more complicated than hashword() only because
147
+ hashlittle() has to dance around fitting the key bytes into registers.
148
+ --------------------------------------------------------------------
149
+ */
150
+ uint32 hashword(
151
+ const uint32 *k, /* the key, an array of uint32_t values */
152
+ size_t length, /* the length of the key, in uint32_ts */
153
+ uint32 initval) /* the previous hash, or an arbitrary value */
154
+ {
155
+ uint32_t a,b,c;
156
+
157
+ /* Set up the internal state */
158
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
159
+
160
+ /*------------------------------------------------- handle most of the key */
161
+ while (length > 3)
162
+ {
163
+ a += k[0];
164
+ b += k[1];
165
+ c += k[2];
166
+ mix(a,b,c);
167
+ length -= 3;
168
+ k += 3;
169
+ }
170
+
171
+ /*------------------------------------------- handle the last 3 uint32_t's */
172
+ switch(length) /* all the case statements fall through */
173
+ {
174
+ case 3 : c+=k[2];
175
+ case 2 : b+=k[1];
176
+ case 1 : a+=k[0];
177
+ final(a,b,c);
178
+ case 0: /* case 0: nothing left to add */
179
+ break;
180
+ }
181
+ /*------------------------------------------------------ report the result */
182
+ return c;
183
+ }
184
+
185
+
186
+ /*
187
+ --------------------------------------------------------------------
188
+ hashword2() -- same as hashword(), but take two seeds and return two
189
+ 32-bit values. pc and pb must both be nonnull, and *pc and *pb must
190
+ both be initialized with seeds. If you pass in (*pb)==0, the output
191
+ (*pc) will be the same as the return value from hashword().
192
+ --------------------------------------------------------------------
193
+ */
194
+ void hashword2 (
195
+ const uint32 *k, /* the key, an array of uint32_t values */
196
+ size_t length, /* the length of the key, in uint32_ts */
197
+ uint32 *pc, /* IN: seed OUT: primary hash value */
198
+ uint32 *pb) /* IN: more seed OUT: secondary hash value */
199
+ {
200
+ uint32_t a,b,c;
201
+
202
+ /* Set up the internal state */
203
+ a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
204
+ c += *pb;
205
+
206
+ /*------------------------------------------------- handle most of the key */
207
+ while (length > 3)
208
+ {
209
+ a += k[0];
210
+ b += k[1];
211
+ c += k[2];
212
+ mix(a,b,c);
213
+ length -= 3;
214
+ k += 3;
215
+ }
216
+
217
+ /*------------------------------------------- handle the last 3 uint32_t's */
218
+ switch(length) /* all the case statements fall through */
219
+ {
220
+ case 3 : c+=k[2];
221
+ case 2 : b+=k[1];
222
+ case 1 : a+=k[0];
223
+ final(a,b,c);
224
+ case 0: /* case 0: nothing left to add */
225
+ break;
226
+ }
227
+ /*------------------------------------------------------ report the result */
228
+ *pc=c; *pb=b;
229
+ }
230
+
231
+ } // namespace re2
@@ -0,0 +1,185 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Determine whether this library should match PCRE exactly
6
+ // for a particular Regexp. (If so, the testing framework can
7
+ // check that it does.)
8
+ //
9
+ // This library matches PCRE except in these cases:
10
+ // * the regexp contains a repetition of an empty string,
11
+ // like (a*)* or (a*)+. In this case, PCRE will treat
12
+ // the repetition sequence as ending with an empty string,
13
+ // while this library does not.
14
+ // * Perl and PCRE differ on whether \v matches \n.
15
+ // For historical reasons, this library implements the Perl behavior.
16
+ // * Perl and PCRE allow $ in one-line mode to match either the very
17
+ // end of the text or just before a \n at the end of the text.
18
+ // This library requires it to match only the end of the text.
19
+ // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20
+ // match the end of the text if the last character is a \n.
21
+ // This library does allow it.
22
+ //
23
+ // Regexp::MimicsPCRE checks for any of these conditions.
24
+
25
+ #include "util/util.h"
26
+ #include "re2/regexp.h"
27
+ #include "re2/walker-inl.h"
28
+
29
+ namespace re2 {
30
+
31
+ // Returns whether re might match an empty string.
32
+ static bool CanBeEmptyString(Regexp *re);
33
+
34
+ // Walker class to compute whether library handles a regexp
35
+ // exactly as PCRE would. See comment at top for conditions.
36
+
37
+ class PCREWalker : public Regexp::Walker<bool> {
38
+ public:
39
+ PCREWalker() {}
40
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
41
+ int nchild_args);
42
+
43
+ bool ShortVisit(Regexp* re, bool a) {
44
+ // Should never be called: we use Walk not WalkExponential.
45
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
46
+ return a;
47
+ }
48
+ };
49
+
50
+ // Called after visiting each of re's children and accumulating
51
+ // the return values in child_args. So child_args contains whether
52
+ // this library mimics PCRE for those subexpressions.
53
+ bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
54
+ bool* child_args, int nchild_args) {
55
+ // If children failed, so do we.
56
+ for (int i = 0; i < nchild_args; i++)
57
+ if (!child_args[i])
58
+ return false;
59
+
60
+ // Otherwise look for other reasons to fail.
61
+ switch (re->op()) {
62
+ // Look for repeated empty string.
63
+ case kRegexpStar:
64
+ case kRegexpPlus:
65
+ case kRegexpQuest:
66
+ if (CanBeEmptyString(re->sub()[0]))
67
+ return false;
68
+ break;
69
+ case kRegexpRepeat:
70
+ if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
71
+ return false;
72
+ break;
73
+
74
+ // Look for \v
75
+ case kRegexpLiteral:
76
+ if (re->rune() == '\v')
77
+ return false;
78
+ break;
79
+
80
+ // Look for $ in single-line mode.
81
+ case kRegexpEndText:
82
+ case kRegexpEmptyMatch:
83
+ if (re->parse_flags() & Regexp::WasDollar)
84
+ return false;
85
+ break;
86
+
87
+ // Look for ^ in multi-line mode.
88
+ case kRegexpBeginLine:
89
+ // No condition: in single-line mode ^ becomes kRegexpBeginText.
90
+ return false;
91
+
92
+ default:
93
+ break;
94
+ }
95
+
96
+ // Not proven guilty.
97
+ return true;
98
+ }
99
+
100
+ // Returns whether this regexp's behavior will mimic PCRE's exactly.
101
+ bool Regexp::MimicsPCRE() {
102
+ PCREWalker w;
103
+ return w.Walk(this, true);
104
+ }
105
+
106
+
107
+ // Walker class to compute whether a Regexp can match an empty string.
108
+ // It is okay to overestimate. For example, \b\B cannot match an empty
109
+ // string, because \b and \B are mutually exclusive, but this isn't
110
+ // that smart and will say it can. Spurious empty strings
111
+ // will reduce the number of regexps we sanity check against PCRE,
112
+ // but they won't break anything.
113
+
114
+ class EmptyStringWalker : public Regexp::Walker<bool> {
115
+ public:
116
+ EmptyStringWalker() { }
117
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
118
+ bool* child_args, int nchild_args);
119
+
120
+ bool ShortVisit(Regexp* re, bool a) {
121
+ // Should never be called: we use Walk not WalkExponential.
122
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
123
+ return a;
124
+ }
125
+
126
+ private:
127
+ DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
128
+ };
129
+
130
+ // Called after visiting re's children. child_args contains the return
131
+ // value from each of the children's PostVisits (i.e., whether each child
132
+ // can match an empty string). Returns whether this clause can match an
133
+ // empty string.
134
+ bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
135
+ bool* child_args, int nchild_args) {
136
+ switch (re->op()) {
137
+ case kRegexpNoMatch: // never empty
138
+ case kRegexpLiteral:
139
+ case kRegexpAnyChar:
140
+ case kRegexpAnyByte:
141
+ case kRegexpCharClass:
142
+ case kRegexpLiteralString:
143
+ return false;
144
+
145
+ case kRegexpEmptyMatch: // always empty
146
+ case kRegexpBeginLine: // always empty, when they match
147
+ case kRegexpEndLine:
148
+ case kRegexpNoWordBoundary:
149
+ case kRegexpWordBoundary:
150
+ case kRegexpBeginText:
151
+ case kRegexpEndText:
152
+ case kRegexpStar: // can always be empty
153
+ case kRegexpQuest:
154
+ case kRegexpHaveMatch:
155
+ return true;
156
+
157
+ case kRegexpConcat: // can be empty if all children can
158
+ for (int i = 0; i < nchild_args; i++)
159
+ if (!child_args[i])
160
+ return false;
161
+ return true;
162
+
163
+ case kRegexpAlternate: // can be empty if any child can
164
+ for (int i = 0; i < nchild_args; i++)
165
+ if (child_args[i])
166
+ return true;
167
+ return false;
168
+
169
+ case kRegexpPlus: // can be empty if the child can
170
+ case kRegexpCapture:
171
+ return child_args[0];
172
+
173
+ case kRegexpRepeat: // can be empty if child can or is x{0}
174
+ return child_args[0] || re->min() == 0;
175
+ }
176
+ return false;
177
+ }
178
+
179
+ // Returns whether re can match an empty string.
180
+ static bool CanBeEmptyString(Regexp* re) {
181
+ EmptyStringWalker w;
182
+ return w.Walk(re, true);
183
+ }
184
+
185
+ } // namespace re2