chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "util/util.h"
|
|
7
|
+
#include "re2/filtered_re2.h"
|
|
8
|
+
#include "re2/prefilter.h"
|
|
9
|
+
#include "re2/prefilter_tree.h"
|
|
10
|
+
|
|
11
|
+
namespace re2 {
|
|
12
|
+
|
|
13
|
+
// Builds an empty filter: nothing has been compiled yet and a fresh
// PrefilterTree is allocated (released again in the destructor).
FilteredRE2::FilteredRE2()
    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
|
|
17
|
+
|
|
18
|
+
// Frees every RE2 object stored in re2_vec_ and then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (vector<RE2*>::iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it) {
    delete *it;
  }
  delete prefilter_tree_;
}
|
|
23
|
+
|
|
24
|
+
// Compiles |pattern| with |options| into a new RE2 object and returns that
// object's error code.
//
// When the RE2 object reports ok(), it is appended to re2_vec_ and its index
// is written to *id. Otherwise an error is logged, the object is deleted and
// *id is left untouched.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  // Read the code up front: re may be deleted below.
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    // Log the pattern text rather than the RE2 pointer, so the message
    // identifies which expression failed.
    LOG(ERROR) << "Couldn't compile regular expression, skipping: "
               << re->pattern() << " due to error " << re->error();
    delete re;
  } else {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
  }

  return code;
}
|
|
40
|
+
|
|
41
|
+
// Feeds a prefilter for every added regexp into the prefilter tree, compiles
// the tree and hands the resulting atoms back through |atoms| (which is
// cleared first). If the filter is already compiled, or no regexp has been
// added, only an INFO line is logged and |atoms| is not touched.
void FilteredRE2::Compile(vector<string>* atoms) {
  const bool nothing_to_do = compiled_ || re2_vec_.size() == 0;
  if (nothing_to_do) {
    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
    return;
  }

  for (size_t idx = 0; idx < re2_vec_.size(); ++idx) {
    prefilter_tree_->Add(Prefilter::FromRE2(re2_vec_[idx]));
  }

  atoms->clear();
  prefilter_tree_->Compile(atoms);
  compiled_ = true;
}
|
|
55
|
+
|
|
56
|
+
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
|
57
|
+
for (int i = 0; i < re2_vec_.size(); i++)
|
|
58
|
+
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
|
59
|
+
return i;
|
|
60
|
+
return -1;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
int FilteredRE2::FirstMatch(const StringPiece& text,
|
|
64
|
+
const vector<int>& atoms) const {
|
|
65
|
+
if (!compiled_) {
|
|
66
|
+
LOG(DFATAL) << "FirstMatch called before Compile";
|
|
67
|
+
return -1;
|
|
68
|
+
}
|
|
69
|
+
vector<int> regexps;
|
|
70
|
+
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
|
71
|
+
for (int i = 0; i < regexps.size(); i++)
|
|
72
|
+
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
|
73
|
+
return regexps[i];
|
|
74
|
+
return -1;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
bool FilteredRE2::AllMatches(
|
|
78
|
+
const StringPiece& text,
|
|
79
|
+
const vector<int>& atoms,
|
|
80
|
+
vector<int>* matching_regexps) const {
|
|
81
|
+
matching_regexps->clear();
|
|
82
|
+
vector<int> regexps;
|
|
83
|
+
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
|
84
|
+
for (int i = 0; i < regexps.size(); i++)
|
|
85
|
+
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
|
86
|
+
matching_regexps->push_back(regexps[i]);
|
|
87
|
+
return !matching_regexps->empty();
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
|
|
91
|
+
vector<int>* passed_regexps) {
|
|
92
|
+
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
void FilteredRE2::PrintPrefilter(int regexpid) {
|
|
97
|
+
prefilter_tree_->PrintPrefilter(regexpid);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
} // namespace re2
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
|
6
|
+
// It provides a prefilter mechanism that helps in cutting down the
|
|
7
|
+
// number of regexps that need to be actually searched.
|
|
8
|
+
//
|
|
9
|
+
// By design, it does not include a string matching engine. This is to
|
|
10
|
+
// allow the user of the class to use their favorite string match
|
|
11
|
+
// engine. The overall flow is: Add all the regexps using Add, then
|
|
12
|
+
// Compile the FilteredRE2. The compile returns strings that need to
|
|
13
|
+
// be matched. Note that all returned strings are lowercase. For
|
|
14
|
+
// applying regexps to a search text, the caller does the string
|
|
15
|
+
// matching using the strings returned. When doing the string match,
|
|
16
|
+
// note that the caller has to do that on lower cased version of the
|
|
17
|
+
// search text. Then call FirstMatch or AllMatches with a vector of
|
|
18
|
+
// indices of strings that were found in the text to get the actual
|
|
19
|
+
// regexp matches.
|
|
20
|
+
|
|
21
|
+
#ifndef RE2_FILTERED_RE2_H_
|
|
22
|
+
#define RE2_FILTERED_RE2_H_
|
|
23
|
+
|
|
24
|
+
#include <vector>
|
|
25
|
+
#include "re2/re2.h"
|
|
26
|
+
|
|
27
|
+
namespace re2 {
|
|
28
|
+
using std::vector;
|
|
29
|
+
|
|
30
|
+
class PrefilterTree;
|
|
31
|
+
|
|
32
|
+
class FilteredRE2 {
|
|
33
|
+
public:
|
|
34
|
+
FilteredRE2();
|
|
35
|
+
~FilteredRE2();
|
|
36
|
+
|
|
37
|
+
// Uses RE2 constructor to create a RE2 object (re). Returns
|
|
38
|
+
// re->error_code(). If error_code is other than NoError, then re is
|
|
39
|
+
// deleted and not added to re2_vec_.
|
|
40
|
+
RE2::ErrorCode Add(const StringPiece& pattern,
|
|
41
|
+
const RE2::Options& options,
|
|
42
|
+
int *id);
|
|
43
|
+
|
|
44
|
+
// Prepares the regexps added by Add for filtering. Returns a set
|
|
45
|
+
// of strings that the caller should check for in candidate texts.
|
|
46
|
+
// The returned strings are lowercased. When doing string matching,
|
|
47
|
+
// the search text should be lowercased first to find matching
|
|
48
|
+
// strings from the set of strings returned by Compile. Call after
|
|
49
|
+
// all Add calls are done.
|
|
50
|
+
void Compile(vector<string>* strings_to_match);
|
|
51
|
+
|
|
52
|
+
// Returns the index of the first matching regexp.
|
|
53
|
+
// Returns -1 on no match. Can be called prior to Compile.
|
|
54
|
+
// Does not do any filtering: simply tries to Match the
|
|
55
|
+
// regexps in a loop.
|
|
56
|
+
int SlowFirstMatch(const StringPiece& text) const;
|
|
57
|
+
|
|
58
|
+
// Returns the index of the first matching regexp.
|
|
59
|
+
// Returns -1 on no match. Compile has to be called before
|
|
60
|
+
// calling this.
|
|
61
|
+
int FirstMatch(const StringPiece& text,
|
|
62
|
+
const vector<int>& atoms) const;
|
|
63
|
+
|
|
64
|
+
// Returns the indices of all matching regexps, after first clearing
|
|
65
|
+
// matched_regexps.
|
|
66
|
+
bool AllMatches(const StringPiece& text,
|
|
67
|
+
const vector<int>& atoms,
|
|
68
|
+
vector<int>* matching_regexps) const;
|
|
69
|
+
|
|
70
|
+
// The number of regexps added.
|
|
71
|
+
int NumRegexps() const { return re2_vec_.size(); }
|
|
72
|
+
|
|
73
|
+
private:
|
|
74
|
+
|
|
75
|
+
// Get the individual RE2 objects. Useful for testing.
|
|
76
|
+
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
|
|
77
|
+
|
|
78
|
+
// Print prefilter.
|
|
79
|
+
void PrintPrefilter(int regexpid);
|
|
80
|
+
|
|
81
|
+
// Useful for testing and debugging.
|
|
82
|
+
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
|
83
|
+
vector<int>* passed_regexps);
|
|
84
|
+
|
|
85
|
+
// All the regexps in the FilteredRE2.
|
|
86
|
+
vector<RE2*> re2_vec_;
|
|
87
|
+
|
|
88
|
+
// Has the FilteredRE2 been compiled using Compile()
|
|
89
|
+
bool compiled_;
|
|
90
|
+
|
|
91
|
+
// An AND-OR tree of string atoms used for filtering regexps.
|
|
92
|
+
PrefilterTree* prefilter_tree_;
|
|
93
|
+
|
|
94
|
+
DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
} // namespace re2
|
|
98
|
+
|
|
99
|
+
#endif // RE2_FILTERED_RE2_H_
|
data/ext/re2/hash.cc
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
// Modified by Russ Cox to add "namespace re2".
|
|
2
|
+
// Also threw away all but hashword and hashword2.
|
|
3
|
+
// http://burtleburtle.net/bob/c/lookup3.c
|
|
4
|
+
|
|
5
|
+
/*
|
|
6
|
+
-------------------------------------------------------------------------------
|
|
7
|
+
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
|
|
8
|
+
|
|
9
|
+
These are functions for producing 32-bit hashes for hash table lookup.
|
|
10
|
+
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
|
|
11
|
+
are externally useful functions. Routines to test the hash are included
|
|
12
|
+
if SELF_TEST is defined. You can use this free for any purpose. It's in
|
|
13
|
+
the public domain. It has no warranty.
|
|
14
|
+
|
|
15
|
+
You probably want to use hashlittle(). hashlittle() and hashbig()
|
|
16
|
+
hash byte arrays. hashlittle() is faster than hashbig() on
|
|
17
|
+
little-endian machines. Intel and AMD are little-endian machines.
|
|
18
|
+
On second thought, you probably want hashlittle2(), which is identical to
|
|
19
|
+
hashlittle() except it returns two 32-bit hashes for the price of one.
|
|
20
|
+
You could implement hashbig2() if you wanted but I haven't bothered here.
|
|
21
|
+
|
|
22
|
+
If you want to find a hash of, say, exactly 7 integers, do
|
|
23
|
+
a = i1; b = i2; c = i3;
|
|
24
|
+
mix(a,b,c);
|
|
25
|
+
a += i4; b += i5; c += i6;
|
|
26
|
+
mix(a,b,c);
|
|
27
|
+
a += i7;
|
|
28
|
+
final(a,b,c);
|
|
29
|
+
then use c as the hash value. If you have a variable length array of
|
|
30
|
+
4-byte integers to hash, use hashword(). If you have a byte array (like
|
|
31
|
+
a character string), use hashlittle(). If you have several byte arrays, or
|
|
32
|
+
a mix of things, see the comments above hashlittle().
|
|
33
|
+
|
|
34
|
+
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
|
|
35
|
+
then mix those integers. This is fast (you can do a lot more thorough
|
|
36
|
+
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
|
|
37
|
+
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
|
|
38
|
+
-------------------------------------------------------------------------------
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
#include "util/util.h"
|
|
42
|
+
|
|
43
|
+
/* rot(x,k): rotate x left by k bits, written for a 32-bit unsigned x
   (the right shift uses 32-k). Every use in this file passes a constant
   k between 4 and 27. */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
|
|
44
|
+
|
|
45
|
+
/*
|
|
46
|
+
-------------------------------------------------------------------------------
|
|
47
|
+
mix -- mix 3 32-bit values reversibly.
|
|
48
|
+
|
|
49
|
+
This is reversible, so any information in (a,b,c) before mix() is
|
|
50
|
+
still in (a,b,c) after mix().
|
|
51
|
+
|
|
52
|
+
If four pairs of (a,b,c) inputs are run through mix(), or through
|
|
53
|
+
mix() in reverse, there are at least 32 bits of the output that
|
|
54
|
+
are sometimes the same for one pair and different for another pair.
|
|
55
|
+
This was tested for:
|
|
56
|
+
* pairs that differed by one bit, by two bits, in any combination
|
|
57
|
+
of top bits of (a,b,c), or in any combination of bottom bits of
|
|
58
|
+
(a,b,c).
|
|
59
|
+
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
|
60
|
+
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
|
61
|
+
is commonly produced by subtraction) look like a single 1-bit
|
|
62
|
+
difference.
|
|
63
|
+
* the base values were pseudorandom, all zero but one bit set, or
|
|
64
|
+
all zero plus a counter that starts at zero.
|
|
65
|
+
|
|
66
|
+
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
|
|
67
|
+
satisfy this are
|
|
68
|
+
4 6 8 16 19 4
|
|
69
|
+
9 15 3 18 27 15
|
|
70
|
+
14 9 3 7 17 3
|
|
71
|
+
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
|
|
72
|
+
for "differ" defined as + with a one-bit base and a two-bit delta. I
|
|
73
|
+
used http://burtleburtle.net/bob/hash/avalanche.html to choose
|
|
74
|
+
the operations, constants, and arrangements of the variables.
|
|
75
|
+
|
|
76
|
+
This does not achieve avalanche. There are input bits of (a,b,c)
|
|
77
|
+
that fail to affect some output bits of (a,b,c), especially of a. The
|
|
78
|
+
most thoroughly mixed value is c, but it doesn't really even achieve
|
|
79
|
+
avalanche in c.
|
|
80
|
+
|
|
81
|
+
This allows some parallelism. Read-after-writes are good at doubling
|
|
82
|
+
the number of bits affected, so the goal of mixing pulls in the opposite
|
|
83
|
+
direction as the goal of parallelism. I did what I could. Rotates
|
|
84
|
+
seem to cost as much as shifts on every machine I could lay my hands
|
|
85
|
+
on, and rotates are much kinder to the top and bottom bits, so I used
|
|
86
|
+
rotates.
|
|
87
|
+
-------------------------------------------------------------------------------
|
|
88
|
+
*/
|
|
89
|
+
/* mix(a,b,c): mixes the three 32-bit lvalues a, b and c in place, using
   rot() with the constants 4 6 8 16 19 4. The body is wrapped in
   do { } while (0) so that "mix(a,b,c);" expands to exactly one statement
   and stays valid inside an unbraced if/else. */
#define mix(a,b,c) \
do { \
  a -= c;  a ^= rot(c, 4);  c += b; \
  b -= a;  b ^= rot(a, 6);  a += c; \
  c -= b;  c ^= rot(b, 8);  b += a; \
  a -= c;  a ^= rot(c,16);  c += b; \
  b -= a;  b ^= rot(a,19);  a += c; \
  c -= b;  c ^= rot(b, 4);  b += a; \
} while (0)
|
|
98
|
+
|
|
99
|
+
/*
|
|
100
|
+
-------------------------------------------------------------------------------
|
|
101
|
+
final -- final mixing of 3 32-bit values (a,b,c) into c
|
|
102
|
+
|
|
103
|
+
Pairs of (a,b,c) values differing in only a few bits will usually
|
|
104
|
+
produce values of c that look totally different. This was tested for
|
|
105
|
+
* pairs that differed by one bit, by two bits, in any combination
|
|
106
|
+
of top bits of (a,b,c), or in any combination of bottom bits of
|
|
107
|
+
(a,b,c).
|
|
108
|
+
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
|
109
|
+
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
|
110
|
+
is commonly produced by subtraction) look like a single 1-bit
|
|
111
|
+
difference.
|
|
112
|
+
* the base values were pseudorandom, all zero but one bit set, or
|
|
113
|
+
all zero plus a counter that starts at zero.
|
|
114
|
+
|
|
115
|
+
These constants passed:
|
|
116
|
+
14 11 25 16 4 14 24
|
|
117
|
+
12 14 25 16 4 14 24
|
|
118
|
+
and these came close:
|
|
119
|
+
4 8 15 26 3 22 24
|
|
120
|
+
10 8 15 26 3 22 24
|
|
121
|
+
11 8 15 26 3 22 24
|
|
122
|
+
-------------------------------------------------------------------------------
|
|
123
|
+
*/
|
|
124
|
+
/* final(a,b,c): final mixing of the three 32-bit lvalues a, b and c in
   place, using rot() with the constants 14 11 25 16 4 14 24; callers in
   this file read the result from c (and b). The body is wrapped in
   do { } while (0) so that "final(a,b,c);" expands to exactly one statement
   and stays valid inside an unbraced if/else. */
#define final(a,b,c) \
do { \
  c ^= b; c -= rot(b,14); \
  a ^= c; a -= rot(c,11); \
  b ^= a; b -= rot(a,25); \
  c ^= b; c -= rot(b,16); \
  a ^= c; a -= rot(c,4);  \
  b ^= a; b -= rot(a,14); \
  c ^= b; c -= rot(b,24); \
} while (0)
|
|
134
|
+
|
|
135
|
+
namespace re2 {
|
|
136
|
+
|
|
137
|
+
/*
|
|
138
|
+
--------------------------------------------------------------------
|
|
139
|
+
This works on all machines. To be useful, it requires
|
|
140
|
+
-- that the key be an array of uint32_t's, and
|
|
141
|
+
-- that the length be the number of uint32_t's in the key
|
|
142
|
+
|
|
143
|
+
The function hashword() is identical to hashlittle() on little-endian
|
|
144
|
+
machines, and identical to hashbig() on big-endian machines,
|
|
145
|
+
except that the length has to be measured in uint32_ts rather than in
|
|
146
|
+
bytes. hashlittle() is more complicated than hashword() only because
|
|
147
|
+
hashlittle() has to dance around fitting the key bytes into registers.
|
|
148
|
+
--------------------------------------------------------------------
|
|
149
|
+
*/
|
|
150
|
+
uint32 hashword(
const uint32 *k,                   /* the key, an array of uint32_t values */
size_t        length,               /* the length of the key, in uint32_ts */
uint32        initval)         /* the previous hash, or an arbitrary value */
{
  /* Internal state: all three words start from the same seed value. */
  uint32_t a, b, c;
  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;

  /* Consume the key three words at a time while more than three remain. */
  for (; length > 3; length -= 3, k += 3) {
    a += k[0];
    b += k[1];
    c += k[2];
    mix(a,b,c);
  }

  /* At most three words are left. Add them highest index first and run
     the final mix only when at least one word remained. */
  if (length == 3) {
    c += k[2];
  }
  if (length >= 2) {
    b += k[1];
  }
  if (length >= 1) {
    a += k[0];
    final(a,b,c);
  }

  /* c carries the result. */
  return c;
}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
/*
|
|
187
|
+
--------------------------------------------------------------------
|
|
188
|
+
hashword2() -- same as hashword(), but take two seeds and return two
|
|
189
|
+
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
|
|
190
|
+
both be initialized with seeds. If you pass in (*pb)==0, the output
|
|
191
|
+
(*pc) will be the same as the return value from hashword().
|
|
192
|
+
--------------------------------------------------------------------
|
|
193
|
+
*/
|
|
194
|
+
void hashword2 (
|
|
195
|
+
const uint32 *k, /* the key, an array of uint32_t values */
|
|
196
|
+
size_t length, /* the length of the key, in uint32_ts */
|
|
197
|
+
uint32 *pc, /* IN: seed OUT: primary hash value */
|
|
198
|
+
uint32 *pb) /* IN: more seed OUT: secondary hash value */
|
|
199
|
+
{
|
|
200
|
+
uint32_t a,b,c;
|
|
201
|
+
|
|
202
|
+
/* Set up the internal state */
|
|
203
|
+
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
|
|
204
|
+
c += *pb;
|
|
205
|
+
|
|
206
|
+
/*------------------------------------------------- handle most of the key */
|
|
207
|
+
while (length > 3)
|
|
208
|
+
{
|
|
209
|
+
a += k[0];
|
|
210
|
+
b += k[1];
|
|
211
|
+
c += k[2];
|
|
212
|
+
mix(a,b,c);
|
|
213
|
+
length -= 3;
|
|
214
|
+
k += 3;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/*------------------------------------------- handle the last 3 uint32_t's */
|
|
218
|
+
switch(length) /* all the case statements fall through */
|
|
219
|
+
{
|
|
220
|
+
case 3 : c+=k[2];
|
|
221
|
+
case 2 : b+=k[1];
|
|
222
|
+
case 1 : a+=k[0];
|
|
223
|
+
final(a,b,c);
|
|
224
|
+
case 0: /* case 0: nothing left to add */
|
|
225
|
+
break;
|
|
226
|
+
}
|
|
227
|
+
/*------------------------------------------------------ report the result */
|
|
228
|
+
*pc=c; *pb=b;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
} // namespace re2
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Determine whether this library should match PCRE exactly
|
|
6
|
+
// for a particular Regexp. (If so, the testing framework can
|
|
7
|
+
// check that it does.)
|
|
8
|
+
//
|
|
9
|
+
// This library matches PCRE except in these cases:
|
|
10
|
+
// * the regexp contains a repetition of an empty string,
|
|
11
|
+
// like (a*)* or (a*)+. In this case, PCRE will treat
|
|
12
|
+
// the repetition sequence as ending with an empty string,
|
|
13
|
+
// while this library does not.
|
|
14
|
+
// * Perl and PCRE differ on whether \v matches \n.
|
|
15
|
+
// For historical reasons, this library implements the Perl behavior.
|
|
16
|
+
// * Perl and PCRE allow $ in one-line mode to match either the very
|
|
17
|
+
// end of the text or just before a \n at the end of the text.
|
|
18
|
+
// This library requires it to match only the end of the text.
|
|
19
|
+
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
|
20
|
+
// match the end of the text if the last character is a \n.
|
|
21
|
+
// This library does allow it.
|
|
22
|
+
//
|
|
23
|
+
// Regexp::MimicsPCRE checks for any of these conditions.
|
|
24
|
+
|
|
25
|
+
#include "util/util.h"
|
|
26
|
+
#include "re2/regexp.h"
|
|
27
|
+
#include "re2/walker-inl.h"
|
|
28
|
+
|
|
29
|
+
namespace re2 {
|
|
30
|
+
|
|
31
|
+
// Returns whether re might match an empty string.
|
|
32
|
+
static bool CanBeEmptyString(Regexp *re);
|
|
33
|
+
|
|
34
|
+
// Walker class to compute whether library handles a regexp
|
|
35
|
+
// exactly as PCRE would. See comment at top for conditions.
|
|
36
|
+
|
|
37
|
+
// Walker over a Regexp tree that produces one bool per node; PostVisit
// (defined below the class) decides whether the node is handled the same
// way PCRE would handle it.
class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
                 int nchild_args);

  // Logs at DFATAL and returns |a| unchanged.
  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "PCREWalker::ShortVisit called";
    return a;
  }
};
|
|
49
|
+
|
|
50
|
+
// Called after visiting each of re's children and accumulating
|
|
51
|
+
// the return values in child_args. So child_args contains whether
|
|
52
|
+
// this library mimics PCRE for those subexpressions.
|
|
53
|
+
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  // One failing child is enough to fail the whole expression.
  for (int child = 0; child < nchild_args; ++child) {
    if (!child_args[child]) {
      return false;
    }
  }

  // Children are fine; now judge this node on its own. The verdict
  // stays true unless one of the cases below overturns it.
  bool mimics = true;
  switch (re->op()) {
    // Repetition of something that may be the empty string.
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      mimics = !CanBeEmptyString(re->sub()[0]);
      break;

    // Same, but only for repeats whose max() is -1.
    case kRegexpRepeat:
      mimics = re->max() != -1 || !CanBeEmptyString(re->sub()[0]);
      break;

    // \v
    case kRegexpLiteral:
      mimics = re->rune() != '\v';
      break;

    // $ in single-line mode.
    case kRegexpEndText:
    case kRegexpEmptyMatch:
      mimics = !(re->parse_flags() & Regexp::WasDollar);
      break;

    // ^ in multi-line mode. No condition needed: in single-line mode
    // ^ becomes kRegexpBeginText.
    case kRegexpBeginLine:
      mimics = false;
      break;

    default:
      break;
  }

  return mimics;
}
|
|
99
|
+
|
|
100
|
+
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
|
101
|
+
bool Regexp::MimicsPCRE() {
|
|
102
|
+
PCREWalker w;
|
|
103
|
+
return w.Walk(this, true);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
// Walker class to compute whether a Regexp can match an empty string.
|
|
108
|
+
// It is okay to overestimate. For example, \b\B cannot match an empty
|
|
109
|
+
// string, because \b and \B are mutually exclusive, but this isn't
|
|
110
|
+
// that smart and will say it can. Spurious empty strings
|
|
111
|
+
// will reduce the number of regexps we sanity check against PCRE,
|
|
112
|
+
// but they won't break anything.
|
|
113
|
+
|
|
114
|
+
// Walker over a Regexp tree that produces one bool per node; PostVisit
// (defined below the class) reports whether the node can match an
// empty string.
class EmptyStringWalker : public Regexp::Walker<bool> {
 public:
  EmptyStringWalker() {}

  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  // Not expected to run, because callers use Walk rather than
  // WalkExponential. Logs at DFATAL and hands |arg| straight back.
  bool ShortVisit(Regexp* re, bool arg) {
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    return arg;
  }

 private:
  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
};
|
|
129
|
+
|
|
130
|
+
// Called after visiting re's children. child_args contains the return
|
|
131
|
+
// value from each of the children's PostVisits (i.e., whether each child
|
|
132
|
+
// can match an empty string). Returns whether this clause can match an
|
|
133
|
+
// empty string.
|
|
134
|
+
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  // Ops not listed below keep the default answer: false.
  bool can_be_empty = false;

  switch (re->op()) {
    // Never empty.
    case kRegexpNoMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      can_be_empty = false;
      break;

    // Always empty (the assertions are empty whenever they match),
    // or can always be empty (star, quest, have-match).
    case kRegexpEmptyMatch:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpStar:
    case kRegexpQuest:
    case kRegexpHaveMatch:
      can_be_empty = true;
      break;

    // Concatenation: empty only if every child can be empty.
    case kRegexpConcat:
      can_be_empty = true;
      for (int child = 0; child < nchild_args; ++child) {
        if (!child_args[child]) {
          can_be_empty = false;
          break;
        }
      }
      break;

    // Alternation: empty as soon as any child can be empty.
    case kRegexpAlternate:
      can_be_empty = false;
      for (int child = 0; child < nchild_args; ++child) {
        if (child_args[child]) {
          can_be_empty = true;
          break;
        }
      }
      break;

    // Plus and capture simply inherit the child's answer.
    case kRegexpPlus:
    case kRegexpCapture:
      can_be_empty = child_args[0];
      break;

    // Repeat: empty if the child can be, or if min() is 0.
    case kRegexpRepeat:
      can_be_empty = child_args[0] || re->min() == 0;
      break;
  }

  return can_be_empty;
}
|
|
178
|
+
|
|
179
|
+
// Returns whether re can match an empty string.
|
|
180
|
+
// Runs an EmptyStringWalker over |re|, starting from true, and returns
// the walker's verdict.
static bool CanBeEmptyString(Regexp* re) {
  EmptyStringWalker walker;
  const bool can_be_empty = walker.Walk(re, true);
  return can_be_empty;
}
|
|
184
|
+
|
|
185
|
+
} // namespace re2
|