RubyGems - chipper - Versions diffs - 0.4.2 - Mend

chipper 0.4.2

Files changed (134) hide show

data/README.rdoc +51 -0
data/ext/extconf.rb +58 -0
data/ext/libstemmer_c/Makefile +10 -0
data/ext/libstemmer_c/examples/stemwords.c +209 -0
data/ext/libstemmer_c/include/libstemmer.h +79 -0
data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
data/ext/libstemmer_c/libstemmer/modules.h +190 -0
data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
data/ext/libstemmer_c/mkinc.mak +82 -0
data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
data/ext/libstemmer_c/runtime/api.c +66 -0
data/ext/libstemmer_c/runtime/api.h +26 -0
data/ext/libstemmer_c/runtime/header.h +58 -0
data/ext/libstemmer_c/runtime/utilities.c +478 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
data/ext/re2/bitstate.cc +378 -0
data/ext/re2/compile.cc +1138 -0
data/ext/re2/dfa.cc +2086 -0
data/ext/re2/filtered_re2.cc +100 -0
data/ext/re2/filtered_re2.h +99 -0
data/ext/re2/hash.cc +231 -0
data/ext/re2/mimics_pcre.cc +185 -0
data/ext/re2/nfa.cc +709 -0
data/ext/re2/onepass.cc +614 -0
data/ext/re2/parse.cc +2202 -0
data/ext/re2/perl_groups.cc +119 -0
data/ext/re2/prefilter.cc +671 -0
data/ext/re2/prefilter.h +105 -0
data/ext/re2/prefilter_tree.cc +398 -0
data/ext/re2/prefilter_tree.h +130 -0
data/ext/re2/prog.cc +341 -0
data/ext/re2/prog.h +376 -0
data/ext/re2/re2.cc +1180 -0
data/ext/re2/re2.h +837 -0
data/ext/re2/regexp.cc +920 -0
data/ext/re2/regexp.h +632 -0
data/ext/re2/rune.cc +258 -0
data/ext/re2/set.cc +113 -0
data/ext/re2/set.h +55 -0
data/ext/re2/simplify.cc +393 -0
data/ext/re2/stringpiece.cc +87 -0
data/ext/re2/stringpiece.h +182 -0
data/ext/re2/tostring.cc +341 -0
data/ext/re2/unicode_casefold.cc +469 -0
data/ext/re2/unicode_casefold.h +75 -0
data/ext/re2/unicode_groups.cc +4851 -0
data/ext/re2/unicode_groups.h +64 -0
data/ext/re2/valgrind.cc +24 -0
data/ext/re2/variadic_function.h +346 -0
data/ext/re2/walker-inl.h +244 -0
data/ext/src/chipper.cc +626 -0
data/ext/src/version.h +1 -0
data/ext/stemmer.rb +40 -0
data/ext/util/arena.h +103 -0
data/ext/util/atomicops.h +79 -0
data/ext/util/benchmark.h +41 -0
data/ext/util/flags.h +27 -0
data/ext/util/logging.h +78 -0
data/ext/util/mutex.h +190 -0
data/ext/util/pcre.h +679 -0
data/ext/util/random.h +29 -0
data/ext/util/sparse_array.h +451 -0
data/ext/util/sparse_set.h +177 -0
data/ext/util/test.h +57 -0
data/ext/util/thread.h +26 -0
data/ext/util/utf.h +43 -0
data/ext/util/util.h +127 -0
data/ext/util/valgrind.h +4517 -0
data/test/helper.rb +5 -0
data/test/test_entities.rb +57 -0
data/test/test_tokens.rb +118 -0
metadata +199 -0

data/ext/re2/filtered_re2.cc ADDED Viewed

@@ -0,0 +1,100 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+#include <string>
+#include "util/util.h"
+#include "re2/filtered_re2.h"
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+namespace re2 {
+// A new FilteredRE2 starts out uncompiled and owns a freshly allocated
+// PrefilterTree.
+FilteredRE2::FilteredRE2()
+    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
+// Deletes every RE2 stored in re2_vec_ and then the prefilter tree that
+// the constructor allocated.
+FilteredRE2::~FilteredRE2() {
+  // size_t index: size() is unsigned, so an int index would mix signedness.
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    delete re2_vec_[i];
+  delete prefilter_tree_;
+}
+// Compiles pattern with the given options.  When the resulting RE2 is ok
+// it is appended to re2_vec_ and its index is stored in *id.  Otherwise
+// the failure is logged, the RE2 is deleted and *id is left untouched.
+// In both cases the RE2's error_code() is returned.
+RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+                                const RE2::Options& options, int* id) {
+  RE2* re = new RE2(pattern, options);
+  RE2::ErrorCode code = re->error_code();
+  if (!re->ok()) {
+    // Log the offending pattern text.  Streaming `re` itself would only
+    // print the address of the RE2 object, which is useless for diagnosis.
+    LOG(ERROR) << "Couldn't compile regular expression, skipping: "
+               << pattern << " due to error " << re->error();
+    delete re;
+  } else {
+    // The id is the position the regexp is about to occupy in re2_vec_.
+    *id = static_cast<int>(re2_vec_.size());
+    re2_vec_.push_back(re);
+  }
+  return code;
+}
+// Feeds a Prefilter for every added regexp into the prefilter tree,
+// compiles the tree and stores the resulting strings in *atoms (which is
+// cleared first).  If the object is already compiled, or no regexp has
+// been added, this only logs and returns without touching *atoms.
+void FilteredRE2::Compile(vector<string>* atoms) {
+  if (compiled_ || re2_vec_.empty()) {
+    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
+    return;
+  }
+  // size_t index: size() is unsigned, so an int index would mix signedness.
+  for (size_t i = 0; i < re2_vec_.size(); i++) {
+    Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
+    prefilter_tree_->Add(prefilter);
+  }
+  atoms->clear();
+  prefilter_tree_->Compile(atoms);
+  compiled_ = true;
+}
+// Tries every regexp in insertion order with RE2::PartialMatch and
+// returns the index of the first one that matches text, or -1 if none
+// does.  No prefiltering is involved.
+int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+  // size_t index: size() is unsigned, so an int index would mix signedness.
+  for (size_t i = 0; i < re2_vec_.size(); i++)
+    if (RE2::PartialMatch(text, *re2_vec_[i]))
+      return static_cast<int>(i);
+  return -1;
+}
+// Asks the prefilter tree which regexps are candidates given the matched
+// atoms, then runs RE2::PartialMatch on those candidates in the order the
+// tree returned them.  Returns the id of the first candidate that matches
+// text, or -1 if none matches.  Logs DFATAL and returns -1 when called
+// before Compile.
+int FilteredRE2::FirstMatch(const StringPiece& text,
+                            const vector<int>& atoms) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "FirstMatch called before Compile";
+    return -1;
+  }
+  vector<int> regexps;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
+  // size_t index: size() is unsigned, so an int index would mix signedness.
+  for (size_t i = 0; i < regexps.size(); i++)
+    if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
+      return regexps[i];
+  return -1;
+}
+// Clears *matching_regexps, asks the prefilter tree for the candidate
+// regexps given the matched atoms, and appends the id of every candidate
+// for which RE2::PartialMatch succeeds on text.  Returns true when at
+// least one regexp matched.  Unlike FirstMatch, there is no compiled_
+// check here.
+bool FilteredRE2::AllMatches(
+    const StringPiece& text,
+    const vector<int>& atoms,
+    vector<int>* matching_regexps) const {
+  matching_regexps->clear();
+  vector<int> regexps;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
+  // size_t index: size() is unsigned, so an int index would mix signedness.
+  for (size_t i = 0; i < regexps.size(); i++)
+    if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
+      matching_regexps->push_back(regexps[i]);
+  return !matching_regexps->empty();
+}
+// Pure pass-through to the prefilter tree's method of the same name.
+void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
+                                      vector<int>* passed_regexps) {
+  PrefilterTree* const tree = prefilter_tree_;
+  tree->RegexpsGivenStrings(matched_atoms, passed_regexps);
+}
+// Pure pass-through to the prefilter tree's PrintPrefilter.
+void FilteredRE2::PrintPrefilter(int regexpid) {
+  PrefilterTree* const tree = prefilter_tree_;
+  tree->PrintPrefilter(regexpid);
+}
+}  // namespace re2

data/ext/re2/filtered_re2.h ADDED Viewed

@@ -0,0 +1,99 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
+// It provides a prefilter mechanism that helps in cutting down the
+// number of regexps that need to be actually searched.
+//
+// By design, it does not include a string matching engine. This is to
+// allow the user of the class to use their favorite string match
+// engine. The overall flow is: Add all the regexps using Add, then
+// Compile the FilteredRE2. The compile returns strings that need to
+// be matched. Note that all returned strings are lowercase. For
+// applying regexps to a search text, the caller does the string
+// matching using the strings returned. When doing the string match,
+// note that the caller has to do that on lower cased version of the
+// search text. Then call FirstMatch or AllMatches with a vector of
+// indices of strings that were found in the text to get the actual
+// regexp matches.
+#ifndef RE2_FILTERED_RE2_H_
+#define RE2_FILTERED_RE2_H_
+#include <vector>
+#include "re2/re2.h"
+namespace re2 {
+using std::vector;
+class PrefilterTree;
+class FilteredRE2 {
+ public:
+  FilteredRE2();   // Starts uncompiled with a newly allocated prefilter tree.
+  ~FilteredRE2();  // Deletes all added RE2 objects and the prefilter tree.
+  // Compiles pattern into a new RE2 and returns its error_code().  If the
+  // RE2 is ok it is kept and *id receives its index in re2_vec_; if not,
+  // it is deleted, not added to re2_vec_, and *id is left untouched.
+  RE2::ErrorCode Add(const StringPiece& pattern,
+                     const RE2::Options& options,
+                     int *id);
+  // Prepares the regexps added by Add for filtering and fills
+  // strings_to_match (cleared first) with the strings that the caller
+  // should check for in candidate texts.  The returned strings are
+  // lowercased, so the search text should be lowercased before looking
+  // for them.  Call after all Add calls are done; does nothing if already
+  // compiled or if no regexp has been added.
+  void Compile(vector<string>* strings_to_match);
+  // Returns the index of the first matching regexp.
+  // Returns -1 on no match. Can be called prior to Compile.
+  // Does not do any filtering: simply tries PartialMatch on the
+  // regexps in a loop.
+  int SlowFirstMatch(const StringPiece& text) const;
+  // Returns the index of the first matching regexp among those that pass
+  // the prefilter for the given atoms, or -1 on no match.  Compile has to
+  // be called before calling this; otherwise it logs and returns -1.
+  int FirstMatch(const StringPiece& text,
+                 const vector<int>& atoms) const;
+  // Clears matching_regexps, then fills it with the indices of all
+  // matching regexps.  Returns true if at least one regexp matched.
+  bool AllMatches(const StringPiece& text,
+                  const vector<int>& atoms,
+                  vector<int>* matching_regexps) const;
+  // The number of regexps added.
+  int NumRegexps() const { return re2_vec_.size(); }
+ private:
+  // Get the individual RE2 objects (no bounds check). Useful for testing.
+  RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
+  // Print prefilter; forwards to the prefilter tree.
+  void PrintPrefilter(int regexpid);
+  // Forwards to the prefilter tree. Useful for testing and debugging.
+  void RegexpsGivenStrings(const vector<int>& matched_atoms,
+                           vector<int>* passed_regexps);
+  // All the regexps in the FilteredRE2; deleted by the destructor.
+  vector<RE2*> re2_vec_;
+  // Has the FilteredRE2 been compiled using Compile()
+  bool compiled_;
+  // An AND-OR tree of string atoms used for filtering regexps.
+  PrefilterTree* prefilter_tree_;
+  DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
+};
+}  // namespace re2
+#endif  // RE2_FILTERED_RE2_H_

data/ext/re2/hash.cc ADDED Viewed

@@ -0,0 +1,231 @@
+// Modified by Russ Cox to add "namespace re2".
+// Also threw away all but hashword and hashword2.
+// http://burtleburtle.net/bob/c/lookup3.c
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+These are functions for producing 32-bit hashes for hash table lookup.
+hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+are externally useful functions.  Routines to test the hash are included
+if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+the public domain.  It has no warranty.
+You probably want to use hashlittle().  hashlittle() and hashbig()
+hash byte arrays.  hashlittle() is faster than hashbig() on
+little-endian machines.  Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.
+You could implement hashbig2() if you wanted but I haven't bothered here.
+If you want to find a hash of, say, exactly 7 integers, do
+  a = i1;  b = i2;  c = i3;
+  mix(a,b,c);
+  a += i4; b += i5; c += i6;
+  mix(a,b,c);
+  a += i7;
+  final(a,b,c);
+then use c as the hash value.  If you have a variable length array of
+4-byte integers to hash, use hashword().  If you have a byte array (like
+a character string), use hashlittle().  If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().
+Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
+then mix those integers.  This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+-------------------------------------------------------------------------------
+*/
+#include "util/util.h"
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))  /* rotate x left by k bits; written for a 32-bit unsigned x with 0 < k < 32 */
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+Usage note: mix() expands to a braced block that updates a, b and c in
+place through compound assignments, so each argument must be a
+modifiable lvalue.  Every argument is expanded several times, so do not
+pass expressions with side effects.  The rotations go through rot(),
+which is written for 32-bit values.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+Usage note: final() expands to a braced block that updates a, b and c in
+place through compound assignments, so each argument must be a
+modifiable lvalue.  Every argument is expanded several times, so do not
+pass expressions with side effects.  The rotation counts used below are
+the first row of constants listed above (14 11 25 16 4 14 24).
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+namespace re2 {
+/*
+--------------------------------------------------------------------
+ This works on all machines.  To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes.  hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+
+ Summary: hashword() hashes `length` 32-bit words starting at k, seeded
+ with initval, and returns the final value of c.  While more than three
+ words remain they are consumed three at a time through mix(); the last
+ one to three words are added in and passed through final().  When
+ length is 0 neither macro runs and the seed-derived initial value is
+ returned as is.
+ NOTE(review): the signature uses uint32 while the locals use uint32_t;
+ presumably both name the same 32-bit unsigned type -- confirm in
+ util/util.h.
+--------------------------------------------------------------------
+*/
+uint32 hashword(
+const uint32 *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32        initval)         /* the previous hash, or an arbitrary value */
+{
+  uint32_t a,b,c;
+  /* Set up the internal state: a, b and c all start from the same value,
+     built from a fixed constant, the key length times four (length<<2)
+     and the caller's seed */
+  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+  /*------------------------------------------------- handle most of the key */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  {
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add, and final() is skipped too */
+    break;
+  }
+  /*------------------------------------------------------ report the result */
+  return c;
+}
+/*
+--------------------------------------------------------------------
+hashword2() -- same as hashword(), but take two seeds and return two
+32-bit values.  pc and pb must both be nonnull, and *pc and *pb must
+both be initialized with seeds.  If you pass in (*pb)==0, the output
+(*pc) will be the same as the return value from hashword().
+
+On return *pc holds the final value of c and *pb the final value of b.
+As in hashword(), a key of length 0 skips both mix() and final(), so
+the seed-derived initial values are written back unchanged.
+--------------------------------------------------------------------
+*/
+void hashword2 (
+const uint32 *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32       *pc,                      /* IN: seed OUT: primary hash value */
+uint32       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  uint32_t a,b,c;
+  /* Set up the internal state: a, b and c share a value built from a
+     fixed constant, the key length times four and the first seed; the
+     second seed is then folded into c only */
+  a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  c += *pb;
+  /*------------------------------------------------- handle most of the key */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  {
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add, and final() is skipped too */
+    break;
+  }
+  /*------------------------------------------------------ report the result */
+  *pc=c; *pb=b;
+}
+}  // namespace re2

data/ext/re2/mimics_pcre.cc ADDED Viewed

@@ -0,0 +1,185 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp.  (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+//   * the regexp contains a repetition of an empty string,
+//     like (a*)* or (a*)+.  In this case, PCRE will treat
+//     the repetition sequence as ending with an empty string,
+//     while this library does not.
+//   * Perl and PCRE differ on whether \v matches \n.
+//     For historical reasons, this library implements the Perl behavior.
+//   * Perl and PCRE allow $ in one-line mode to match either the very
+//     end of the text or just before a \n at the end of the text.
+//     This library requires it to match only the end of the text.
+//   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+//     match the end of the text if the last character is a \n.
+//     This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+namespace re2 {
+// Returns whether re might match an empty string.
+static bool CanBeEmptyString(Regexp *re);
+// Walker class to compute whether this library handles a regexp
+// exactly as PCRE would.  See the comment at the top of the file for the
+// conditions.  Each node's result is a bool: true while nothing in the
+// subtree is known to deviate from PCRE.
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+  PCREWalker() {}
+  // Combines the children's verdicts with a check of re itself.
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
+                 int nchild_args);
+  bool ShortVisit(Regexp* re, bool a) {
+    // Should never be called: we use Walk not WalkExponential.
+    // The message names this class so the log points at the right walker.
+    LOG(DFATAL) << "PCREWalker::ShortVisit called";
+    return a;
+  }
+ private:
+  // Non-copyable, matching EmptyStringWalker later in this file.
+  DISALLOW_EVIL_CONSTRUCTORS(PCREWalker);
+};
+// Runs once re's children have been visited.  child_args holds, for each
+// child, whether this library mimics PCRE on that subexpression.  The
+// result is true only when every child passed and re itself does not hit
+// one of the known differences.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                           bool* child_args, int nchild_args) {
+  // One deviating child is enough to fail the whole subtree.
+  for (int k = 0; k < nchild_args; k++) {
+    if (!child_args[k])
+      return false;
+  }
+
+  // The children are fine; now examine this node on its own.
+  switch (re->op()) {
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      // Repetition of something that may be the empty string.
+      if (CanBeEmptyString(re->sub()[0]))
+        return false;
+      break;
+
+    case kRegexpRepeat:
+      // Same concern, but only checked when max() is -1.
+      if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
+        return false;
+      break;
+
+    case kRegexpLiteral:
+      // A literal \v.
+      if (re->rune() == '\v')
+        return false;
+      break;
+
+    case kRegexpEndText:
+    case kRegexpEmptyMatch:
+      // A $ in single-line mode.
+      if (re->parse_flags() & Regexp::WasDollar)
+        return false;
+      break;
+
+    case kRegexpBeginLine:
+      // A ^ in multi-line mode.  No condition is needed: in single-line
+      // mode ^ becomes kRegexpBeginText.
+      return false;
+
+    default:
+      break;
+  }
+
+  // Nothing found that differs from PCRE.
+  return true;
+}
+// Reports whether this regexp behaves exactly as it would under PCRE, by
+// running a PCREWalker over it with an initial argument of true.
+bool Regexp::MimicsPCRE() {
+  PCREWalker pcre_walker;
+  const bool mimics = pcre_walker.Walk(this, true);
+  return mimics;
+}
+// Walker that computes whether a Regexp can match the empty string.
+// Overestimating is acceptable: \b\B, for instance, can never match an
+// empty string because \b and \B exclude each other, yet this walker is
+// not clever enough to notice and will answer "yes".  Such spurious
+// answers only reduce the number of regexps that get sanity checked
+// against PCRE; they do not break anything.
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
+  EmptyStringWalker() {}
+
+  // Computes the answer for re from the answers of its children.
+  bool PostVisit(Regexp* re,
+                 bool parent_arg,
+                 bool pre_arg,
+                 bool* child_args,
+                 int nchild_args);
+
+  // Not expected to run, because Walk is used rather than
+  // WalkExponential.  Logs and hands the argument straight back.
+  bool ShortVisit(Regexp* re, bool a) {
+    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+    return a;
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
+};
+// Runs once re's children have been visited.  child_args[i] is the value
+// the i-th child's PostVisit returned, i.e. whether that child can match
+// an empty string.  Returns whether re as a whole can match an empty
+// string.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                                  bool* child_args, int nchild_args) {
+  switch (re->op()) {
+    // Never empty.
+    case kRegexpNoMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpCharClass:
+    case kRegexpLiteralString:
+      return false;
+
+    // Always empty (the assertions: whenever they match), or can always
+    // be empty (star, quest).
+    case kRegexpEmptyMatch:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpNoWordBoundary:
+    case kRegexpWordBoundary:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpStar:
+    case kRegexpQuest:
+    case kRegexpHaveMatch:
+      return true;
+
+    // A concatenation is empty only if every piece can be empty.
+    case kRegexpConcat: {
+      bool every_child_empty = true;
+      for (int k = 0; k < nchild_args && every_child_empty; k++)
+        every_child_empty = child_args[k];
+      return every_child_empty;
+    }
+
+    // An alternation is empty as soon as one branch can be empty.
+    case kRegexpAlternate: {
+      bool some_child_empty = false;
+      for (int k = 0; k < nchild_args && !some_child_empty; k++)
+        some_child_empty = child_args[k];
+      return some_child_empty;
+    }
+
+    // Plus and capture simply inherit the answer of their only child.
+    case kRegexpCapture:
+    case kRegexpPlus:
+      return child_args[0];
+
+    // A repeat can be empty if it is x{0,...} or if its child can be.
+    case kRegexpRepeat:
+      if (re->min() == 0)
+        return true;
+      return child_args[0];
+  }
+  return false;
+}
+// Reports whether re can match an empty string, by running an
+// EmptyStringWalker over it with an initial argument of true.
+static bool CanBeEmptyString(Regexp* re) {
+  EmptyStringWalker empty_walker;
+  const bool can_be_empty = empty_walker.Walk(re, true);
+  return can_be_empty;
+}
+}  // namespace re2