RubyGems - chipper - Versions diffs - 0.4.2 - Mend

chipper 0.4.2

Files changed (134) hide show

data/README.rdoc +51 -0
data/ext/extconf.rb +58 -0
data/ext/libstemmer_c/Makefile +10 -0
data/ext/libstemmer_c/examples/stemwords.c +209 -0
data/ext/libstemmer_c/include/libstemmer.h +79 -0
data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
data/ext/libstemmer_c/libstemmer/modules.h +190 -0
data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
data/ext/libstemmer_c/mkinc.mak +82 -0
data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
data/ext/libstemmer_c/runtime/api.c +66 -0
data/ext/libstemmer_c/runtime/api.h +26 -0
data/ext/libstemmer_c/runtime/header.h +58 -0
data/ext/libstemmer_c/runtime/utilities.c +478 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
data/ext/re2/bitstate.cc +378 -0
data/ext/re2/compile.cc +1138 -0
data/ext/re2/dfa.cc +2086 -0
data/ext/re2/filtered_re2.cc +100 -0
data/ext/re2/filtered_re2.h +99 -0
data/ext/re2/hash.cc +231 -0
data/ext/re2/mimics_pcre.cc +185 -0
data/ext/re2/nfa.cc +709 -0
data/ext/re2/onepass.cc +614 -0
data/ext/re2/parse.cc +2202 -0
data/ext/re2/perl_groups.cc +119 -0
data/ext/re2/prefilter.cc +671 -0
data/ext/re2/prefilter.h +105 -0
data/ext/re2/prefilter_tree.cc +398 -0
data/ext/re2/prefilter_tree.h +130 -0
data/ext/re2/prog.cc +341 -0
data/ext/re2/prog.h +376 -0
data/ext/re2/re2.cc +1180 -0
data/ext/re2/re2.h +837 -0
data/ext/re2/regexp.cc +920 -0
data/ext/re2/regexp.h +632 -0
data/ext/re2/rune.cc +258 -0
data/ext/re2/set.cc +113 -0
data/ext/re2/set.h +55 -0
data/ext/re2/simplify.cc +393 -0
data/ext/re2/stringpiece.cc +87 -0
data/ext/re2/stringpiece.h +182 -0
data/ext/re2/tostring.cc +341 -0
data/ext/re2/unicode_casefold.cc +469 -0
data/ext/re2/unicode_casefold.h +75 -0
data/ext/re2/unicode_groups.cc +4851 -0
data/ext/re2/unicode_groups.h +64 -0
data/ext/re2/valgrind.cc +24 -0
data/ext/re2/variadic_function.h +346 -0
data/ext/re2/walker-inl.h +244 -0
data/ext/src/chipper.cc +626 -0
data/ext/src/version.h +1 -0
data/ext/stemmer.rb +40 -0
data/ext/util/arena.h +103 -0
data/ext/util/atomicops.h +79 -0
data/ext/util/benchmark.h +41 -0
data/ext/util/flags.h +27 -0
data/ext/util/logging.h +78 -0
data/ext/util/mutex.h +190 -0
data/ext/util/pcre.h +679 -0
data/ext/util/random.h +29 -0
data/ext/util/sparse_array.h +451 -0
data/ext/util/sparse_set.h +177 -0
data/ext/util/test.h +57 -0
data/ext/util/thread.h +26 -0
data/ext/util/utf.h +43 -0
data/ext/util/util.h +127 -0
data/ext/util/valgrind.h +4517 -0
data/test/helper.rb +5 -0
data/test/test_entities.rb +57 -0
data/test/test_tokens.rb +118 -0
metadata +199 -0

data/ext/re2/regexp.cc ADDED Viewed

@@ -0,0 +1,920 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Regular expression representation.
+// Tested by parse_test.cc
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/stringpiece.h"
+#include "re2/walker-inl.h"
+namespace re2 {
+// Builds a node for operator `op` carrying `parse_flags`.
+// The new node holds one reference, has no subexpressions, and its
+// operator-specific payload is zero-filled.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+  : op_(op),
+    simple_(false),
+    parse_flags_(static_cast<uint16>(parse_flags)),
+    ref_(1),
+    nsub_(0),
+    down_(NULL) {
+  // No child yet, and nothing in the per-operator payload.
+  subone_ = NULL;
+  memset(the_union_, 0, sizeof(the_union_));
+}
+// Destructor.  Private: release a Regexp with Decref(), never with delete.
+// The children must already be gone (nsub_ == 0).  They cannot be released
+// here because that could recurse arbitrarily deep, so Decref()/Destroy()
+// is required to have handled them before this runs.
+Regexp::~Regexp() {
+  if (nsub_ > 0)
+    LOG(DFATAL) << "Regexp not destroyed.";
+  // Free whatever payload this particular operator owns.
+  switch (op_) {
+    default:
+      break;
+    case kRegexpCapture:
+      delete name_;  // deleting a null pointer is a no-op
+      break;
+    case kRegexpLiteralString:
+      delete[] runes_;
+      break;
+    case kRegexpCharClass:
+      // Calling a member function through a null pointer is undefined
+      // behavior even if the callee tests `this`, so guard the call here.
+      if (cc_ != NULL)
+        cc_->Delete();
+      delete ccb_;
+      break;
+  }
+}
+// Destroys this regexp on the spot when that needs no recursion, i.e. when
+// it has no subexpressions.  Returns true if the object was deleted (it must
+// not be touched afterwards), false if it was left untouched.
+bool Regexp::QuickDestroy() {
+  if (nsub_ != 0)
+    return false;
+  delete this;
+  return true;
+}
+// Overflow storage for reference counts: once ref_ is pinned at kMaxRef
+// the real count is kept in ref_map.  Accesses to the map in this file
+// are made while holding ref_mutex.
+static map<Regexp*, int> ref_map;
+static Mutex ref_mutex;
+// Returns the current reference count, consulting the overflow map
+// when the in-object counter has saturated.
+int Regexp::Ref() {
+  if (ref_ >= kMaxRef) {
+    MutexLock l(&ref_mutex);
+    return ref_map[this];
+  }
+  return ref_;
+}
+// Adds one reference and returns this object for convenience.
+// Counts that no longer fit in ref_ are moved to, and then kept in,
+// the overflow map.
+Regexp* Regexp::Incref() {
+  // Common case: the in-object counter still has room.
+  if (ref_ < kMaxRef-1) {
+    ref_++;
+    return this;
+  }
+  MutexLock l(&ref_mutex);
+  if (ref_ == kMaxRef) {
+    // Already living in the overflow map.
+    ref_map[this]++;
+  } else {
+    // This increment is the one that overflows.
+    ref_map[this] = kMaxRef;
+    ref_ = kMaxRef;
+  }
+  return this;
+}
+// Drops one reference; destroys this object when the count reaches 0.
+void Regexp::Decref() {
+  if (ref_ != kMaxRef) {
+    // Count is held directly in ref_.
+    if (--ref_ == 0)
+      Destroy();
+    return;
+  }
+  // Count is held in the overflow map.
+  MutexLock l(&ref_mutex);
+  const int remaining = ref_map[this] - 1;
+  if (remaining >= kMaxRef) {
+    ref_map[this] = remaining;
+    return;
+  }
+  // Small enough again: move the count back into the object.
+  ref_ = remaining;
+  ref_map.erase(this);
+}
+// Deletes this object, whose reference count has reached 0, along with every subexpression whose own count drops to 0 as a result.
+void Regexp::Destroy() {
+  if (QuickDestroy())  // no subexpressions: already deleted
+    return;
+  // Handle recursive Destroy with an explicit stack, linked through down_,
+  // to avoid arbitrarily deep recursion on the process stack.
+  down_ = NULL;
+  Regexp* stack = this;
+  while (stack != NULL) {
+    Regexp* re = stack;  // pop the top node
+    stack = re->down_;
+    if (re->ref_ != 0)
+      LOG(DFATAL) << "Bad reference count " << re->ref_;
+    if (re->nsub_ > 0) {
+      Regexp** subs = re->sub();
+      for (int i = 0; i < re->nsub_; i++) {
+        Regexp* sub = subs[i];
+        if (sub == NULL)
+          continue;
+        if (sub->ref_ == kMaxRef)
+          sub->Decref();  // count lives in the overflow map; Decref() handles it
+        else
+          --sub->ref_;
+        if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+          sub->down_ = stack;  // child has children of its own: push it for later
+          stack = sub;
+        }
+      }
+      if (re->nsub_ > 1)
+        delete[] subs;  // NOTE(review): presumably a single child is stored inline (subone_) and needs no free - confirm in AllocSub
+      re->nsub_ = 0;  // the destructor logs an error if nsub_ is still positive
+    }
+    delete re;
+  }
+}
+// Appends r to this literal string's rune array.  Storage starts at
+// 8 runes and is doubled whenever the length reaches a power of two >= 8.
+void Regexp::AddRuneToString(Rune r) {
+  DCHECK(op_ == kRegexpLiteralString);
+  if (nrunes_ == 0) {
+    // First rune: initial allocation.
+    runes_ = new Rune[8];
+  } else {
+    const bool at_capacity = nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0;
+    if (at_capacity) {
+      // Grow to twice the size and carry the existing runes over.
+      Rune* grown = new Rune[nrunes_ * 2];
+      for (int k = 0; k < nrunes_; k++)
+        grown[k] = runes_[k];
+      delete[] runes_;
+      runes_ = grown;
+    }
+  }
+  runes_[nrunes_] = r;
+  nrunes_++;
+}
+// Returns a new kRegexpHaveMatch node tagged with match_id.
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
+  node->match_id_ = match_id;
+  return node;
+}
+// Returns sub+ .  If sub is already a + node with the same flags it is
+// returned as is; otherwise a new node wrapping sub is created.
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+  const bool redundant =
+      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
+  if (redundant)
+    return sub;
+  Regexp* node = new Regexp(kRegexpPlus, flags);
+  node->AllocSub(1);
+  node->sub()[0] = sub;
+  return node;
+}
+// Returns sub* .  If sub is already a * node with the same flags it is
+// returned as is; otherwise a new node wrapping sub is created.
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
+  const bool redundant =
+      sub->op() == kRegexpStar && sub->parse_flags() == flags;
+  if (redundant)
+    return sub;
+  Regexp* node = new Regexp(kRegexpStar, flags);
+  node->AllocSub(1);
+  node->sub()[0] = sub;
+  return node;
+}
+// Returns sub? .  If sub is already a ? node with the same flags it is
+// returned as is; otherwise a new node wrapping sub is created.
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
+  const bool redundant =
+      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
+  if (redundant)
+    return sub;
+  Regexp* node = new Regexp(kRegexpQuest, flags);
+  node->AllocSub(1);
+  node->sub()[0] = sub;
+  return node;
+}
+// Builds an `op` node (concatenation or alternation) over sub[0..nsub).
+// A single subexpression is returned unwrapped.  When can_factor is set
+// and op is an alternation, the list is first run through
+// FactorAlternation() on a private copy.  Lists longer than kMaxNsub
+// are split into a two-level tree.
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+                                  ParseFlags flags, bool can_factor) {
+  if (nsub == 1)
+    return sub[0];
+  // Factoring edits the array, so work on a copy the caller never sees.
+  Regexp** scratch = NULL;
+  if (op == kRegexpAlternate && can_factor) {
+    scratch = new Regexp*[nsub];
+    memmove(scratch, sub, nsub * sizeof sub[0]);
+    sub = scratch;
+    nsub = FactorAlternation(sub, nsub, flags);
+    if (nsub == 1) {
+      Regexp* only = sub[0];
+      delete[] scratch;
+      return only;
+    }
+  }
+  Regexp* node = new Regexp(op, flags);
+  if (nsub <= kMaxNsub) {
+    // Everything fits directly under one node.
+    node->AllocSub(nsub);
+    Regexp** kids = node->sub();
+    for (int k = 0; k < nsub; k++)
+      kids[k] = sub[k];
+  } else {
+    // Too many subexpressions for a single Regexp: group them into
+    // chunks of kMaxNsub, the last chunk taking the remainder.
+    const int nchunks = (nsub+kMaxNsub-1)/kMaxNsub;
+    node->AllocSub(nchunks);
+    Regexp** kids = node->sub();
+    const int last = nchunks - 1;
+    for (int c = 0; c < last; c++)
+      kids[c] = ConcatOrAlternate(op, sub+c*kMaxNsub, kMaxNsub, flags, false);
+    kids[last] = ConcatOrAlternate(op, sub+last*kMaxNsub,
+                                   nsub - last*kMaxNsub, flags,
+                                   false);
+  }
+  delete[] scratch;
+  return node;
+}
+// Concatenation of sub[0..nsub); never factored.
+Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool can_factor = false;
+  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
+}
+// Alternation of sub[0..nsub), with factoring of the alternatives enabled.
+Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool can_factor = true;
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
+}
+// Alternation of sub[0..nsub), leaving the alternatives exactly as given.
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+  const bool can_factor = false;
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
+}
+// Returns a new capture node with index cap wrapping sub.
+Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
+  Regexp* node = new Regexp(kRegexpCapture, flags);
+  node->AllocSub(1);
+  node->cap_ = cap;
+  node->sub()[0] = sub;
+  return node;
+}
+// Returns a new repeat node over sub with the given min and max bounds.
+Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
+  Regexp* node = new Regexp(kRegexpRepeat, flags);
+  node->AllocSub(1);
+  node->min_ = min;
+  node->max_ = max;
+  node->sub()[0] = sub;
+  return node;
+}
+// Returns a new single-rune literal node.
+Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpLiteral, flags);
+  node->rune_ = rune;
+  return node;
+}
+// Returns a node matching the nrunes runes at `runes`: an empty match
+// for nrunes <= 0, a single literal for one rune, otherwise a literal
+// string holding a copy of the runes.
+Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
+  if (nrunes > 1) {
+    Regexp* str = new Regexp(kRegexpLiteralString, flags);
+    for (Rune* p = runes; p != runes + nrunes; ++p)
+      str->AddRuneToString(*p);
+    return str;
+  }
+  if (nrunes == 1)
+    return NewLiteral(runes[0], flags);
+  return new Regexp(kRegexpEmptyMatch, flags);
+}
+// Returns a new character-class node that stores cc.
+Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
+  Regexp* node = new Regexp(kRegexpCharClass, flags);
+  node->cc_ = cc;
+  return node;
+}
+// Exchanges the contents of this and that, in place.
+void Regexp::Swap(Regexp* that) {
+  // A raw byte swap is fine because Regexp is just a struct (no vtable).
+  char scratch[sizeof(Regexp)];
+  memmove(scratch, this, sizeof scratch);
+  memmove(this, that, sizeof scratch);
+  memmove(that, scratch, sizeof scratch);
+}
+// Tests equality of all top-level structure of a and b (operator plus
+// the operator-specific payload and the flags that matter for it),
+// but not of their subregexps.  Both arguments must be non-NULL.
+static bool TopEqual(Regexp* a, Regexp* b) {
+  if (a->op() != b->op())
+    return false;
+  switch (a->op()) {
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpBeginText:
+      // No payload: same operator means equal.
+      return true;
+    case kRegexpEndText:
+      // The parse flags remember whether it's \z or (?-m:$),
+      // which matters when testing against PCRE.
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+    case kRegexpLiteral:
+      return a->rune() == b->rune() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+    case kRegexpLiteralString:
+      return a->nrunes() == b->nrunes() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+             memcmp(a->runes(), b->runes(),
+                    a->nrunes() * sizeof a->runes()[0]) == 0;
+    case kRegexpAlternate:
+    case kRegexpConcat:
+      return a->nsub() == b->nsub();
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+    case kRegexpRepeat:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+             a->min() == b->min() &&
+             a->max() == b->max();
+    case kRegexpCapture: {
+      if (a->cap() != b->cap())
+        return false;
+      // name() is a pointer: compare the strings it points to, not the
+      // addresses, so equal names in separately built trees match.
+      const string* aname = a->name();
+      const string* bname = b->name();
+      if (aname == NULL || bname == NULL)
+        return aname == bname;  // equal only if both groups are unnamed
+      return *aname == *bname;
+    }
+    case kRegexpHaveMatch:
+      return a->match_id() == b->match_id();
+    case kRegexpCharClass: {
+      CharClass* acc = a->cc();
+      CharClass* bcc = b->cc();
+      return acc->size() == bcc->size() &&
+             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+             memcmp(acc->begin(), bcc->begin(),
+                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+    }
+  }
+  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+  return false;
+}
+bool Regexp::Equal(Regexp* a, Regexp* b) {  // Structural equality of two regexp trees; two NULLs are equal, a NULL and a non-NULL are not.
+  if (a == NULL || b == NULL)
+    return a == b;
+  if (!TopEqual(a, b))
+    return false;
+  // Fast path:
+  // return without allocating vector if the operator has no subregexps.
+  switch (a->op()) {
+    case kRegexpAlternate:
+    case kRegexpConcat:
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+    case kRegexpRepeat:
+    case kRegexpCapture:
+      break;
+    default:
+      return true;
+  }
+  // Committed to doing real work.
+  // The stack (vector) has pairs of regexps waiting to
+  // be compared.  The regexps are only equal if
+  // all the pairs end up being equal.
+  vector<Regexp*> stk;
+  for (;;) {
+    // Invariant: TopEqual(a, b) == true.
+    Regexp* a2;
+    Regexp* b2;
+    switch (a->op()) {
+      default:
+        break;
+      case kRegexpAlternate:
+      case kRegexpConcat:
+        for (int i = 0; i < a->nsub(); i++) {  // TopEqual already checked that nsub() matches
+          a2 = a->sub()[i];
+          b2 = b->sub()[i];
+          if (!TopEqual(a2, b2))
+            return false;
+          stk.push_back(a2);  // queue the pair so its children get compared later
+          stk.push_back(b2);
+        }
+        break;
+      case kRegexpStar:
+      case kRegexpPlus:
+      case kRegexpQuest:
+      case kRegexpRepeat:
+      case kRegexpCapture:
+        a2 = a->sub()[0];
+        b2 = b->sub()[0];
+        if (!TopEqual(a2, b2))
+          return false;
+        // Really:
+        //   stk.push_back(a2);
+        //   stk.push_back(b2);
+        //   break;
+        // but faster to assign directly and loop.
+        a = a2;
+        b = b2;
+        continue;
+    }
+    int n = stk.size();  // each pending pair occupies two slots
+    if (n == 0)
+      break;  // nothing left to compare: the trees are equal
+    a = stk[n-2];  // pop the most recently pushed pair
+    b = stk[n-1];
+    stk.resize(n-2);
+  }
+  return true;
+}
+// Message text for each status code, indexed by the code's numeric value.  Keep in sync with enum RegexpStatusCode in regexp.h.
+static const string kErrorStrings[] = {
+  "no error",
+  "unexpected error",
+  "invalid escape sequence",
+  "invalid character class",
+  "invalid character class range",
+  "missing ]",
+  "missing )",
+  "trailing \\",
+  "no argument for repetition operator",
+  "invalid repetition size",
+  "bad repetition operator",
+  "invalid perl operator",
+  "invalid UTF-8",
+  "invalid named capture group",
+};
+// Returns the message for code; a code outside the table is reported
+// with the kRegexpInternalError message.
+const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
+  const bool out_of_table = code < 0 || code >= arraysize(kErrorStrings);
+  if (out_of_table)
+    return kErrorStrings[kRegexpInternalError];
+  return kErrorStrings[code];
+}
+// Returns the full message: the text for code_, followed by ": " and
+// the offending argument when one was recorded.
+string RegexpStatus::Text() const {
+  const string& base = CodeText(code_);
+  if (error_arg_.empty())
+    return base;
+  string msg(base);
+  msg.append(": ");
+  msg.append(error_arg_.data(), error_arg_.size());
+  return msg;
+}
+void RegexpStatus::Copy(const RegexpStatus& status) {
+  code_ = status.code_;
+  error_arg_ = status.error_arg_;
+}
+typedef int Ignored;  // Walker<void> doesn't exist
+// Walker that tallies the capture nodes of a regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NumCapturesWalker() : ncapture_(0) {}
+  // Number of kRegexpCapture nodes seen so far.
+  int ncapture() { return ncapture_; }
+  // Counts re if it is a capture node; the walk value is passed through.
+  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    const bool is_capture = (re->op() == kRegexpCapture);
+    if (is_capture)
+      ++ncapture_;
+    return ignored;
+  }
+  // Should never be called: we use Walk not WalkExponential.
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+    return ignored;
+  }
+ private:
+  int ncapture_;
+  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
+};
+// Returns the number of capture nodes in this regexp.
+int Regexp::NumCaptures() {
+  NumCapturesWalker counter;
+  counter.Walk(this, 0);
+  const int total = counter.ncapture();
+  return total;
+}
+// Walker that collects a name -> capture index map for named groups.
+// The map is created lazily, so it stays NULL when no group is named.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+  NamedCapturesWalker() : map_(NULL) {}
+  ~NamedCapturesWalker() { delete map_; }
+  // Hands the map (possibly NULL) over to the caller, who now owns it.
+  map<string, int>* TakeMap() {
+    map<string, int>* taken = map_;
+    map_ = NULL;
+    return taken;
+  }
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() != kRegexpCapture || re->name() == NULL)
+      return ignored;
+    // Allocate map once we find a name.
+    if (map_ == NULL)
+      map_ = new map<string, int>;
+    // insert() leaves an existing entry alone, so only the first
+    // occurrence of a name is recorded.  (The rule is that if you
+    // have the same name multiple times, only the leftmost one counts.)
+    map_->insert(map<string, int>::value_type(*re->name(), re->cap()));
+    return ignored;
+  }
+  // Should never be called: we use Walk not WalkExponential.
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+    return ignored;
+  }
+ private:
+  map<string, int>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
+};
+// Returns a newly allocated name -> index map of the named capture
+// groups, or NULL if there are none.  The caller owns the result.
+map<string, int>* Regexp::NamedCaptures() {
+  NamedCapturesWalker collector;
+  collector.Walk(this, 0);
+  map<string, int>* names = collector.TakeMap();
+  return names;
+}
+// Walker that collects a capture index -> name map for named groups.
+// The map is created lazily, so it stays NULL when no group is named.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+  CaptureNamesWalker() : map_(NULL) {}
+  ~CaptureNamesWalker() { delete map_; }
+  // Hands the map (possibly NULL) over to the caller, who now owns it.
+  map<int, string>* TakeMap() {
+    map<int, string>* taken = map_;
+    map_ = NULL;
+    return taken;
+  }
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() != kRegexpCapture || re->name() == NULL)
+      return ignored;
+    // Allocate map once we find a name.
+    if (map_ == NULL)
+      map_ = new map<int, string>;
+    (*map_)[re->cap()] = *re->name();
+    return ignored;
+  }
+  // Should never be called: we use Walk not WalkExponential.
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+    return ignored;
+  }
+ private:
+  map<int, string>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+// Returns a newly allocated index -> name map of the named capture
+// groups, or NULL if there are none.  The caller owns the result.
+map<int, string>* Regexp::CaptureNames() {
+  CaptureNamesWalker collector;
+  collector.Walk(this, 0);
+  map<int, string>* names = collector.TakeMap();
+  return names;
+}
+// Determines whether every match of this regexp must be anchored at the
+// start of the text and begin with a fixed string.  If so, stores that
+// string in *prefix, whether it is case-folded in *foldcase, and a new
+// regexp for what follows it in *suffix, then returns true.  Otherwise
+// returns false with *prefix empty, *foldcase false and *suffix NULL.
+bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+  // No need for a walker: the regexp must be of the form
+  // 1. some number of ^ anchors
+  // 2. a literal char or string
+  // 3. the rest
+  prefix->clear();
+  *foldcase = false;
+  *suffix = NULL;
+  if (op_ != kRegexpConcat)
+    return false;
+  Regexp** sub = this->sub();
+  // Step over the leading anchors.
+  int pos = 0;
+  for (; pos < nsub_; pos++) {
+    if (sub[pos]->op_ != kRegexpBeginText)
+      break;
+  }
+  // Need at least one anchor and something after the anchors.
+  if (pos == 0 || pos >= nsub_)
+    return false;
+  Regexp* lit = sub[pos];
+  if (lit->op_ == kRegexpLiteralString) {
+    // Convert to string in proper encoding.
+    if (lit->parse_flags() & Latin1) {
+      prefix->resize(lit->nrunes_);
+      for (int j = 0; j < lit->nrunes_; j++)
+        (*prefix)[j] = lit->runes_[j];
+    } else {
+      // Encode as UTF-8 straight into the string:
+      // size it for the worst case, then trim to what was written.
+      prefix->resize(lit->nrunes_ * UTFmax);
+      char* const base = &(*prefix)[0];
+      char* out = base;
+      for (int j = 0; j < lit->nrunes_; j++) {
+        Rune r = lit->runes_[j];
+        if (r >= Runeself)
+          out += runetochar(out, &r);
+        else
+          *out++ = r;
+      }
+      prefix->resize(out - base);
+    }
+  } else if (lit->op_ == kRegexpLiteral) {
+    const bool one_byte =
+        (lit->parse_flags() & Latin1) || lit->rune_ < Runeself;
+    if (one_byte) {
+      prefix->append(1, lit->rune_);
+    } else {
+      char buf[UTFmax];
+      const int len = runetochar(buf, &lit->rune_);
+      prefix->append(buf, len);
+    }
+  } else {
+    return false;
+  }
+  *foldcase = (lit->parse_flags() & FoldCase) != 0;
+  // The rest.
+  const int rest = pos + 1;
+  Regexp* tail;
+  if (rest >= nsub_) {
+    tail = new Regexp(kRegexpEmptyMatch, parse_flags());
+  } else {
+    // The suffix shares these subexpressions, so take a reference on each.
+    for (int j = rest; j < nsub_; j++)
+      sub[j]->Incref();
+    tail = Concat(sub + rest, nsub_ - rest, parse_flags());
+  }
+  *suffix = tail;
+  return true;
+}
+// A character class builder is a balanced binary tree (STL set) of
+// RuneRanges that neither overlap nor abut one another.
+// The tree's less-than operator treats two ranges as equal if they
+// overlap at all, which makes it possible to look up the range
+// containing a particular Rune.
+// A new builder is empty: no runes and no ASCII letters recorded.
+CharClassBuilder::CharClassBuilder() {
+  upper_ = 0;
+  lower_ = 0;
+  nrunes_ = 0;
+}
+// Adds the inclusive range lo-hi to the class, merging it with ranges it overlaps or abuts; returns false if hi < lo or the range was already covered, true otherwise.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+  if (hi < lo)
+    return false;
+  if (lo <= 'z' && hi >= 'A') {
+    // May overlap A-Z and/or a-z, maybe only in part.
+    // Update bitmaps telling which ASCII letters are in the set.
+    Rune lo1 = max<Rune>(lo, 'A');
+    Rune hi1 = min<Rune>(hi, 'Z');
+    if (lo1 <= hi1)
+      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');  // bit k stands for 'A'+k
+    lo1 = max<Rune>(lo, 'a');
+    hi1 = min<Rune>(hi, 'z');
+    if (lo1 <= hi1)
+      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');  // bit k stands for 'a'+k
+  }
+  {  // Check whether lo-hi already lies inside the range containing lo.
+    iterator it = ranges_.find(RuneRange(lo, lo));
+    if (it != end() && it->lo <= lo && hi <= it->hi)
+      return false;
+  }
+  // Look for a range containing lo-1, i.e. abutting or overlapping lo on the left.
+  // If it exists, take it out and increase our range.
+  if (lo > 0) {
+    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+    if (it != end()) {
+      lo = it->lo;
+      if (it->hi > hi)
+        hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;  // its runes are counted again when the merged range is added
+      ranges_.erase(it);
+    }
+  }
+  // Look for a range containing hi+1, i.e. abutting or overlapping hi on the right.
+  // If it exists, take it out and increase our range.
+  if (hi < Runemax) {
+    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+    if (it != end()) {
+      hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;
+      ranges_.erase(it);
+    }
+  }
+  // Look for ranges between lo and hi.  Take them out.
+  // This is only safe because the set has no overlapping ranges.
+  // We've already removed any ranges abutting lo and hi, so
+  // any that overlap [lo, hi] must be contained within it.
+  for (;;) {
+    iterator it = ranges_.find(RuneRange(lo, hi));
+    if (it == end())
+      break;
+    nrunes_ -= it->hi - it->lo + 1;
+    ranges_.erase(it);
+  }
+  // Finally, add [lo, hi].
+  nrunes_ += hi - lo + 1;
+  ranges_.insert(RuneRange(lo, hi));
+  return true;
+}
+// Adds every range of cc to this class.
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+  iterator it = cc->begin();
+  while (it != cc->end()) {
+    AddRange(it->lo, it->hi);
+    ++it;
+  }
+}
+// Reports whether r belongs to the class.
+bool CharClassBuilder::Contains(Rune r) {
+  // Look up the one-rune range r-r in the set.
+  const RuneRange probe(r, r);
+  iterator hit = ranges_.find(probe);
+  return hit != end();
+}
+// Reports whether the class treats A-Z exactly as it treats a-z,
+// i.e. the two letter bitmaps agree on every letter.
+bool CharClassBuilder::FoldsASCII() {
+  const bool differs = ((upper_ ^ lower_) & AlphaMask) != 0;
+  return !differs;
+}
+// Returns a newly allocated builder with the same contents as this one.
+CharClassBuilder* CharClassBuilder::Copy() {
+  CharClassBuilder* dup = new CharClassBuilder;
+  iterator it = begin();
+  while (it != end()) {
+    dup->ranges_.insert(RuneRange(it->lo, it->hi));
+    ++it;
+  }
+  dup->nrunes_ = nrunes_;
+  dup->upper_ = upper_;
+  dup->lower_ = lower_;
+  return dup;
+}
+// Removes every rune greater than r from the class.
+void CharClassBuilder::RemoveAbove(Rune r) {
+  if (r >= Runemax)
+    return;
+  // Trim the ASCII letter bitmaps to the letters that are <= r.
+  if (r < 'a')
+    lower_ = 0;
+  else if (r < 'z')
+    lower_ &= AlphaMask >> ('z' - r);
+  if (r < 'A')
+    upper_ = 0;
+  else if (r < 'Z')
+    upper_ &= AlphaMask >> ('Z' - r);
+  // Pull out each range that reaches above r; if it also reaches
+  // down to r or below, put back the part that survives.
+  const RuneRange above(r + 1, Runemax);
+  iterator it = ranges_.find(above);
+  while (it != end()) {
+    RuneRange cut = *it;
+    ranges_.erase(it);
+    nrunes_ -= cut.hi - cut.lo + 1;
+    if (cut.lo <= r) {
+      cut.hi = r;
+      ranges_.insert(cut);
+      nrunes_ += cut.hi - cut.lo + 1;
+    }
+    it = ranges_.find(above);
+  }
+}
+// Replaces the class with its complement over 0-Runemax.
+void CharClassBuilder::Negate() {
+  // Collect the gaps between the current ranges, then swap them in.
+  // Could edit ranges in place, but C++ won't let me.
+  vector<RuneRange> gaps;
+  gaps.reserve(ranges_.size() + 1);
+  iterator it = begin();
+  if (it == end()) {
+    // Empty class: the complement is everything.
+    gaps.push_back(RuneRange(0, Runemax));
+  } else {
+    // The first gap begins at 0, unless the class itself begins at 0.
+    int gap_lo = 0;
+    if (it->lo == 0) {
+      gap_lo = it->hi + 1;
+      ++it;
+    }
+    while (it != end()) {
+      gaps.push_back(RuneRange(gap_lo, it->lo - 1));
+      gap_lo = it->hi + 1;
+      ++it;
+    }
+    if (gap_lo <= Runemax)
+      gaps.push_back(RuneRange(gap_lo, Runemax));
+  }
+  ranges_.clear();
+  for (vector<RuneRange>::iterator g = gaps.begin(); g != gaps.end(); ++g)
+    ranges_.insert(*g);
+  nrunes_ = Runemax+1 - nrunes_;
+  upper_ = AlphaMask & ~upper_;
+  lower_ = AlphaMask & ~lower_;
+}
+// A character class is a sorted list of ranges.
+// The ranges live in the same memory block as the header, which is
+// why the class has its own allocator (New) and Delete method.
+// New returns an empty class with room for maxranges ranges.
+CharClass* CharClass::New(int maxranges) {
+  // One block: the header first, the range array right behind it.
+  uint8* block = new uint8[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
+  CharClass* cc = reinterpret_cast<CharClass*>(block);
+  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
+  cc->nranges_ = 0;
+  cc->nrunes_ = 0;
+  cc->folds_ascii_ = false;
+  return cc;
+}
+// Frees the block obtained from New().
+// NOTE(review): testing `this` against NULL relies on calling a member
+// function through a null pointer, which is undefined behavior and may
+// be optimized away; callers should check for NULL before calling.
+void CharClass::Delete() {
+  if (this != NULL) {
+    uint8* block = reinterpret_cast<uint8*>(this);
+    delete[] block;
+  }
+}
+// Returns a newly allocated class holding the complement of this one
+// over 0-Runemax.
+CharClass* CharClass::Negate() {
+  CharClass* neg = CharClass::New(nranges_+1);
+  neg->folds_ascii_ = folds_ascii_;
+  neg->nrunes_ = Runemax + 1 - nrunes_;
+  int count = 0;
+  int gap_lo = 0;  // first rune not yet covered by a range seen so far
+  CharClass::iterator it = begin();
+  while (it != end()) {
+    // Emit the gap in front of this range, if there is one.
+    if (it->lo != gap_lo)
+      neg->ranges_[count++] = RuneRange(gap_lo, it->lo - 1);
+    gap_lo = it->hi + 1;
+    ++it;
+  }
+  if (gap_lo <= Runemax)
+    neg->ranges_[count++] = RuneRange(gap_lo, Runemax);
+  neg->nranges_ = count;
+  return neg;
+}
+// Reports whether r belongs to the class, by binary search over the
+// sorted range array.
+bool CharClass::Contains(Rune r) {
+  int first = 0;         // search window is ranges_[first, limit)
+  int limit = nranges_;
+  while (first < limit) {
+    const int mid = first + (limit - first)/2;
+    const RuneRange& probe = ranges_[mid];
+    if (probe.hi < r) {
+      first = mid + 1;
+    } else if (r < probe.lo) {
+      limit = mid;
+    } else {
+      // probe.lo <= r && r <= probe.hi
+      return true;
+    }
+  }
+  return false;
+}
+// Returns a newly allocated CharClass holding a snapshot of the
+// builder's ranges, rune count and ASCII-folding flag.
+CharClass* CharClassBuilder::GetCharClass() {
+  CharClass* cc = CharClass::New(ranges_.size());
+  int count = 0;
+  iterator it = begin();
+  while (it != end()) {
+    cc->ranges_[count] = *it;
+    ++count;
+    ++it;
+  }
+  cc->nranges_ = count;
+  DCHECK_LE(count, ranges_.size());
+  cc->nrunes_ = nrunes_;
+  cc->folds_ascii_ = FoldsASCII();
+  return cc;
+}
+}  // namespace re2