RubyGems - chipper - Versions diffs - 0.4.2 - Mend

chipper 0.4.2

Files changed (134) hide show

data/README.rdoc +51 -0
data/ext/extconf.rb +58 -0
data/ext/libstemmer_c/Makefile +10 -0
data/ext/libstemmer_c/examples/stemwords.c +209 -0
data/ext/libstemmer_c/include/libstemmer.h +79 -0
data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
data/ext/libstemmer_c/libstemmer/modules.h +190 -0
data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
data/ext/libstemmer_c/mkinc.mak +82 -0
data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
data/ext/libstemmer_c/runtime/api.c +66 -0
data/ext/libstemmer_c/runtime/api.h +26 -0
data/ext/libstemmer_c/runtime/header.h +58 -0
data/ext/libstemmer_c/runtime/utilities.c +478 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
data/ext/re2/bitstate.cc +378 -0
data/ext/re2/compile.cc +1138 -0
data/ext/re2/dfa.cc +2086 -0
data/ext/re2/filtered_re2.cc +100 -0
data/ext/re2/filtered_re2.h +99 -0
data/ext/re2/hash.cc +231 -0
data/ext/re2/mimics_pcre.cc +185 -0
data/ext/re2/nfa.cc +709 -0
data/ext/re2/onepass.cc +614 -0
data/ext/re2/parse.cc +2202 -0
data/ext/re2/perl_groups.cc +119 -0
data/ext/re2/prefilter.cc +671 -0
data/ext/re2/prefilter.h +105 -0
data/ext/re2/prefilter_tree.cc +398 -0
data/ext/re2/prefilter_tree.h +130 -0
data/ext/re2/prog.cc +341 -0
data/ext/re2/prog.h +376 -0
data/ext/re2/re2.cc +1180 -0
data/ext/re2/re2.h +837 -0
data/ext/re2/regexp.cc +920 -0
data/ext/re2/regexp.h +632 -0
data/ext/re2/rune.cc +258 -0
data/ext/re2/set.cc +113 -0
data/ext/re2/set.h +55 -0
data/ext/re2/simplify.cc +393 -0
data/ext/re2/stringpiece.cc +87 -0
data/ext/re2/stringpiece.h +182 -0
data/ext/re2/tostring.cc +341 -0
data/ext/re2/unicode_casefold.cc +469 -0
data/ext/re2/unicode_casefold.h +75 -0
data/ext/re2/unicode_groups.cc +4851 -0
data/ext/re2/unicode_groups.h +64 -0
data/ext/re2/valgrind.cc +24 -0
data/ext/re2/variadic_function.h +346 -0
data/ext/re2/walker-inl.h +244 -0
data/ext/src/chipper.cc +626 -0
data/ext/src/version.h +1 -0
data/ext/stemmer.rb +40 -0
data/ext/util/arena.h +103 -0
data/ext/util/atomicops.h +79 -0
data/ext/util/benchmark.h +41 -0
data/ext/util/flags.h +27 -0
data/ext/util/logging.h +78 -0
data/ext/util/mutex.h +190 -0
data/ext/util/pcre.h +679 -0
data/ext/util/random.h +29 -0
data/ext/util/sparse_array.h +451 -0
data/ext/util/sparse_set.h +177 -0
data/ext/util/test.h +57 -0
data/ext/util/thread.h +26 -0
data/ext/util/utf.h +43 -0
data/ext/util/util.h +127 -0
data/ext/util/valgrind.h +4517 -0
data/test/helper.rb +5 -0
data/test/test_entities.rb +57 -0
data/test/test_tokens.rb +118 -0
metadata +199 -0

data/ext/re2/nfa.cc ADDED Viewed

@@ -0,0 +1,709 @@
+// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Tested by search_test.cc.
+//
+// Prog::SearchNFA, an NFA search.
+// This is an actual NFA like the theorists talk about,
+// not the pseudo-NFA found in backtracking regexp implementations.
+//
+// IMPLEMENTATION
+//
+// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
+// which is a variant of the one described in Thompson's 1968 CACM paper.
+// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
+// over the DFA implementation is that it tracks submatch boundaries.
+//
+// When the choice of submatch boundaries is ambiguous, this particular
+// implementation makes the same choices that traditional backtracking
+// implementations (in particular, Perl and PCRE) do.
+// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
+// time in the length of the input.
+//
+// Like Thompson's original machine and like the DFA implementation, this
+// implementation notices a match only once it is one byte past it.
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "util/sparse_array.h"
+#include "util/sparse_set.h"
+namespace re2 {
+class NFA {
+ public:
+  NFA(Prog* prog);
+  ~NFA();
+  // Searches for a matching string.
+  //   * If anchored is true, only considers matches starting at text.begin().
+  //     Otherwise finds leftmost match at or after text.begin().
+  //   * If longest is true, returns the longest match starting
+  //     at the chosen start point.  Otherwise returns the so-called
+  //     left-biased match, the one traditional backtracking engines
+  //     (like Perl and PCRE) find.
+  // Records submatch boundaries in submatch[1..nsubmatch-1].
+  // Submatch[0] is the entire match.  When there is a choice in
+  // which text matches each subexpression, the submatch boundaries
+  // are chosen to match what a backtracking implementation would choose.
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+  static const int Debug = 0;  // nonzero enables the fprintf(stderr, ...) traces
+ private:
+  struct Thread {
+    union {
+      int id;        // instruction id, when on a Threadq
+      Thread* next;  // when on free list
+    };
+    const char** capture;  // array of ncapture_ pointers owned by the thread
+  };
+  // State for explicit stack in AddToThreadq.
+  struct AddState {
+    int id;           // Inst to process (0 means nothing to process)
+    int j;            // capture slot to restore, or -1 for none
+    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing id
+    AddState()
+      : id(0), j(-1), cap_j(NULL) {}
+    explicit AddState(int id)
+      : id(id), j(-1), cap_j(NULL) {}
+    AddState(int id, const char* cap_j, int j)
+      : id(id), j(j), cap_j(cap_j) {}
+  };
+  // Threadq is a list of threads.  The list is sorted by the order
+  // in which Perl would explore that particular state -- the earlier
+  // choices appear earlier in the list.
+  typedef SparseArray<Thread*> Threadq;
+  inline Thread* AllocThread();
+  inline void FreeThread(Thread*);
+  // Add id (or its children, following unlabeled arrows)
+  // to the workqueue q with associated capture info.
+  void AddToThreadq(Threadq* q, int id, int flag,
+                    const char* p, const char** capture);
+  // Run runq on byte c, appending new states to nextq.
+  // Updates matched_ and match_ as new, better matches are found.
+  // p is the position of byte c in the input string, used when
+  // recording match ends and (as p+1) when processing capturing parens.
+  // flag is the bitwise or of Bol, Eol, etc., specifying whether
+  // ^, $ and \b match the current input point (after c).
+  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+  // Returns text version of capture information, for debugging.
+  string FormatCapture(const char** capture);
+  inline void CopyCapture(const char** dst, const char** src);
+  // Computes whether all matches must begin with the same first
+  // byte, and if so, returns that byte.  If not, returns -1.
+  int ComputeFirstByte();
+  Prog* prog_;          // underlying program
+  int start_;           // start instruction in program
+  int ncapture_;        // number of submatches to track
+  bool longest_;        // whether searching for longest match
+  bool endmatch_;       // whether match must end at text.end()
+  const char* btext_;   // beginning of text being matched (for FormatCapture)
+  const char* etext_;   // end of text being matched (for endmatch_)
+  Threadq q0_, q1_;     // pre-allocated for Search.
+  const char** match_;  // best match so far
+  bool matched_;        // any match so far?
+  AddState* astack_;    // pre-allocated for AddToThreadq
+  int nastack_;         // number of entries allocated in astack_
+  int first_byte_;      // required first byte for match, or -1 if none
+  Thread* free_threads_;  // free list
+  DISALLOW_EVIL_CONSTRUCTORS(NFA);
+};
+// Prepares an NFA executor for prog: sizes both thread queues and the
+// AddToThreadq work stack from the program size, and precomputes the
+// required first byte (if any).  Per-search state starts out cleared.
+NFA::NFA(Prog* prog)
+  : prog_(prog),
+    start_(prog->start()),
+    ncapture_(0),
+    longest_(false),
+    endmatch_(false),
+    btext_(NULL),
+    etext_(NULL),
+    match_(NULL),
+    matched_(false),
+    astack_(NULL),
+    nastack_(0),
+    free_threads_(NULL) {
+  const int ninst = prog_->size();
+  q0_.resize(ninst);
+  q1_.resize(ninst);
+  // Two stack slots per instruction; see AddToThreadq for why that suffices.
+  nastack_ = 2*ninst;
+  astack_ = new AddState[nastack_];
+  first_byte_ = ComputeFirstByte();
+}
+// Releases the match buffer, the AddToThreadq stack, and every Thread
+// parked on the free list together with its capture array.
+NFA::~NFA() {
+  delete[] match_;
+  delete[] astack_;
+  Thread* cur = free_threads_;
+  while (cur != NULL) {
+    // Grab the link first: cur is destroyed below.
+    Thread* following = cur->next;
+    delete[] cur->capture;
+    delete cur;
+    cur = following;
+  }
+}
+// Pushes t onto the front of the free list for later reuse.
+// A NULL thread is ignored.
+void NFA::FreeThread(Thread *t) {
+  if (t != NULL) {
+    t->next = free_threads_;
+    free_threads_ = t;
+  }
+}
+// Hands out a Thread: pops one from the free list when available,
+// otherwise allocates a new one with room for ncapture_ pointers.
+// The id and the capture contents are left for the caller to fill in.
+NFA::Thread* NFA::AllocThread() {
+  if (free_threads_ != NULL) {
+    Thread* recycled = free_threads_;
+    free_threads_ = recycled->next;
+    return recycled;
+  }
+  Thread* fresh = new Thread;
+  fresh->capture = new const char*[ncapture_];
+  return fresh;
+}
+// Copies the capture pointers from src to dst one (begin, end) pair
+// at a time, covering slots [0, ncapture_).
+void NFA::CopyCapture(const char** dst, const char** src) {
+  for (int k = 0; k < ncapture_; k += 2) {
+    const char** to = dst + k;
+    const char** from = src + k;
+    to[0] = from[0];
+    to[1] = from[1];
+  }
+}
+// Follows all empty arrows from id0 and enqueues all the states reached.
+// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+// The pointer p is the current input position, and capture is the
+// current set of match boundaries (modified and restored along the way).
+void NFA::AddToThreadq(Threadq* q, int id0, int flag,
+                       const char* p, const char** capture) {
+  if (id0 == 0)  // id 0 is never enqueued
+    return;
+  // Astack_ is pre-allocated to avoid resize operations.
+  // It has room for 2*prog_->size() entries, which is enough:
+  // Each inst in prog can be processed at most once,
+  // pushing at most two entries on stk.
+  int nstk = 0;
+  AddState* stk = astack_;
+  stk[nstk++] = AddState(id0);
+  while (nstk > 0) {
+    DCHECK_LE(nstk, nastack_);
+    const AddState& a = stk[--nstk];
+    if (a.j >= 0)  // entry carries a capture slot to restore
+      capture[a.j] = a.cap_j;
+    int id = a.id;  // copied out before later pushes can overwrite the slot a refers to
+    if (id == 0)  // nothing to enqueue (e.g. a capture-restore dummy)
+      continue;
+    if (q->has_index(id)) {  // already visited for this queue
+      if (Debug)
+        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
+      continue;
+    }
+    // Create entry in q no matter what.  We might fill it in below,
+    // or we might not.  Even if not, it is necessary to have it,
+    // so that we don't revisit id during the traversal.
+    q->set_new(id, NULL);
+    Thread** tp = &q->find(id)->second;
+    int j;
+    Thread* t;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+    default:
+      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
+      break;
+    case kInstFail:
+      break;
+    case kInstAltMatch:
+      // Save state; will pick up at next byte.
+      t = AllocThread();
+      t->id = id;
+      CopyCapture(t->capture, capture);
+      *tp = t;
+      // fall through
+    case kInstAlt:
+      // Explore alternatives: out() is pushed last so it is popped first.
+      stk[nstk++] = AddState(ip->out1());
+      stk[nstk++] = AddState(ip->out());
+      break;
+    case kInstNop:
+      // Continue on.
+      stk[nstk++] = AddState(ip->out());
+      break;
+    case kInstCapture:
+      if ((j=ip->cap()) < ncapture_) {
+        // Push a dummy whose only job is to restore capture[j]
+        // once we finish exploring this possibility.
+        stk[nstk++] = AddState(0, capture[j], j);
+        // Record capture.
+        capture[j] = p;
+      }
+      stk[nstk++] = AddState(ip->out());
+      break;
+    case kInstMatch:
+    case kInstByteRange:
+      // Save state; will pick up at next byte.
+      t = AllocThread();
+      t->id = id;
+      CopyCapture(t->capture, capture);
+      *tp = t;
+      if (Debug)
+        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
+      break;
+    case kInstEmptyWidth:
+      // Continue on if we have all the right flag bits.
+      if (ip->empty() & ~flag)  // some required bit is missing from flag
+        break;
+      stk[nstk++] = AddState(ip->out());
+      break;
+    }
+  }
+}
+// Run runq on byte c, appending new states to nextq.
+// Updates match_ and matched_ as new, better matches are found.
+// p is position of the byte c in the input string,
+// used when processing capturing parens.
+// flag is the bitwise or of Bol, Eol, etc., specifying whether
+// ^, $ and \b match the current input point (after c).
+// Frees all the threads on runq and leaves runq empty.
+// If there is a shortcut to the end (a kInstAltMatch at the head of
+// runq that we are willing to take), returns the instruction id at
+// which the caller should resume; otherwise returns 0.
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+  nextq->clear();
+  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+    Thread* t = i->second;
+    if (t == NULL)
+      continue;
+    if (longest_) {
+      // Can skip any threads started after our current best match.
+      if (matched_ && match_[0] < t->capture[0]) {
+        FreeThread(t);
+        continue;
+      }
+    }
+    int id = t->id;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+      default:
+        // Should only see the values handled below.
+        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
+        break;
+      case kInstByteRange:
+        if (ip->Matches(c))
+          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
+        break;
+      case kInstAltMatch:
+        // Only the highest-priority thread may take the shortcut.
+        if (i != runq->begin())
+          break;
+        // The match is ours if we want it.
+        if (ip->greedy(prog_) || longest_) {
+          CopyCapture(match_, t->capture);
+          FreeThread(t);
+          for (++i; i != runq->end(); ++i)
+            FreeThread(i->second);
+          runq->clear();
+          matched_ = true;
+          if (ip->greedy(prog_))
+            return ip->out1();
+          return ip->out();
+        }
+        break;
+      case kInstMatch: {
+        if (endmatch_ && p != etext_)
+          break;
+        // Temporarily record the match end in the thread's own capture
+        // array so that it can be compared and copied as a unit.
+        const char* old = t->capture[1];  // previous end pointer
+        t->capture[1] = p;
+        if (longest_) {
+          // Leftmost-longest mode: save this match only if
+          // it is either farther to the left or at the same
+          // point but longer than an existing match.
+          if (!matched_ || t->capture[0] < match_[0] ||
+              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
+            CopyCapture(match_, t->capture);
+        } else {
+          // Leftmost-biased mode: this match is by definition
+          // better than what we've already found (see next line).
+          CopyCapture(match_, t->capture);
+          // Cut off the threads that can only find matches
+          // worse than the one we just found: don't run the
+          // rest of the current Threadq.
+          // Put back the end pointer saved above (slot 1, not slot 0).
+          t->capture[1] = old;
+          FreeThread(t);
+          for (++i; i != runq->end(); ++i)
+            FreeThread(i->second);
+          runq->clear();
+          matched_ = true;
+          return 0;
+        }
+        // Put back the end pointer saved above (slot 1, not slot 0).
+        t->capture[1] = old;
+        matched_ = true;
+        break;
+      }
+    }
+    FreeThread(t);
+  }
+  runq->clear();
+  return 0;
+}
+// Renders the capture array as a sequence of "(begin,end)" offset pairs
+// relative to btext_, with '?' standing in for unset entries.
+// Used only by the debugging traces.
+string NFA::FormatCapture(const char** capture) {
+  string out;
+  for (int k = 0; k < ncapture_; k += 2) {
+    if (capture[k] == NULL) {
+      StringAppendF(&out, "(?,?)");
+      continue;
+    }
+    const int begin_off = static_cast<int>(capture[k] - btext_);
+    if (capture[k+1] == NULL) {
+      StringAppendF(&out, "(%d,?)", begin_off);
+      continue;
+    }
+    const int end_off = static_cast<int>(capture[k+1] - btext_);
+    StringAppendF(&out, "(%d,%d)", begin_off, end_off);
+  }
+  return out;
+}
+// Reports whether the memory range of needle lies entirely within
+// the memory range of haystack.
+static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
+  if (needle.begin() < haystack.begin())
+    return false;
+  return needle.end() <= haystack.end();
+}
+// Runs the NFA over text (which must lie inside const_context, or
+// const_context may be empty-with-NULL-data to mean "same as text")
+// and reports whether a match was found.  On success fills
+// submatch[0..nsubmatch-1] from the best match.
+//   * anchored: only consider matches starting at text.begin()
+//     (forced on when the program is anchored at the start).
+//   * longest: leftmost-longest rather than leftmost-biased matching
+//     (forced on when the program is anchored at the end).
+// Returns false without searching when the program has no start
+// instruction, when context does not contain text, when an anchored
+// program cannot match because text does not reach the required edge
+// of context, or when nsubmatch is negative.
+bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
+            bool anchored, bool longest,
+            StringPiece* submatch, int nsubmatch) {
+  if (start_ == 0)
+    return false;
+  StringPiece context = const_context;
+  if (context.begin() == NULL)
+    context = text;
+  if (!StringPieceContains(context, text)) {
+    LOG(FATAL) << "Bad args: context does not contain text "
+                << reinterpret_cast<const void*>(context.begin())
+                << "+" << context.size() << " "
+                << reinterpret_cast<const void*>(text.begin())
+                << "+" << text.size();
+    return false;
+  }
+  if (prog_->anchor_start() && context.begin() != text.begin())
+    return false;
+  if (prog_->anchor_end() && context.end() != text.end())
+    return false;
+  anchored |= prog_->anchor_start();
+  if (prog_->anchor_end()) {
+    longest = true;
+    endmatch_ = true;
+    etext_ = text.end();
+  }
+  if (nsubmatch < 0) {
+    LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
+    return false;
+  }
+  // Save search parameters.
+  ncapture_ = 2*nsubmatch;
+  longest_ = longest;
+  if (nsubmatch == 0) {
+    // We need to maintain match[0], both to distinguish the
+    // longest match (if longest is true) and also to tell
+    // whether we've seen any matches at all.
+    ncapture_ = 2;
+  }
+  // Release any buffer left over from an earlier call so that a
+  // repeated Search on this object does not leak it.
+  delete[] match_;
+  match_ = new const char*[ncapture_];
+  matched_ = false;
+  memset(match_, 0, ncapture_*sizeof match_[0]);
+  // For debugging prints.
+  btext_ = context.begin();
+  if (Debug) {
+    fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
+            text.as_string().c_str(), context.as_string().c_str(), anchored,
+            longest);
+  }
+  // Set up search.
+  Threadq* runq = &q0_;
+  Threadq* nextq = &q1_;
+  runq->clear();
+  nextq->clear();
+  const char* bp = context.begin();
+  // c is the byte just before the current position (-1 if none);
+  // wasword records whether that byte is a word character.
+  int c = -1;
+  int wasword = 0;
+  if (text.begin() > context.begin()) {
+    c = text.begin()[-1] & 0xFF;
+    wasword = Prog::IsWordChar(c);
+  }
+  // Loop over the text, stepping the machine.
+  // The loop runs one position past text.end() because a match is
+  // only noticed one byte after it ends.
+  for (const char* p = text.begin();; p++) {
+    // Check for empty-width specials.
+    int flag = 0;
+    // ^ and \A
+    if (p == context.begin())
+      flag |= kEmptyBeginText | kEmptyBeginLine;
+    else if (p <= context.end() && p[-1] == '\n')
+      flag |= kEmptyBeginLine;
+    // $ and \z
+    if (p == context.end())
+      flag |= kEmptyEndText | kEmptyEndLine;
+    else if (p < context.end() && p[0] == '\n')
+      flag |= kEmptyEndLine;
+    // \b and \B
+    int isword = 0;
+    if (p < context.end())
+      isword = Prog::IsWordChar(p[0] & 0xFF);
+    if (isword != wasword)
+      flag |= kEmptyWordBoundary;
+    else
+      flag |= kEmptyNonWordBoundary;
+    if (Debug) {
+      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
+      for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+        Thread* t = i->second;
+        if (t == NULL)
+          continue;
+        fprintf(stderr, " %d%s", t->id,
+                FormatCapture(t->capture).c_str());
+      }
+      fprintf(stderr, "\n");
+    }
+    // Process previous character (waited until now to avoid
+    // repeating the flag computation above).
+    // This is a no-op the first time around the loop, because
+    // runq is empty.
+    int id = Step(runq, nextq, c, flag, p-1);
+    DCHECK_EQ(runq->size(), 0);
+    swap(nextq, runq);
+    nextq->clear();
+    if (id != 0) {
+      // We're done: full match ahead.
+      // Walk the remaining empty instructions by hand, recording
+      // captures at the end of the text.
+      p = text.end();
+      for (;;) {
+        Prog::Inst* ip = prog_->inst(id);
+        switch (ip->opcode()) {
+          default:
+            LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
+            break;
+          case kInstCapture:
+            // The program may contain more capture slots than the
+            // caller asked for; match_ only has ncapture_ entries.
+            if (ip->cap() < ncapture_)
+              match_[ip->cap()] = p;
+            id = ip->out();
+            continue;
+          case kInstNop:
+            id = ip->out();
+            continue;
+          case kInstMatch:
+            match_[1] = p;
+            matched_ = true;
+            break;
+          case kInstEmptyWidth:
+            if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
+              LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
+              break;
+            }
+            id = ip->out();
+            continue;
+        }
+        break;
+      }
+      break;
+    }
+    if (p > text.end())
+      break;
+    // Start a new thread if there have not been any matches.
+    // (No point in starting a new thread if there have been
+    // matches, since it would be to the right of the match
+    // we already found.)
+    if (!matched_ && (!anchored || p == text.begin())) {
+      // If there's a required first byte for an unanchored search
+      // and we're not in the middle of any possible matches,
+      // use memchr to search for the byte quickly.
+      if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
+          p < text.end() && (p[0] & 0xFF) != first_byte_) {
+        p = reinterpret_cast<const char*>(memchr(p, first_byte_,
+                                                 text.end() - p));
+        if (p == NULL) {
+          p = text.end();
+          isword = 0;
+        } else {
+          isword = Prog::IsWordChar(p[0] & 0xFF);
+        }
+        // p moved, so the empty-width flags must be recomputed.
+        flag = Prog::EmptyFlags(context, p);
+      }
+      // Steal match storage (cleared but unused as of yet)
+      // temporarily to hold match boundaries for new thread.
+      match_[0] = p;
+      AddToThreadq(runq, start_, flag, p, match_);
+      match_[0] = NULL;
+    }
+    // If all the threads have died, stop early.
+    if (runq->size() == 0) {
+      if (Debug)
+        fprintf(stderr, "dead\n");
+      break;
+    }
+    if (p == text.end())
+      c = 0;
+    else
+      c = *p & 0xFF;
+    wasword = isword;
+    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
+  }
+  // Recycle whatever threads are still queued.
+  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
+    FreeThread(i->second);
+  if (matched_) {
+    for (int i = 0; i < nsubmatch; i++)
+      submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
+    if (Debug)
+      fprintf(stderr, "match (%d,%d)\n",
+              static_cast<int>(match_[0] - btext_),
+              static_cast<int>(match_[1] - btext_));
+    return true;
+  }
+  VLOG(1) << "No matches found";
+  return false;
+}
+// Computes whether all successful matches have a common first byte,
+// and if so, returns that byte.  If not (or if start_ is 0), returns -1.
+int NFA::ComputeFirstByte() {
+  if (start_ == 0)
+    return -1;
+  int b = -1;  // first byte, not yet computed
+  typedef SparseSet Workq;
+  Workq q(prog_->size());
+  q.insert(start_);
+  for (Workq::iterator it = q.begin(); it != q.end(); ++it) {  // q.insert below adds work while this loop runs
+    int id = *it;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+      default:
+        LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
+        break;
+      case kInstMatch:
+        // The empty string matches: no first byte.
+        return -1;
+      case kInstByteRange:
+        // Must match only a single byte
+        if (ip->lo() != ip->hi())
+          return -1;
+        if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')  // presumably also matches the uppercase byte
+          return -1;
+        // If we haven't seen any bytes yet, record it;
+        // otherwise must match the one we saw before.
+        if (b == -1)
+          b = ip->lo();
+        else if (b != ip->lo())
+          return -1;
+        break;
+      case kInstNop:
+      case kInstCapture:
+      case kInstEmptyWidth:
+        // Continue on.
+        // Ignore ip->empty() flags for kInstEmptyWidth
+        // in order to be as conservative as possible
+        // (assume all possible empty-width flags are true).
+        if (ip->out())
+          q.insert(ip->out());
+        break;
+      case kInstAlt:
+      case kInstAltMatch:
+        // Explore alternatives.
+        if (ip->out())
+          q.insert(ip->out());
+        if (ip->out1())
+          q.insert(ip->out1());
+        break;
+      case kInstFail:
+        break;
+    }
+  }
+  return b;  // still -1 if no kInstByteRange was reached
+}
+// Entry point for NFA matching: builds a one-shot NFA for this program
+// and searches text within context.  For kFullMatch the search is
+// forced to be anchored and the match must also end at text.end();
+// a scratch StringPiece is supplied when the caller asked for no
+// submatches so that the end of the match can still be checked.
+bool
+Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
+                Anchor anchor, MatchKind kind,
+                StringPiece* match, int nmatch) {
+  if (NFA::Debug)
+    Dump();
+  NFA nfa(this);
+  StringPiece sp;
+  const bool want_full = (kind == kFullMatch);
+  if (want_full) {
+    anchor = kAnchored;
+    if (nmatch == 0) {
+      match = &sp;
+      nmatch = 1;
+    }
+  }
+  const bool anchored = (anchor == kAnchored);
+  const bool longest = (kind != kFirstMatch);
+  if (!nfa.Search(text, context, anchored, longest, match, nmatch))
+    return false;
+  // A full match must consume the text right up to its end.
+  return !want_full || match[0].end() == text.end();
+}
+}  // namespace re2