RubyGems - sdsykes-ferret - Versions diffs - 0.11.6.19 - Mend

sdsykes-ferret 0.11.6.19

Files changed (195) hide show

data/CHANGELOG +24 -0
data/MIT-LICENSE +20 -0
data/README +102 -0
data/Rakefile +338 -0
data/TODO +17 -0
data/TUTORIAL +231 -0
data/bin/ferret-browser +79 -0
data/ext/analysis.c +1555 -0
data/ext/analysis.h +219 -0
data/ext/api.c +69 -0
data/ext/api.h +27 -0
data/ext/array.c +123 -0
data/ext/array.h +53 -0
data/ext/bitvector.c +540 -0
data/ext/bitvector.h +272 -0
data/ext/compound_io.c +383 -0
data/ext/config.h +42 -0
data/ext/document.c +156 -0
data/ext/document.h +53 -0
data/ext/except.c +120 -0
data/ext/except.h +168 -0
data/ext/extconf.rb +14 -0
data/ext/ferret.c +402 -0
data/ext/ferret.h +91 -0
data/ext/filter.c +156 -0
data/ext/fs_store.c +483 -0
data/ext/global.c +418 -0
data/ext/global.h +117 -0
data/ext/hash.c +567 -0
data/ext/hash.h +473 -0
data/ext/hashset.c +170 -0
data/ext/hashset.h +187 -0
data/ext/header.h +58 -0
data/ext/helper.c +62 -0
data/ext/helper.h +13 -0
data/ext/inc/lang.h +48 -0
data/ext/inc/threading.h +31 -0
data/ext/index.c +6425 -0
data/ext/index.h +961 -0
data/ext/lang.h +66 -0
data/ext/libstemmer.c +92 -0
data/ext/libstemmer.h +79 -0
data/ext/mempool.c +87 -0
data/ext/mempool.h +35 -0
data/ext/modules.h +162 -0
data/ext/multimapper.c +310 -0
data/ext/multimapper.h +51 -0
data/ext/posh.c +1006 -0
data/ext/posh.h +1007 -0
data/ext/priorityqueue.c +151 -0
data/ext/priorityqueue.h +143 -0
data/ext/q_boolean.c +1608 -0
data/ext/q_const_score.c +161 -0
data/ext/q_filtered_query.c +209 -0
data/ext/q_fuzzy.c +268 -0
data/ext/q_match_all.c +148 -0
data/ext/q_multi_term.c +677 -0
data/ext/q_parser.c +2825 -0
data/ext/q_phrase.c +1126 -0
data/ext/q_prefix.c +100 -0
data/ext/q_range.c +350 -0
data/ext/q_span.c +2402 -0
data/ext/q_term.c +337 -0
data/ext/q_wildcard.c +171 -0
data/ext/r_analysis.c +2575 -0
data/ext/r_index.c +3472 -0
data/ext/r_qparser.c +585 -0
data/ext/r_search.c +4105 -0
data/ext/r_store.c +513 -0
data/ext/r_utils.c +963 -0
data/ext/ram_store.c +471 -0
data/ext/search.c +1741 -0
data/ext/search.h +885 -0
data/ext/similarity.c +150 -0
data/ext/similarity.h +82 -0
data/ext/sort.c +983 -0
data/ext/stem_ISO_8859_1_danish.c +338 -0
data/ext/stem_ISO_8859_1_danish.h +16 -0
data/ext/stem_ISO_8859_1_dutch.c +635 -0
data/ext/stem_ISO_8859_1_dutch.h +16 -0
data/ext/stem_ISO_8859_1_english.c +1156 -0
data/ext/stem_ISO_8859_1_english.h +16 -0
data/ext/stem_ISO_8859_1_finnish.c +792 -0
data/ext/stem_ISO_8859_1_finnish.h +16 -0
data/ext/stem_ISO_8859_1_french.c +1276 -0
data/ext/stem_ISO_8859_1_french.h +16 -0
data/ext/stem_ISO_8859_1_german.c +512 -0
data/ext/stem_ISO_8859_1_german.h +16 -0
data/ext/stem_ISO_8859_1_italian.c +1091 -0
data/ext/stem_ISO_8859_1_italian.h +16 -0
data/ext/stem_ISO_8859_1_norwegian.c +296 -0
data/ext/stem_ISO_8859_1_norwegian.h +16 -0
data/ext/stem_ISO_8859_1_porter.c +776 -0
data/ext/stem_ISO_8859_1_porter.h +16 -0
data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
data/ext/stem_ISO_8859_1_portuguese.h +16 -0
data/ext/stem_ISO_8859_1_spanish.c +1119 -0
data/ext/stem_ISO_8859_1_spanish.h +16 -0
data/ext/stem_ISO_8859_1_swedish.c +307 -0
data/ext/stem_ISO_8859_1_swedish.h +16 -0
data/ext/stem_KOI8_R_russian.c +701 -0
data/ext/stem_KOI8_R_russian.h +16 -0
data/ext/stem_UTF_8_danish.c +344 -0
data/ext/stem_UTF_8_danish.h +16 -0
data/ext/stem_UTF_8_dutch.c +653 -0
data/ext/stem_UTF_8_dutch.h +16 -0
data/ext/stem_UTF_8_english.c +1176 -0
data/ext/stem_UTF_8_english.h +16 -0
data/ext/stem_UTF_8_finnish.c +808 -0
data/ext/stem_UTF_8_finnish.h +16 -0
data/ext/stem_UTF_8_french.c +1296 -0
data/ext/stem_UTF_8_french.h +16 -0
data/ext/stem_UTF_8_german.c +526 -0
data/ext/stem_UTF_8_german.h +16 -0
data/ext/stem_UTF_8_italian.c +1113 -0
data/ext/stem_UTF_8_italian.h +16 -0
data/ext/stem_UTF_8_norwegian.c +302 -0
data/ext/stem_UTF_8_norwegian.h +16 -0
data/ext/stem_UTF_8_porter.c +794 -0
data/ext/stem_UTF_8_porter.h +16 -0
data/ext/stem_UTF_8_portuguese.c +1055 -0
data/ext/stem_UTF_8_portuguese.h +16 -0
data/ext/stem_UTF_8_russian.c +709 -0
data/ext/stem_UTF_8_russian.h +16 -0
data/ext/stem_UTF_8_spanish.c +1137 -0
data/ext/stem_UTF_8_spanish.h +16 -0
data/ext/stem_UTF_8_swedish.c +313 -0
data/ext/stem_UTF_8_swedish.h +16 -0
data/ext/stopwords.c +401 -0
data/ext/store.c +692 -0
data/ext/store.h +777 -0
data/ext/term_vectors.c +352 -0
data/ext/threading.h +31 -0
data/ext/utilities.c +446 -0
data/ext/win32.h +54 -0
data/lib/ferret.rb +29 -0
data/lib/ferret/browser.rb +246 -0
data/lib/ferret/browser/s/global.js +192 -0
data/lib/ferret/browser/s/style.css +148 -0
data/lib/ferret/browser/views/document/list.rhtml +49 -0
data/lib/ferret/browser/views/document/show.rhtml +27 -0
data/lib/ferret/browser/views/error/index.rhtml +7 -0
data/lib/ferret/browser/views/help/index.rhtml +8 -0
data/lib/ferret/browser/views/home/index.rhtml +29 -0
data/lib/ferret/browser/views/layout.rhtml +22 -0
data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
data/lib/ferret/browser/views/term/index.rhtml +199 -0
data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
data/lib/ferret/browser/webrick.rb +14 -0
data/lib/ferret/document.rb +130 -0
data/lib/ferret/field_infos.rb +44 -0
data/lib/ferret/index.rb +786 -0
data/lib/ferret/number_tools.rb +157 -0
data/lib/ferret_version.rb +3 -0
data/setup.rb +1555 -0
data/test/test_all.rb +5 -0
data/test/test_helper.rb +24 -0
data/test/threading/number_to_spoken.rb +132 -0
data/test/threading/thread_safety_index_test.rb +79 -0
data/test/threading/thread_safety_read_write_test.rb +76 -0
data/test/threading/thread_safety_test.rb +133 -0
data/test/unit/analysis/tc_analyzer.rb +548 -0
data/test/unit/analysis/tc_token_stream.rb +646 -0
data/test/unit/index/tc_index.rb +762 -0
data/test/unit/index/tc_index_reader.rb +699 -0
data/test/unit/index/tc_index_writer.rb +437 -0
data/test/unit/index/th_doc.rb +315 -0
data/test/unit/largefile/tc_largefile.rb +46 -0
data/test/unit/query_parser/tc_query_parser.rb +238 -0
data/test/unit/search/tc_filter.rb +135 -0
data/test/unit/search/tc_fuzzy_query.rb +147 -0
data/test/unit/search/tc_index_searcher.rb +61 -0
data/test/unit/search/tc_multi_searcher.rb +128 -0
data/test/unit/search/tc_multiple_search_requests.rb +58 -0
data/test/unit/search/tc_search_and_sort.rb +179 -0
data/test/unit/search/tc_sort.rb +49 -0
data/test/unit/search/tc_sort_field.rb +27 -0
data/test/unit/search/tc_spans.rb +190 -0
data/test/unit/search/tm_searcher.rb +384 -0
data/test/unit/store/tc_fs_store.rb +77 -0
data/test/unit/store/tc_ram_store.rb +35 -0
data/test/unit/store/tm_store.rb +34 -0
data/test/unit/store/tm_store_lock.rb +68 -0
data/test/unit/tc_document.rb +81 -0
data/test/unit/ts_analysis.rb +2 -0
data/test/unit/ts_index.rb +2 -0
data/test/unit/ts_largefile.rb +4 -0
data/test/unit/ts_query_parser.rb +2 -0
data/test/unit/ts_search.rb +2 -0
data/test/unit/ts_store.rb +2 -0
data/test/unit/ts_utils.rb +2 -0
data/test/unit/utils/tc_bit_vector.rb +295 -0
data/test/unit/utils/tc_number_tools.rb +117 -0
data/test/unit/utils/tc_priority_queue.rb +106 -0
metadata +285 -0

data/TODO ADDED

@@ -0,0 +1,17 @@
+= TODO
+* user defined sorting
+* add field compression
+* Fix highlighting to work for compressed fields
+* Fix highlighting to work for external fields
+* Add Ferret::Index::Index
+* Fix:
+> Working Query:  field1:value1 AND NOT field2:value2
+> Failing Query:    field1:value1 AND ( NOT field2:value2 )
+= Done
+* Add string Sort descriptor
+* fix memory bug
+* add MultiReader interface
+* add lexicographical sort (byte sort)
+* Add highlighting

data/TUTORIAL ADDED

@@ -0,0 +1,231 @@
+= Quick Introduction to Ferret
+The simplest way to use Ferret is through the Ferret::Index::Index class.
+This is now aliased by Ferret::I for quick and easy access. Start by including
+the Ferret module.
+  require 'ferret'
+  include Ferret
+=== Creating an index
+To create an in memory index is very simple;
+  index = Index::Index.new()
+To create a persistent index;
+  index = Index::Index.new(:path => '/path/to/index')
+Both of these methods create new Indexes with the StandardAnalyzer. An
+analyzer is what you use to divide the input data up into tokens which you can
+search for later. If you'd like to use a different analyzer you can specify it
+here, eg;
+  index = Index::Index.new(:path => '/path/to/index',
+                           :analyzer => Analysis::WhiteSpaceAnalyzer.new)
+For more options when creating an Index refer to Ferret::Index::Index.
+=== Adding Documents
+To add a document you can simply add a string or an array of strings. This will
+store all the strings in the "" (ie empty string) field (unless you specify the
+default field when you create the index).
+  index << "This is a new document to be indexed"
+  index << ["And here", "is another", "new document", "to be indexed"]
+But these are pretty simple documents. If this is all you want to index you
+could probably just use SimpleSearch. So let's give our documents some fields;
+  index << {:title => "Programming Ruby", :content => "blah blah blah"}
+  index << {:title => "Programming Ruby", :content => "yada yada yada"}
+Note the way that all field-names are Symbols. Although Strings will work,
+this is a best-practice in Ferret. Or if you are indexing data stored in a
+database, you'll probably want to store the id;
+  index << {:id => row.id, :title => row.title, :date => row.date}
+So far we have been storing and tokenizing all of the input data along with
+term vectors. If we want to change this we need to change the way we setup the
+index. You must create a FieldInfos object describing the index:
+  field_infos = FieldInfos.new(:store => :no,
+                               :index => :untokenized_omit_norms,
+                               :term_vector => :no)
+The values that you set FieldInfos to have will be used by default by all
+fields. If you want to change the properties for specific fields, you need to
+add a FieldInfo to field_infos.
+  field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
+  field_infos.add_field(:content, :store => :yes,
+                                  :index => :yes,
+                                  :term_vector => :with_positions_offsets)
+If you need to add a field to an already open index you do so like this:
+  index.field_infos.add_field(:new_field, :store => :yes)
+=== Searching
+Now that we have data in our index, how do we actually use this index to
+search the data? The Index offers two search methods, Index#search and
+Index#search_each. The first method returns a Ferret::Index::TopDocs object.
+The second we'll show here. Let's say we wanted to find all documents with the
+phrase "quick brown fox" in the content field. We'd write;
+  index.search_each('content:"quick brown fox"') do |id, score|
+    puts "Document #{id} found with a score of #{score}"
+  end
+But "fast" has a pretty similar meaning to "quick" and we don't mind if the
+fox is a little red. Also, the phrase could be in the title so we'll search
+there as well. So we could expand our search like this;
+  index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
+    puts "Document #{id} found with a score of #{score}"
+  end
+What if we want to find all documents entered on or after 5th of September,
+2005 with the words "ruby" or "rails" in any field. We could type something like;
+  index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
+    puts "Document #{index[id][:title]} found with a score of #{score}"
+  end
+Ferret has quite a complex query language. To find out more about Ferret's
+query language, see Ferret::QueryParser. You can also construct even more
+complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
+for more information.
+=== Highlighting
+Ferret now has a super-fast highlighting method. See
+Ferret::Index::Index#highlight. Here is an example of how you would use it
+when printing to the console:
+  index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
+    puts "Document #{index[id][:title]} found with a score of #{score}"
+    highlights = index.highlight("content:(ruby OR rails)", id,
+                                 :field => :content,
+                                 :pre_tag => "\033[36m",
+                                 :post_tag => "\033[m")
+    puts highlights
+  end
+And if you want to highlight a whole document, set :excerpt_length to :all:
+  puts index.highlight(query, doc_id,
+                       :field => :content,
+                       :pre_tag => "\033[36m",
+                       :post_tag => "\033[m",
+                       :excerpt_length => :all)
+=== Accessing Documents
+You may have noticed that when we run a search we only get the document id
+back. By itself this isn't much use to us. Getting the data from the index is
+very straightforward. For example if we want the :title field from the 3rd
+document type;
+  index[2][:title]
+Documents are lazy loading so if you try this:
+  puts index[2]
+You will always get an empty Hash. To load all fields, call the load method:
+  puts index[2].load
+NOTE: documents are indexed from 0. You can also use array-like index
+parameters to access index. For example
+  index[1..4]
+  index[10, 10]
+  index[-5]
+The default field is :id (although you can change this with index's
+:default_create_field parameter);
+  index << "This is a document"
+  index[0][:id]
+Let's go back to the database example above. If we store all of our documents
+with an id then we can access that field using the id. As long as we called
+our id field :id we can do this
+  index["89721347"]["title"]
+Pretty simple huh? You should note though that if there are more than one
+document with the same *id* or *key* then only the first one will be returned
+so it is probably better that you ensure the key is unique somehow. By setting
+Index's :key attribute to :id, Ferret will do this automatically for you. It
+can even handle multiple field primary keys. For example, you could set
+:key to [:id, :model] and Ferret would keep the documents unique for that pair
+of fields.
+=== Modifying and Deleting Documents
+What if we want to change the data in the index. Ferret doesn't actually let
+you change the data once it is in the index. But you can delete documents so
+the standard way to modify data is to delete it and re-add it again with the
+modifications made. It is important to note that when doing this the documents
+will get a new document number so you should be careful not to use a document
+number after the document has been deleted. Here is an example of modifying a
+document;
+  index << {:title => "Programing Rbuy", :content => "blah blah blah"}
+  doc_id = nil
+  index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
+  return unless doc_id
+  doc = index[doc_id]
+  index.delete(doc_id)
+  # modify doc. It is just a Hash after all
+  doc[:title] = "Programming Ruby"
+  index << doc
+If you set the :key parameter as described in the last section there is no
+need to delete the document. It will be automatically deleted when you add
+another document with the same key.
+Also, we can use the id field, as above, to delete documents. This time though
+every document that matches the id will be deleted. Again, it is probably a
+good idea if you somehow ensure that your *ids* are kept unique.
+  id = "23453422"
+  index.delete(id)
+=== Onwards
+This is just a small sampling of what Ferret allows you to do.  Ferret, like
+Lucene, is designed to be extended, and allows you to construct your own query
+types, analyzers, and so on. Going onwards you should check out the following
+documentation:
+* Ferret::Analysis: for more information on how the data is processed when it
+  is tokenized. There are a number of things you can do with your data such as
+  adding stop lists or perhaps a porter stemmer. There are also a number of
+  analyzers already available and it is almost trivial to create a new one
+  with a simple regular expression.
+* Ferret::Search: for more information on querying the index. There are a
+  number of already available queries and it's unlikely you'll need to create
+  your own. You may however want to take advantage of the sorting or filtering
+  abilities of Ferret to present your data the best way you see fit.
+* Ferret::QueryParser: if you want to find out more about what you can do with
+  Ferret's Query Parser, this is the place to look. The query parser is one
+  area that could use a bit of work so please send your suggestions.
+* Ferret::Index: for more advanced access to the index you'll probably want to
+  use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
+  the place to look for more information on them.
+* Ferret::Store: This is the module used to access the actual index storage
+  and won't be of much interest to most people.

data/bin/ferret-browser ADDED

@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+$: << File.expand_path(File.join(File.basename(__FILE__), '../lib'))
+require 'ferret'
+require 'ferret/browser'
+require 'optparse'
+require 'ostruct'
+SERVER_OPTIONS = ['webrick']
+conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)
+opts = OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename($0)} /path/to/index"
+  opts.separator ""
+  opts.separator "Specific Options:"
+  opts.on("-h", "--host HOSTNAME",
+          "Host for web server to bind to (default is all IPs)") { |conf.host| }
+  opts.on("-p", "--port NUM",
+          "Port for web server (defaults to #{conf.port})") { |conf.port| }
+  opts.on("-s", "--server NAME",
+          "Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }
+  opts.separator ""
+  opts.separator "Common options:"
+  opts.on_tail("-?", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+  opts.on_tail("-v", "--version", "Show version") do
+    puts Ferret::VERSION
+    exit
+  end
+end
+opts.parse! ARGV
+if ARGV.length != 1
+  puts opts
+  exit
+end
+@path = ARGV[0]
+# Load the Ferret index
+begin
+  @reader = Ferret::Index::IndexReader.new(@path)
+rescue Ferret::FileNotFoundError => e
+  puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
+    "\033[33m#{@path}\033[m\033[31m\".\033[m"
+  exit
+rescue Exception => e
+  puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\031[m"
+  exit
+end
+unless conf.server
+  conf.server = :webrick
+end
+case conf.server.to_s
+when 'webrick'
+  require 'webrick/httpserver'
+  require 'ferret/browser/webrick'
+  # Mount the root
+  s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
+  s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
+  s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path
+  # Server up
+  trap(:INT) do
+    s.shutdown
+  end
+  s.start
+else
+  raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
+end

data/ext/analysis.c ADDED

@@ -0,0 +1,1555 @@
+#include "analysis.h"
+#include "hash.h"
+#include "libstemmer.h"
+#include <string.h>
+#include <ctype.h>
+#include <wctype.h>
+#include <wchar.h>
+/****************************************************************************
+ *
+ * Token
+ *
+ ****************************************************************************/
+/*
+ * Fill tk with a copy of the first tlen bytes of text plus the given offsets
+ * and position increment. Text of MAX_WORD_SIZE bytes or more is truncated
+ * to MAX_WORD_SIZE - 1 bytes so the terminating NUL always fits.
+ *
+ * Returns tk.
+ */
+INLINE Token *tk_set(Token *tk,
+                     char *text, int tlen, off_t start, off_t end, int pos_inc)
+{
+    /* number of bytes actually stored, leaving room for the NUL */
+    const int copy_len = (tlen >= MAX_WORD_SIZE) ? MAX_WORD_SIZE - 1 : tlen;
+
+    memcpy(tk->text, text, sizeof(char) * copy_len);
+    tk->text[copy_len] = '\0';
+    tk->len     = copy_len;
+    tk->start   = start;
+    tk->end     = end;
+    tk->pos_inc = pos_inc;
+    return tk;
+}
+/*
+ * Set tk from the byte range [start, end) of the buffer that begins at text.
+ * The token text is copied from start, and the offsets stored in the token
+ * are the distances of start and end from text.
+ *
+ * Returns tk.
+ */
+INLINE Token *tk_set_ts(Token *tk,
+                        char *start, char *end, char *text, int pos_inc)
+{
+    return tk_set(tk, start, (int)(end - start),
+                  (off_t)(start - text), (off_t)(end - text), pos_inc);
+}
+/*
+ * Same as tk_set() for a NUL-terminated text: the length is measured with
+ * strlen() instead of being passed in.
+ *
+ * Returns tk.
+ */
+INLINE Token *tk_set_no_len(Token *tk,
+                            char *text, off_t start, off_t end, int pos_inc)
+{
+    return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
+}
+/*
+ * Set tk from the NUL-terminated wide string text, converting it to the
+ * current locale's multi-byte encoding with wcstombs(). At most
+ * MAX_WORD_SIZE - 1 bytes are stored, followed by a terminating NUL.
+ *
+ * wcstombs() returns (size_t)-1 when text contains a wide character that
+ * cannot be converted. Indexing tk->text with that value would write one
+ * byte before the buffer, so the token text is left empty in that case.
+ *
+ * Returns tk.
+ */
+INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
+                       int pos_inc)
+{
+    size_t len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
+    if (len == (size_t)-1) {
+        len = 0;                /* conversion failed: emit an empty text */
+    }
+    tk->text[len] = '\0';
+    tk->len = (int)len;
+    tk->start = start;
+    tk->end = end;
+    tk->pos_inc = pos_inc;
+    return tk;
+}
+/*
+ * Return 1 when both tokens have the same text, start and end offsets and
+ * position increment, 0 otherwise.
+ */
+int tk_eq(Token *tk1, Token *tk2)
+{
+    /* cheap integer comparisons first, string comparison last */
+    if (tk1->start != tk2->start || tk1->end != tk2->end) {
+        return 0;
+    }
+    if (tk1->pos_inc != tk2->pos_inc) {
+        return 0;
+    }
+    return strcmp((char *)tk1->text, (char *)tk2->text) == 0;
+}
+/*
+ * Three-way comparison of two tokens: ordered by start offset, then by end
+ * offset, then by strcmp() of the token text.
+ *
+ * Returns a negative value, zero or a positive value when tk1 sorts before,
+ * equal to or after tk2.
+ */
+int tk_cmp(Token *tk1, Token *tk2)
+{
+    if (tk1->start != tk2->start) {
+        return (tk1->start > tk2->start) ? 1 : -1;
+    }
+    if (tk1->end != tk2->end) {
+        return (tk1->end > tk2->end) ? 1 : -1;
+    }
+    /* same span: fall back to the text itself */
+    return strcmp((char *)tk1->text, (char *)tk2->text);
+}
+/*
+ * Release a token with free(). Takes a void pointer so it can be used where
+ * a generic destructor callback is expected.
+ */
+void tk_destroy(void *p)
+{
+    free(p);
+}
+/*
+ * Allocate a new Token with the ALLOC macro. No field is initialised here;
+ * fill the token with one of the tk_set functions before use.
+ */
+Token *tk_new()
+{
+    return ALLOC(Token);
+}
+/****************************************************************************
+ *
+ * TokenStream
+ *
+ ****************************************************************************/
+/*
+ * Drop one reference to ts. When the count reaches zero (or below) the
+ * stream's own destroy_i callback is invoked to release it.
+ */
+void ts_deref(TokenStream *ts)
+{
+    if (--ts->ref_cnt <= 0) {
+        ts->destroy_i(ts);
+    }
+}
+/*
+ * Default reset callback: point both the start of the input (ts->text) and
+ * the current read position (ts->t) at text. The text is not copied.
+ *
+ * Returns ts.
+ */
+static TokenStream *ts_reset(TokenStream *ts, char *text)
+{
+    ts->t = ts->text = text;
+    return ts;
+}
+/*
+ * Make a byte-wise copy of orig_ts. size must be the full size of the
+ * concrete stream struct being cloned. The copy is shallow: any pointers
+ * inside the struct are shared with the original. The clone starts with a
+ * reference count of 1.
+ */
+TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
+{
+    TokenStream *ts = (TokenStream *)ecalloc(size);
+    memcpy(ts, orig_ts, size);
+    ts->ref_cnt = 1;
+    return ts;
+}
+/*
+ * Allocate a token stream of size bytes with ecalloc() and install the
+ * defaults: free() as the destructor, ts_reset() as the reset callback and
+ * a reference count of 1. The next and clone_i callbacks are left for the
+ * caller to set.
+ */
+TokenStream *ts_new_i(size_t size)
+{
+    TokenStream *ts = ecalloc(size);
+    /* free() takes void *, hence the cast to the destroy_i signature */
+    ts->destroy_i = (void (*)(TokenStream *))&free;
+    ts->reset = &ts_reset;
+    ts->ref_cnt = 1;
+    return ts;
+}
+/****************************************************************************
+ * CachedTokenStream
+ ****************************************************************************/
+#define CTS(token_stream) ((CachedTokenStream *)(token_stream))
+/* clone_i callback for CachedTokenStream: shallow copy of the whole struct. */
+static TokenStream *cts_clone_i(TokenStream *orig_ts)
+{
+    return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
+}
+/*
+ * Create a CachedTokenStream via the ts_new macro and install its clone
+ * callback. The next callback is left for the caller to set.
+ */
+static TokenStream *cts_new()
+{
+    TokenStream *ts = ts_new(CachedTokenStream);
+    ts->clone_i = &cts_clone_i;
+    return ts;
+}
+/* * Multi-byte TokenStream * */
+#define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
+/*
+ * Decode the multi-byte character at s into *wchr with mbrtowc() and return
+ * the number of bytes the caller should advance by.
+ *
+ * When mbrtowc() reports an error (a negative value after the cast to int),
+ * the conversion state is reset with ZEROSET and decoding is retried one
+ * byte further on, until it succeeds or the terminating NUL is reached. In
+ * that case the return value is the number of bytes skipped, and *wchr holds
+ * whatever was decoded at the resync position (forced to 0 at the end of the
+ * string).
+ */
+INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
+{
+    int num_bytes;
+    if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
+        const char *t = s;
+        do {
+            t++;
+            ZEROSET(state, mbstate_t);
+            num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
+        } while ((num_bytes < 0) && (*t != 0));
+        /* report the length of the undecodable run, not of the next char */
+        num_bytes = t - s;
+        if (*t == 0) *wchr = 0;
+    }
+    return num_bytes;
+}
+/*
+ * Reset callback for multi-byte streams: clears the stream's conversion
+ * state before pointing the stream at the new text.
+ *
+ * Returns ts.
+ */
+static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
+{
+    ZEROSET(&(MBTS(ts)->state), mbstate_t);
+    ts_reset(ts, text);
+    return ts;
+}
+/* clone_i callback for MultiByteTokenStream: shallow copy of the struct. */
+static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
+{
+    return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
+}
+/*
+ * Create a MultiByteTokenStream via the ts_new macro and install the
+ * multi-byte reset and clone callbacks. The next callback is left for the
+ * caller to set.
+ */
+TokenStream *mb_ts_new()
+{
+    TokenStream *ts = ts_new(MultiByteTokenStream);
+    ts->reset = &mb_ts_reset;
+    ts->clone_i = &mb_ts_clone_i;
+    /* NOTE(review): presumably already set by ts_new - confirm */
+    ts->ref_cnt = 1;
+    return ts;
+}
+/****************************************************************************
+ *
+ * Analyzer
+ *
+ ****************************************************************************/
+/*
+ * Drop one reference to the analyzer. When the count reaches zero (or
+ * below) the analyzer's destroy_i callback is invoked to release it.
+ */
+void a_deref(Analyzer *a)
+{
+    if (--a->ref_cnt <= 0) {
+        a->destroy_i(a);
+    }
+}
+/*
+ * Default analyzer destructor: releases the analyzer's reference to its
+ * token stream, if it has one, then frees the analyzer itself.
+ */
+static void a_standard_destroy_i(Analyzer *a)
+{
+    if (a->current_ts) {
+        ts_deref(a->current_ts);
+    }
+    free(a);
+}
+/*
+ * Default get_ts callback: obtains a stream from the analyzer's current_ts
+ * through ts_clone() and resets it to read text. The field name is ignored.
+ *
+ * Returns the stream produced by the reset callback.
+ */
+static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+{
+    TokenStream *ts;
+    (void)field;                /* unused; silences the compiler warning */
+    ts = ts_clone(a->current_ts);
+    return ts->reset(ts, text);
+}
+/*
+ * Allocate an Analyzer around the token stream ts.
+ *
+ * ts        - stream stored as the analyzer's current_ts
+ * destroy_i - destructor callback; NULL selects a_standard_destroy_i
+ * get_ts    - stream factory callback; NULL selects a_standard_get_ts
+ *
+ * The new analyzer starts with a reference count of 1.
+ */
+Analyzer *analyzer_new(TokenStream *ts,
+                       void (*destroy_i)(Analyzer *a),
+                       TokenStream *(*get_ts)(Analyzer *a, char *field,
+                                              char *text))
+{
+    Analyzer *a = ALLOC(Analyzer);
+    a->current_ts = ts;
+    a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
+    a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
+    a->ref_cnt = 1;
+    return a;
+}
+/****************************************************************************
+ *
+ * Non
+ *
+ ****************************************************************************/
+/*
+ * NonTokenizer
+ */
+/*
+ * Emit the whole input as a single token. The first call after a reset
+ * returns a token covering offsets 0 to the text length and clears ts->t;
+ * every following call returns NULL.
+ */
+static Token *nt_next(TokenStream *ts)
+{
+    size_t text_len;
+
+    if (ts->t == NULL) {
+        return NULL;            /* the single token was already handed out */
+    }
+    text_len = strlen(ts->t);
+    ts->t = NULL;
+    return tk_set(&(CTS(ts)->token), ts->text, text_len, 0, text_len, 1);
+}
+/* Create a token stream that returns its whole input as one token. */
+TokenStream *non_tokenizer_new()
+{
+    TokenStream *ts = cts_new();
+    ts->next = &nt_next;
+    return ts;
+}
+/*
+ * NonAnalyzer
+ */
+/* Create an analyzer around a non-tokenizer, using the default callbacks. */
+Analyzer *non_analyzer_new()
+{
+    return analyzer_new(non_tokenizer_new(), NULL, NULL);
+}
+/****************************************************************************
+ *
+ * Whitespace
+ *
+ ****************************************************************************/
+/*
+ * WhitespaceTokenizer
+ */
+/*
+ * Return the next run of non-whitespace bytes as a token, or NULL once only
+ * whitespace (or nothing) is left before the terminating NUL. ts->t is
+ * advanced past the token that is returned.
+ *
+ * Bytes are cast to unsigned char before being passed to isspace(): the
+ * <ctype.h> functions are only defined for EOF and values representable as
+ * unsigned char, so passing a plain char is undefined behaviour for bytes
+ * >= 0x80 where char is signed.
+ */
+static Token *wst_next(TokenStream *ts)
+{
+    char *t = ts->t;
+    char *start;
+    /* skip leading whitespace */
+    while (*t != '\0' && isspace((unsigned char)*t)) {
+        t++;
+    }
+    if (*t == '\0') {
+        return NULL;
+    }
+    start = t;
+    /* consume the token */
+    while (*t != '\0' && !isspace((unsigned char)*t)) {
+        t++;
+    }
+    ts->t = t;
+    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+}
+/* Create a single-byte token stream that splits its input on whitespace. */
+TokenStream *whitespace_tokenizer_new()
+{
+    TokenStream *ts = cts_new();
+    ts->next = &wst_next;
+    return ts;
+}
+/*
+ * Multi-byte WhitespaceTokenizer
+ */
+/*
+ * Multi-byte version of the whitespace tokenizer's next callback. Decodes
+ * the input one character at a time with mb_next_char() and classifies each
+ * with iswspace(). The token text is the original bytes between the first
+ * and last non-whitespace character of the run.
+ *
+ * Returns the token, or NULL when no further token is found (ts->t is left
+ * unchanged in that case).
+ */
+static Token *mb_wst_next(TokenStream *ts)
+{
+    int i;
+    char *start;
+    char *t = ts->t;
+    wchar_t wchr;
+    mbstate_t *state = &(MBTS(ts)->state);
+    /* skip leading whitespace */
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && iswspace(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    if (wchr == 0) {
+        return NULL;
+    }
+    start = t;
+    /* step over the first token character, then consume the rest */
+    t += i;
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && !iswspace(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    ts->t = t;
+    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+}
+/*
+ * Lowercasing Multi-byte WhitespaceTokenizer
+ */
+/*
+ * Like mb_wst_next() but the token text is lower-cased. Each character of
+ * the token is passed through towlower() into a wide buffer, which
+ * w_tk_set() converts back to multi-byte text. Characters beyond
+ * MAX_WORD_SIZE are still consumed from the input but not stored. The token
+ * offsets are byte offsets into the original text.
+ *
+ * Returns the token, or NULL when no further token is found.
+ */
+static Token *mb_wst_next_lc(TokenStream *ts)
+{
+    int i;
+    char *start;
+    char *t = ts->t;
+    wchar_t wchr;
+    wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+    mbstate_t *state = &(MBTS(ts)->state);
+    w = wbuf;
+    /* last slot is reserved for the terminating 0 written below */
+    w_end = &wbuf[MAX_WORD_SIZE];
+    /* skip leading whitespace */
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && iswspace(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    if (wchr == 0) {
+        return NULL;
+    }
+    start = t;
+    t += i;
+    *w++ = towlower(wchr);
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && !iswspace(wchr)) {
+        if (w < w_end) {
+            *w++ = towlower(wchr);
+        }
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    *w = 0;
+    ts->t = t;
+    return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                    (off_t)(t - ts->text), 1);
+}
+/*
+ * Create a multi-byte whitespace tokenizer. When lowercase is true the
+ * lower-casing next callback is installed, otherwise the plain one.
+ */
+TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
+{
+    TokenStream *ts = mb_ts_new();
+    ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
+    return ts;
+}
+/*
+ * WhitespaceAnalyzers
+ */
+/*
+ * Create an analyzer that splits text on whitespace using the single-byte
+ * tokenizer. When lowercase is true the tokenizer is wrapped in a lowercase
+ * filter.
+ */
+Analyzer *whitespace_analyzer_new(bool lowercase)
+{
+    TokenStream *stream = whitespace_tokenizer_new();
+
+    if (lowercase) {
+        stream = lowercase_filter_new(stream);
+    }
+    return analyzer_new(stream, NULL, NULL);
+}
+/*
+ * Create an analyzer around the multi-byte whitespace tokenizer. Lower-
+ * casing, when requested, is done by the tokenizer itself.
+ */
+Analyzer *mb_whitespace_analyzer_new(bool lowercase)
+{
+    return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
+}
+/****************************************************************************
+ *
+ * Letter
+ *
+ ****************************************************************************/
+/*
+ * LetterTokenizer
+ */
+/*
+ * Return the next run of alphabetic bytes as a token, or NULL once no
+ * alphabetic byte is left before the terminating NUL. ts->t is advanced
+ * past the token that is returned.
+ *
+ * Bytes are cast to unsigned char before being passed to isalpha(): the
+ * <ctype.h> functions are only defined for EOF and values representable as
+ * unsigned char, so passing a plain char is undefined behaviour for bytes
+ * >= 0x80 where char is signed.
+ */
+Token *lt_next(TokenStream *ts)
+{
+    char *start;
+    char *t = ts->t;
+    /* skip everything that is not a letter */
+    while (*t != '\0' && !isalpha((unsigned char)*t)) {
+        t++;
+    }
+    if (*t == '\0') {
+        return NULL;
+    }
+    start = t;
+    /* consume the letters */
+    while (*t != '\0' && isalpha((unsigned char)*t)) {
+        t++;
+    }
+    ts->t = t;
+    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+}
+/* Create a single-byte token stream that emits runs of letters. */
+TokenStream *letter_tokenizer_new()
+{
+    TokenStream *ts = cts_new();
+    ts->next = &lt_next;
+    return ts;
+}
+/*
+ * Multi-byte LetterTokenizer
+ */
+/*
+ * Multi-byte version of the letter tokenizer's next callback. Decodes the
+ * input one character at a time with mb_next_char() and classifies each
+ * with iswalpha(). The token text is the original bytes of the run of
+ * alphabetic characters.
+ *
+ * Returns the token, or NULL when no further token is found (ts->t is left
+ * unchanged in that case).
+ */
+Token *mb_lt_next(TokenStream *ts)
+{
+    int i;
+    char *start;
+    char *t = ts->t;
+    wchar_t wchr;
+    mbstate_t *state = &(MBTS(ts)->state);
+    /* skip everything that is not a letter */
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && !iswalpha(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    if (wchr == 0) {
+        return NULL;
+    }
+    start = t;
+    /* step over the first letter, then consume the rest */
+    t += i;
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && iswalpha(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    ts->t = t;
+    return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+}
+/*
+ * Lowercasing Multi-byte LetterTokenizer
+ */
+/*
+ * Like mb_lt_next() but the token text is lower-cased. Each letter is
+ * passed through towlower() into a wide buffer, which w_tk_set() converts
+ * back to multi-byte text. Letters beyond MAX_WORD_SIZE are still consumed
+ * from the input but not stored. The token offsets are byte offsets into
+ * the original text.
+ *
+ * Returns the token, or NULL when no further token is found.
+ */
+Token *mb_lt_next_lc(TokenStream *ts)
+{
+    int i;
+    char *start;
+    char *t = ts->t;
+    wchar_t wchr;
+    wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+    mbstate_t *state = &(MBTS(ts)->state);
+    w = wbuf;
+    /* last slot is reserved for the terminating 0 written below */
+    w_end = &wbuf[MAX_WORD_SIZE];
+    /* skip everything that is not a letter */
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && !iswalpha(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    if (wchr == 0) {
+        return NULL;
+    }
+    start = t;
+    t += i;
+    *w++ = towlower(wchr);
+    i = mb_next_char(&wchr, t, state);
+    while (wchr != 0 && iswalpha(wchr)) {
+        if (w < w_end) {
+            *w++ = towlower(wchr);
+        }
+        t += i;
+        i = mb_next_char(&wchr, t, state);
+    }
+    *w = 0;
+    ts->t = t;
+    return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                    (off_t)(t - ts->text), 1);
+}
+/*
+ * Create a multi-byte letter tokenizer. When lowercase is true the
+ * lower-casing next callback is installed, otherwise the plain one.
+ */
+TokenStream *mb_letter_tokenizer_new(bool lowercase)
+{
+    TokenStream *ts = mb_ts_new();
+    ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
+    return ts;
+}
+/*
+ * LetterAnalyzers
+ */
+/*
+ * Create an analyzer that emits runs of letters using the single-byte
+ * tokenizer. When lowercase is true the tokenizer is wrapped in a lowercase
+ * filter.
+ */
+Analyzer *letter_analyzer_new(bool lowercase)
+{
+    TokenStream *stream = letter_tokenizer_new();
+
+    if (lowercase) {
+        stream = lowercase_filter_new(stream);
+    }
+    return analyzer_new(stream, NULL, NULL);
+}
+/*
+ * Create an analyzer around the multi-byte letter tokenizer. Lower-casing,
+ * when requested, is done by the tokenizer itself.
+ */
+Analyzer *mb_letter_analyzer_new(bool lowercase)
+{
+    return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
+}
+/****************************************************************************
+ *
+ * Standard
+ *
+ ****************************************************************************/
+#define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
+/*
+ * StandardTokenizer
+ */
+/*
+ * Copy the run of alphanumeric bytes at the stream's current position into
+ * token and return the length of the run. Only the first MAX_WORD_SIZE
+ * bytes are stored, but the return value counts the whole run. No NUL
+ * terminator is written and ts->t is not advanced.
+ *
+ * Bytes are cast to unsigned char before being passed to isalnum(): the
+ * <ctype.h> functions are only defined for EOF and values representable as
+ * unsigned char, so passing a plain char is undefined behaviour for bytes
+ * >= 0x80 where char is signed.
+ */
+static int std_get_alpha(TokenStream *ts, char *token)
+{
+    int i = 0;
+    char *t = ts->t;
+    while (t[i] != '\0' && isalnum((unsigned char)t[i])) {
+        if (i < MAX_WORD_SIZE) {
+            token[i] = t[i];
+        }
+        i++;
+    }
+    return i;
+}
+/*
+ * Multi-byte counterpart of std_get_alpha(): measures the run of characters
+ * accepted by iswalnum() at the stream's current position, copies its bytes
+ * into token and returns the number of bytes copied. No NUL terminator is
+ * written and ts->t is not advanced. Decoding uses a fresh local conversion
+ * state rather than the stream's own.
+ *
+ * NOTE(review): the clamp below only fires when the run is longer than
+ * MAX_WORD_SIZE, so a run of exactly MAX_WORD_SIZE bytes is copied whole
+ * while a longer one is cut to MAX_WORD_SIZE - 1. Unlike std_get_alpha(),
+ * the return value is the clamped length. Confirm both are intended and
+ * that token is large enough.
+ */
+static int mb_std_get_alpha(TokenStream *ts, char *token)
+{
+    char *t = ts->t;
+    wchar_t wchr;
+    int i;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    i = mb_next_char(&wchr, t, &state);
+    while (wchr != 0 && iswalnum(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, &state);
+    }
+    /* byte length of the whole run */
+    i = (int)(t - ts->t);
+    if (i > MAX_WORD_SIZE) {
+        i = MAX_WORD_SIZE - 1;
+    }
+    memcpy(token, ts->t, i);
+    return i;
+}
+/*
+static int std_get_alnum(TokenStream *ts, char *token)
+{
+    int i = 0;
+    char *t = ts->t;
+    while (t[i] != '\0' && isalnum(t[i])) {
+        if (i < MAX_WORD_SIZE) {
+            token[i] = t[i];
+        }
+        i++;
+    }
+    return i;
+}
+static int mb_std_get_alnum(TokenStream *ts, char *token)
+{
+    char *t = ts->t;
+    wchar_t wchr;
+    int i;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    i = mb_next_char(&wchr, t, &state);
+    while (wchr != 0 && iswalnum(wchr)) {
+        t += i;
+        i = mb_next_char(&wchr, t, &state);
+    }
+    i = (int)(t - ts->t);
+    if (i > MAX_WORD_SIZE) {
+        i = MAX_WORD_SIZE - 1;
+    }
+    memcpy(token, ts->t, i);
+    return i;
+}
+*/
+/*
+ * Returns 1 when +c+ is one of the punctuation characters allowed inside a
+ * number-like token ('.', ',', '\', '/', '_' or '-'), 0 otherwise.
+ */
+static int isnumpunc(char c)
+{
+    switch (c) {
+        case '.':
+        case ',':
+        case '\\':
+        case '/':
+        case '_':
+        case '-':
+            return 1;
+        default:
+            return 0;
+    }
+}
+/*
+ * Wide-character version of isnumpunc: returns 1 when +c+ is L'.', L',',
+ * L'\', L'/', L'_' or L'-', and 0 otherwise.
+ */
+static int w_isnumpunc(wchar_t c)
+{
+    static const wchar_t puncs[] = { L'.', L',', L'\\', L'/', L'_', L'-' };
+    size_t k;
+    for (k = 0; k < sizeof(puncs) / sizeof(puncs[0]); k++) {
+        if (puncs[k] == c) {
+            return 1;
+        }
+    }
+    return 0;
+}
+/*
+ * Returns 1 when +c+ is a punctuation character accepted inside a URL
+ * ('.', '/', '-' or '_'), 0 otherwise.
+ */
+static int isurlpunc(char c)
+{
+    switch (c) {
+        case '.':
+        case '/':
+        case '-':
+        case '_':
+            return 1;
+        default:
+            return 0;
+    }
+}
+/*
+ * Returns 1 when +c+ may appear in a URL body: either one of the URL
+ * punctuation characters ('.', '/', '-', '_') or an alphanumeric character.
+ * Returns 0 otherwise.
+ */
+static int isurlc(char c)
+{
+    if (c == '.' || c == '/' || c == '-' || c == '_') {
+        return 1;
+    }
+    return isalnum(c) ? 1 : 0;
+}
+/*
+ * Like isurlpunc but also accepts '@': returns 1 for '.', '/', '-', '_' and
+ * '@', 0 for anything else.
+ */
+static int isurlxatpunc(char c)
+{
+    switch (c) {
+        case '.':
+        case '/':
+        case '-':
+        case '_':
+        case '@':
+            return 1;
+        default:
+            return 0;
+    }
+}
+/*
+ * Like isurlc but also accepts '@': returns 1 for '.', '/', '-', '_', '@'
+ * and alphanumeric characters, 0 for anything else.
+ */
+static int isurlxatc(char c)
+{
+    if (c == '.' || c == '/' || c == '-' || c == '_' || c == '@') {
+        return 1;
+    }
+    return isalnum(c) ? 1 : 0;
+}
+/*
+ * Tells whether the byte at +c+ can continue a token: alphanumerics, the
+ * number punctuation set, '&', '@', apostrophe and ':' qualify; whitespace
+ * and everything else do not.
+ */
+static bool std_is_tok_char(char *c)
+{
+    const char ch = *c;
+    if (isspace(ch)) {
+        return false;           /* most common case, so test it first */
+    }
+    switch (ch) {
+        case '&':
+        case '@':
+        case '\'':
+        case ':':
+            return true;
+        default:
+            break;
+    }
+    return (isalnum(ch) || isnumpunc(ch)) ? true : false;
+}
+/*
+ * Multi-byte version of std_is_tok_char: decodes one character at +t+ with
+ * mbrtowc and reports whether it can continue a token (wide alphanumerics,
+ * the number punctuation set, '&', '@', apostrophe and ':').
+ */
+static bool mb_std_is_tok_char(char *t)
+{
+    wchar_t wc;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    if (((int)mbrtowc(&wc, t, MB_CUR_MAX, &state)) < 0) {
+        /* Decoding failed. Answer false so the caller can finish the token
+         * it already has; the bad input is dealt with on a later call. */
+        return false;
+    }
+    if (iswspace(wc)) {
+        return false;           /* most common case, so test it first */
+    }
+    switch (wc) {
+        case L'&':
+        case L'@':
+        case L'\'':
+        case L':':
+            return true;
+        default:
+            break;
+    }
+    return (iswalnum(wc) || w_isnumpunc(wc)) ? true : false;
+}
+/*
+ * Measures a number-like token at +input+ and returns its length in bytes,
+ * or 0 when the scanned text contains no digit at all.
+ *
+ * Accepted shape: (alnum)((punc)(alnum))+ where at least every second alnum
+ * sequence must contain a digit.
+ *   (alnum) = [a-zA-Z0-9]
+ *   (punc)  = [_\/.,-]
+ *
+ * +digit_budget+ starts at 2, is reset to 2 whenever a digit is seen and is
+ * decremented after each alnum sequence; scanning stops once it drops
+ * below zero.
+ */
+static int std_get_number(char *input)
+{
+    int pos = 0;
+    int end = 0;
+    int digit_budget = 2;
+    int any_digit = 0;
+    while (digit_budget >= 0) {
+        for (; input[pos] != '\0' && isalnum(input[pos]); pos++) {
+            if (isdigit(input[pos])) {
+                digit_budget = 2;
+                any_digit = 1;
+            }
+        }
+        digit_budget--;
+        if (isnumpunc(input[pos]) && isalnum(input[pos + 1])) {
+            /* punctuation followed by more alnum: accept and keep going */
+            end = pos;
+            pos++;
+            continue;
+        }
+        /* end of the candidate; only extend over the last sequence if the
+         * digit requirement was still satisfied */
+        if (digit_budget >= 0) {
+            end = pos;
+        }
+        break;
+    }
+    return any_digit ? end : 0;
+}
+/*
+ * Returns the length of the leading run of +input+ made up solely of
+ * alphabetic characters and apostrophes.
+ */
+static int std_get_apostrophe(char *input)
+{
+    int n = 0;
+    while (input[n] == '\'' || isalpha(input[n])) {
+        n++;
+    }
+    return n;
+}
+/*
+ * Multi-byte version of std_get_apostrophe: returns the byte length of the
+ * leading run of +input+ made up of wide alphabetic characters and
+ * apostrophes.
+ */
+static int mb_std_get_apostrophe(char *input)
+{
+    char *cursor = input;
+    wchar_t wc;
+    int step;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    for (step = mb_next_char(&wc, cursor, &state);
+         iswalpha(wc) || wc == L'\'';
+         step = mb_next_char(&wc, cursor, &state)) {
+        cursor += step;
+    }
+    return (int)(cursor - input);
+}
+/*
+ * Extends a URL starting at index +i+ of +input+, copying accepted bytes to
+ * the same index of +token+ (only while the index is below MAX_WORD_SIZE).
+ * Scanning stops at the first non-URL character or at two punctuation
+ * characters in a row; trailing punctuation is then dropped. Returns the
+ * index just past the last kept character.
+ *
+ * Note that input[i - 1] is inspected, so the caller must make sure that
+ * byte is readable.
+ */
+static int std_get_url(char *input, char *token, int i)
+{
+    for (; isurlc(input[i]); i++) {
+        if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
+            break; /* two punctuation characters in a row end the URL */
+        }
+        if (i < MAX_WORD_SIZE) {
+            token[i] = input[i];
+        }
+    }
+    /* back up over any punctuation left at the end */
+    while (isurlpunc(input[i - 1])) {
+        i--;
+    }
+    return i;
+}
+/*
+ * Company names can contain '@' and '&', like AT&T and Excite@Home. Returns
+ * the length of the leading run of +input+ consisting of alphabetic
+ * characters, '@' and '&'.
+ */
+static int std_get_company_name(char *input)
+{
+    int n;
+    for (n = 0; input[n] == '@' || input[n] == '&' || isalpha(input[n]); n++) {
+        /* nothing to do; just measuring */
+    }
+    return n;
+}
+/*
+static int mb_std_get_company_name(char *input, TokenStream *ts)
+{
+    char *t = input;
+    wchar_t wchr;
+    int i;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    i = mb_next_char(&wchr, t, &state);
+    while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
+        t += i;
+        i = mb_next_char(&wchr, t, &state);
+    }
+    return (int)(t - input);
+}
+*/
+/*
+ * Moves the stream position forward to where the next token can begin: the
+ * first alphanumeric byte, or a number punctuation byte that is directly
+ * followed by a digit. Returns false when the end of the text was reached
+ * instead.
+ */
+static bool std_advance_to_start(TokenStream *ts)
+{
+    char *p;
+    for (p = ts->t; *p != '\0' && !isalnum(*p); p++) {
+        if (isnumpunc(*p) && isdigit(p[1])) {
+            break;
+        }
+    }
+    ts->t = p;
+    return (*p != '\0');
+}
+/*
+ * Multi-byte version of std_advance_to_start: moves the stream position
+ * forward to the first wide alphanumeric character, or to a number
+ * punctuation byte directly followed by a digit. Returns false when the
+ * decoded character at the final position is 0.
+ */
+static bool mb_std_advance_to_start(TokenStream *ts)
+{
+    char *p = ts->t;
+    wchar_t wc;
+    int step;
+    mbstate_t state; ZEROSET(&state, mbstate_t);
+    step = mb_next_char(&wc, p, &state);
+    while (wc != 0 && !iswalnum(wc)) {
+        if (isnumpunc(*p) && isdigit(p[1])) {
+            break;
+        }
+        p += step;
+        step = mb_next_char(&wc, p, &state);
+    }
+    ts->t = p;
+    return (wc != 0);
+}
+/*
+ * Produces the next token of a StandardTokenizer stream, or NULL when no
+ * further token can be started.
+ *
+ * After skipping to the start of a token and reading its leading
+ * alphanumeric run, the following shapes are tried in order:
+ *   - plain word (the run is not followed by a token character)
+ *   - word with apostrophes; a trailing "'s" or "'" is stripped from the
+ *     text but still covered by the token's end offset
+ *   - company name containing '&' or '@'
+ *   - number (see std_get_number)
+ *   - "scheme://" URL
+ *   - generic URL / e-mail / acronym; acronyms have their dots removed
+ *
+ * All writes to the local +token+ buffer are limited to MAX_WORD_SIZE bytes
+ * plus the terminator.
+ */
+static Token *std_next(TokenStream *ts)
+{
+    StandardTokenizer *std_tz = STDTS(ts);
+    char *s;
+    char *t;
+    char *start = NULL;
+    char *alpha_end = NULL;
+    char *num_end = NULL;
+    char token[MAX_WORD_SIZE + 1];
+    int token_i = 0;
+    int len;
+    bool is_acronym;
+    bool seen_at_symbol;
+    if (!std_tz->advance_to_start(ts)) {
+        return NULL;
+    }
+    start = t = ts->t;
+    token_i = std_tz->get_alpha(ts, token);
+    t += token_i;
+    alpha_end = t;
+    /* get_alpha may report a run longer than what it stored in token; from
+     * here on token_i is only used as an index into token, so clamp it. */
+    if (token_i > MAX_WORD_SIZE) {
+        token_i = MAX_WORD_SIZE;
+    }
+    if (!std_tz->is_tok_char(t)) {
+        /* very common case, ie a plain word, so check and return */
+        ts->t = t;
+        return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+    }
+    if (*t == '\'') {       /* apostrophe case. */
+        t += std_tz->get_apostrophe(t);
+        ts->t = t;
+        /* strip possessive */
+        if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
+            t -= 2;
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+            CTS(ts)->token.end += 2;
+        }
+        else if (t[-1] == '\'') {
+            t -= 1;
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+            CTS(ts)->token.end += 1;
+        }
+        else {
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+        }
+        return &(CTS(ts)->token);
+    }
+    if (*t == '&') {        /* company name case, eg AT&T */
+        t += std_get_company_name(t);
+        ts->t = t;
+        return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+    }
+    /* Possibly a number. The length is measured from the start of the token
+     * so that it agrees with num_end below. Note the parentheses around the
+     * assignment: without them len would receive the result of the
+     * comparison (0 or 1) rather than the length. */
+    if ((isdigit(*start) || isnumpunc(*start))
+        && ((len = std_get_number(start)) > 0)) {
+        num_end = start + len;
+        if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
+            ts->t = num_end;
+            return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+        }
+        /* else there may be a longer token so check */
+    }
+    if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
+        int tok_len;
+        /* check for a known url start */
+        token[token_i] = '\0';
+        t += 3;
+        while (*t == '/') {
+            t++;
+        }
+        /* prefix tests on the NUL-terminated scheme; strncmp never reads
+         * past the terminator */
+        if (isalpha(*t) &&
+            (strncmp(token, "ftp", 3) == 0 ||
+             strncmp(token, "http", 4) == 0 ||
+             strncmp(token, "https", 5) == 0 ||
+             strncmp(token, "file", 4) == 0)) {
+            /* dispose of first part of the URL: token text starts at t */
+            len = std_get_url(t, token, 0);
+            ts->t = t + len;
+        }
+        else {              /* still treat as url but keep the first part */
+            token_i = (int)(t - start);
+            memcpy(token, start,
+                   (token_i < MAX_WORD_SIZE ? token_i : MAX_WORD_SIZE)
+                   * sizeof(char));
+            /* indices are relative to start here, so scan from start; the
+             * result is already the full length including the scheme */
+            len = std_get_url(start, token, token_i);
+            ts->t = start + len;
+        }
+        tok_len = (len < MAX_WORD_SIZE) ? len : MAX_WORD_SIZE;
+        token[tok_len] = '\0';
+        return tk_set(&(CTS(ts)->token), token, tok_len,
+                      (off_t)(start - ts->text),
+                      (off_t)(ts->t - ts->text), 1);
+    }
+    /* now see how long a url we can find. */
+    is_acronym = true;
+    seen_at_symbol = false;
+    while (isurlxatc(*t)) {
+        if (is_acronym && !isalpha(*t) && (*t != '.')) {
+            is_acronym = false;
+        }
+        /* only look at the previous byte when there is one */
+        if (isurlxatpunc(*t) && t > ts->text && isurlxatpunc(t[-1])) {
+            break; /* can't have two punctuation characters in a row */
+        }
+        if (*t == '@') {
+            if (seen_at_symbol) {
+                break; /* we can only have one @ symbol */
+            }
+            else {
+                seen_at_symbol = true;
+            }
+        }
+        t++;
+    }
+    while (t > ts->t && isurlxatpunc(t[-1])) {
+        t--;                /* strip trailing punctuation */
+    }
+    if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
+        fprintf(stderr, "Warning: encoding error. Please check that you are using the correct locale for your input");
+        return NULL;
+    } else if (num_end == NULL || t > num_end) {
+        ts->t = t;
+        if (is_acronym) {   /* check it is one letter followed by one '.' */
+            for (s = start; s < t - 1; s++) {
+                if (isalpha(*s) && (s[1] != '.'))
+                    is_acronym = false;
+            }
+        }
+        if (is_acronym) {   /* strip '.'s */
+            for (s = alpha_end; s < t; s++) {
+                if (*s != '.' && token_i < MAX_WORD_SIZE) {
+                    token[token_i] = *s;
+                    token_i++;
+                }
+            }
+            tk_set(&(CTS(ts)->token), token, token_i,
+                   (off_t)(start - ts->text),
+                   (off_t)(t - ts->text), 1);
+        }
+        else { /* just return the url as is */
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+        }
+    }
+    else {                  /* return the number */
+        ts->t = num_end;
+        tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+    }
+    return &(CTS(ts)->token);
+}
+/*
+ * clone_i callback for StandardTokenizer streams: clones +orig_ts+ using
+ * the size of a StandardTokenizer.
+ */
+static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
+{
+    TokenStream *copy = ts_clone_size(orig_ts, sizeof(StandardTokenizer));
+    return copy;
+}
+/*
+ * Allocates a StandardTokenizer stream and installs the callbacks shared by
+ * the single-byte and multi-byte variants (next and clone_i).
+ */
+static TokenStream *std_ts_new()
+{
+    TokenStream *ts = ts_new(StandardTokenizer);
+    ts->next        = &std_next;
+    ts->clone_i     = &std_ts_clone_i;
+    return ts;
+}
+/*
+ * Creates a StandardTokenizer wired up with the single-byte helper
+ * functions.
+ */
+TokenStream *standard_tokenizer_new()
+{
+    TokenStream *ts = std_ts_new();
+    StandardTokenizer *std_tz = STDTS(ts);
+    std_tz->advance_to_start = &std_advance_to_start;
+    std_tz->get_alpha        = &std_get_alpha;
+    std_tz->is_tok_char      = &std_is_tok_char;
+    std_tz->get_apostrophe   = &std_get_apostrophe;
+    return ts;
+}
+/*
+ * Creates a StandardTokenizer wired up with the multi-byte helper
+ * functions.
+ */
+TokenStream *mb_standard_tokenizer_new()
+{
+    TokenStream *ts = std_ts_new();
+    StandardTokenizer *std_tz = STDTS(ts);
+    std_tz->advance_to_start = &mb_std_advance_to_start;
+    std_tz->get_alpha        = &mb_std_get_alpha;
+    std_tz->is_tok_char      = &mb_std_is_tok_char;
+    std_tz->get_apostrophe   = &mb_std_get_apostrophe;
+    return ts;
+}
+/****************************************************************************
+ *
+ * Filters
+ *
+ ****************************************************************************/
+#define TkFilt(filter) ((TokenFilter *)(filter))
+/*
+ * Clones a token filter of +size+ bytes and gives the copy its own clone of
+ * the wrapped sub-stream (obtained through the sub-stream's clone_i).
+ */
+TokenStream *filter_clone_size(TokenStream *ts, size_t size)
+{
+    TokenStream *copy = ts_clone_size(ts, size);
+    TokenStream *sub  = TkFilt(ts)->sub_ts;
+    TkFilt(copy)->sub_ts = sub->clone_i(sub);
+    return copy;
+}
+/*
+ * Default clone_i callback for filters that carry no state beyond the
+ * TokenFilter struct itself.
+ */
+static TokenStream *filter_clone_i(TokenStream *ts)
+{
+    TokenStream *copy = filter_clone_size(ts, sizeof(TokenFilter));
+    return copy;
+}
+/*
+ * Default reset callback for filters: forwards +text+ to the wrapped
+ * sub-stream's reset and returns the filter itself.
+ */
+static TokenStream *filter_reset(TokenStream *ts, char *text)
+{
+    TokenStream *sub = TkFilt(ts)->sub_ts;
+    sub->reset(sub, text);
+    return ts;
+}
+/*
+ * Default destroy_i callback for filters: drops the reference held on the
+ * wrapped sub-stream, then frees the filter.
+ */
+static void filter_destroy_i(TokenStream *ts)
+{
+    TokenStream *sub = TkFilt(ts)->sub_ts;
+    ts_deref(sub);
+    free(ts);
+}
+#define tf_new(type, sub) tf_new_i(sizeof(type), sub)
+/*
+ * Allocates a token filter of +size+ bytes with ecalloc, attaches +sub_ts+
+ * as its sub-stream and installs the default filter callbacks (clone_i,
+ * destroy_i, reset). The new filter starts with a reference count of 1.
+ * The caller is expected to set the "next" callback.
+ */
+TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
+{
+    TokenStream *ts     = (TokenStream *)ecalloc(size);
+    ts->ref_cnt         = 1;
+    ts->reset           = &filter_reset;
+    ts->destroy_i       = &filter_destroy_i;
+    ts->clone_i         = &filter_clone_i;
+    TkFilt(ts)->sub_ts  = sub_ts;
+    return ts;
+}
+/****************************************************************************
+ * StopFilter
+ ****************************************************************************/
+#define StopFilt(filter) ((StopFilter *)(filter))
+/*
+ * destroy_i callback for stop filters: destroys the stop-word table, then
+ * runs the generic filter teardown.
+ */
+static void sf_destroy_i(TokenStream *ts)
+{
+    HashTable *words = StopFilt(ts)->words;
+    h_destroy(words);
+    filter_destroy_i(ts);
+}
+/*
+ * clone_i callback for stop filters. The clone shares the original's
+ * stop-word table, so REF is applied to the table for the new holder.
+ *
+ * The clone is sized as a StopFilter; the previous code passed
+ * sizeof(MappingFilter), which is the wrong type for this filter.
+ */
+static TokenStream *sf_clone_i(TokenStream *orig_ts)
+{
+    TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StopFilter));
+    REF(StopFilt(new_ts)->words);
+    return new_ts;
+}
+/*
+ * Returns the next token from the sub-stream whose text is not in the
+ * stop-word table. The position increments of the skipped tokens are added
+ * to the returned token's pos_inc. Returns NULL when the sub-stream is
+ * exhausted.
+ */
+static Token *sf_next(TokenStream *ts)
+{
+    TokenFilter *tf = TkFilt(ts);
+    HashTable *words = StopFilt(ts)->words;
+    int skipped_inc = 0;
+    Token *tk;
+    for (tk = tf->sub_ts->next(tf->sub_ts);
+         tk != NULL && h_get(words, tk->text) != NULL;
+         tk = tf->sub_ts->next(tf->sub_ts)) {
+        skipped_inc += tk->pos_inc;
+    }
+    if (tk == NULL) {
+        return NULL;
+    }
+    tk->pos_inc += skipped_inc;
+    return tk;
+}
+/*
+ * Creates a stop filter over +sub_ts+ from the first +len+ entries of
+ * +words+. Each word is duplicated; the copy serves as both key and value
+ * in the filter's string hash table.
+ */
+TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
+                                            const char **words, int len)
+{
+    HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+    TokenStream *ts = tf_new(StopFilter, sub_ts);
+    const char **cur = words;
+    const char **stop = words + len;
+    while (cur < stop) {
+        char *copy = estrdup(*cur);
+        h_set(word_table, copy, copy);
+        cur++;
+    }
+    ts->clone_i         = &sf_clone_i;
+    ts->destroy_i       = &sf_destroy_i;
+    ts->next            = &sf_next;
+    StopFilt(ts)->words = word_table;
+    return ts;
+}
+/*
+ * Creates a stop filter over +sub_ts+ from a NULL-terminated array of
+ * words. Each word is duplicated; the copy serves as both key and value in
+ * the filter's string hash table.
+ */
+TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
+                                        const char **words)
+{
+    HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+    TokenStream *ts = tf_new(StopFilter, sub_ts);
+    const char **cur;
+    for (cur = words; *cur != NULL; cur++) {
+        char *copy = estrdup(*cur);
+        h_set(word_table, copy, copy);
+    }
+    ts->clone_i         = &sf_clone_i;
+    ts->destroy_i       = &sf_destroy_i;
+    ts->next            = &sf_next;
+    StopFilt(ts)->words = word_table;
+    return ts;
+}
+/*
+ * Creates a stop filter over +ts+ using the FULL_ENGLISH_STOP_WORDS list.
+ */
+TokenStream *stop_filter_new(TokenStream *ts)
+{
+    TokenStream *filter =
+        stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
+    return filter;
+}
+/****************************************************************************
+ * MappingFilter
+ ****************************************************************************/
+#define MFilt(filter) ((MappingFilter *)(filter))
+/*
+ * destroy_i callback for mapping filters: destroys the mapper, then runs
+ * the generic filter teardown.
+ */
+static void mf_destroy_i(TokenStream *ts)
+{
+    MultiMapper *mapper = MFilt(ts)->mapper;
+    mulmap_destroy(mapper);
+    filter_destroy_i(ts);
+}
+/*
+ * clone_i callback for mapping filters. The clone shares the original's
+ * mapper, so REF is applied to the mapper for the new holder.
+ */
+static TokenStream *mf_clone_i(TokenStream *orig_ts)
+{
+    TokenStream *copy = filter_clone_size(orig_ts, sizeof(MappingFilter));
+    REF(MFilt(copy)->mapper);
+    return copy;
+}
+/*
+ * Fetches the next token from the sub-stream and rewrites its text through
+ * the filter's mapper. The mapped text is produced in a local buffer and
+ * then copied back (including the terminator) into the token, whose len is
+ * updated. Returns NULL when the sub-stream is exhausted.
+ */
+static Token *mf_next(TokenStream *ts)
+{
+    char mapped[MAX_WORD_SIZE + 1];
+    TokenFilter *tf = TkFilt(ts);
+    MultiMapper *mapper = MFilt(ts)->mapper;
+    Token *tk = tf->sub_ts->next(tf->sub_ts);
+    if (tk == NULL) {
+        return NULL;
+    }
+    tk->len = mulmap_map_len(mapper, mapped, tk->text, MAX_WORD_SIZE);
+    memcpy(tk->text, mapped, tk->len + 1);
+    return tk;
+}
+/*
+ * reset callback for mapping filters: compiles the mapper if its d_size is
+ * still 0, then performs the generic filter reset with +text+.
+ */
+static TokenStream *mf_reset(TokenStream *ts, char *text)
+{
+    MultiMapper *mapper = MFilt(ts)->mapper;
+    if (0 == mapper->d_size) {
+        mulmap_compile(mapper);
+    }
+    return filter_reset(ts, text);
+}
+/*
+ * Creates a mapping filter over +sub_ts+ with a fresh, empty mapper.
+ * Mappings are added afterwards with mapping_filter_add.
+ */
+TokenStream *mapping_filter_new(TokenStream *sub_ts)
+{
+    TokenStream *ts   = tf_new(MappingFilter, sub_ts);
+    ts->reset         = &mf_reset;
+    ts->clone_i       = &mf_clone_i;
+    ts->destroy_i     = &mf_destroy_i;
+    ts->next          = &mf_next;
+    MFilt(ts)->mapper = mulmap_new();
+    return ts;
+}
+/*
+ * Registers a +pattern+ -> +replacement+ mapping with the filter's mapper
+ * and returns the filter so calls can be chained.
+ */
+TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
+                                const char *replacement)
+{
+    MultiMapper *mapper = MFilt(ts)->mapper;
+    mulmap_add_mapping(mapper, pattern, replacement);
+    return ts;
+}
+/****************************************************************************
+ * HyphenFilter
+ ****************************************************************************/
+#define HyphenFilt(filter) ((HyphenFilter *)(filter))
+/*
+ * clone_i callback for hyphen filters: clones the filter using the size of
+ * a HyphenFilter.
+ */
+static TokenStream *hf_clone_i(TokenStream *orig_ts)
+{
+    return filter_clone_size(orig_ts, sizeof(HyphenFilter));
+}
+/*
+ * Returns the next token of a hyphen filter.
+ *
+ * A token from the sub-stream that consists of letters and hyphens only
+ * (first character aside) is returned with the hyphens removed, and its
+ * parts are remembered in hf->text as consecutive NUL-separated strings.
+ * The following calls then return those parts one by one, reusing the same
+ * Token object, before the sub-stream is consulted again. Any other token
+ * is passed through unchanged. Returns NULL when the sub-stream is
+ * exhausted.
+ */
+static Token *hf_next(TokenStream *ts)
+{
+    HyphenFilter *hf = HyphenFilt(ts);
+    TokenFilter *tf = TkFilt(ts);
+    Token *tk = hf->tk;
+    if (hf->pos < hf->len) {
+        /* still handing out the parts of the last hyphenated token */
+        const int pos = hf->pos;
+        const int text_len = (int)strlen(hf->text + pos);
+        strcpy(tk->text, hf->text + pos);
+        /* the first part shares the position of the joined token */
+        tk->pos_inc = ((pos != 0) ? 1 : 0);
+        tk->start = hf->start + pos;
+        tk->end = tk->start + text_len;
+        hf->pos += text_len + 1;    /* step over the part and its NUL */
+        tk->len = text_len;
+        return tk;
+    }
+    else {
+        char *p;
+        bool seen_hyphen = false;
+        bool seen_other_punc = false;
+        hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
+        if (NULL == tk) return NULL;
+        /* The first character is not examined, but only skip it when it
+         * exists so an empty token is not scanned past its terminator. */
+        p = tk->text;
+        if (*p != '\0') {
+            p++;
+        }
+        while (*p) {
+            if (*p == '-') {
+                seen_hyphen = true;
+            }
+            else if (!isalpha((unsigned char)*p)) {
+                seen_other_punc = true;
+                break;
+            }
+            p++;
+        }
+        if (seen_hyphen && !seen_other_punc) {
+            char *q = hf->text;     /* parts, NUL-separated */
+            char *r = tk->text;     /* joined text, written in place */
+            p = tk->text;
+            while (*p) {
+                if (*p == '-') {
+                    *q = '\0';
+                }
+                else {
+                    *r = *q = *p;
+                    r++;
+                }
+                q++;
+                p++;
+            }
+            *r = *q = '\0';
+            hf->start = tk->start;
+            hf->pos = 0;
+            hf->len = q - hf->text;
+            tk->len = r - tk->text;
+        }
+    }
+    return tk;
+}
+/*
+ * Creates a hyphen filter over +sub_ts+.
+ */
+TokenStream *hyphen_filter_new(TokenStream *sub_ts)
+{
+    TokenStream *ts = tf_new(HyphenFilter, sub_ts);
+    ts->clone_i     = &hf_clone_i;
+    ts->next        = &hf_next;
+    return ts;
+}
+/****************************************************************************
+ * LowerCaseFilter
+ ****************************************************************************/
+/*
+ * "next" callback of the multi-byte lowercase filter. The token text is
+ * converted to wide characters, lowercased with towlower and converted
+ * back. If the first conversion yields nothing the token is returned
+ * untouched; if the conversion back fails the text becomes "BAD_DATA".
+ * Returns NULL when the sub-stream is exhausted.
+ */
+Token *mb_lcf_next(TokenStream *ts)
+{
+    wchar_t wide[MAX_WORD_SIZE + 1];
+    wchar_t *w;
+    TokenStream *sub = TkFilt(ts)->sub_ts;
+    Token *tk = sub->next(sub);
+    int converted;
+    /* guarantees termination even if the conversion fills the buffer */
+    wide[MAX_WORD_SIZE] = 0;
+    if (tk == NULL) {
+        return tk;
+    }
+    converted = mbstowcs(wide, tk->text, MAX_WORD_SIZE);
+    if (converted <= 0) {
+        return tk;
+    }
+    for (w = wide; *w != 0; w++) {
+        *w = towlower(*w);
+    }
+    tk->len = wcstombs(tk->text, wide, MAX_WORD_SIZE);
+    if (tk->len <= 0) {
+        strcpy(tk->text, "BAD_DATA");
+        tk->len = 8;
+    }
+    tk->text[tk->len] = '\0';
+    return tk;
+}
+/*
+ * Creates a multi-byte lowercase filter over +sub_ts+.
+ */
+TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
+{
+    TokenStream *filter = tf_new(TokenFilter, sub_ts);
+    filter->next = &mb_lcf_next;
+    return filter;
+}
+/*
+ * "next" callback of the single-byte lowercase filter: lowercases the text
+ * of the next sub-stream token in place with tolower. Returns NULL when the
+ * sub-stream is exhausted.
+ */
+Token *lcf_next(TokenStream *ts)
+{
+    TokenStream *sub = TkFilt(ts)->sub_ts;
+    Token *tk = sub->next(sub);
+    int i;
+    if (tk != NULL) {
+        for (i = 0; tk->text[i] != '\0'; i++) {
+            tk->text[i] = tolower(tk->text[i]);
+        }
+    }
+    return tk;
+}
+/*
+ * Creates a single-byte lowercase filter over +sub_ts+.
+ */
+TokenStream *lowercase_filter_new(TokenStream *sub_ts)
+{
+    TokenStream *filter = tf_new(TokenFilter, sub_ts);
+    filter->next = &lcf_next;
+    return filter;
+}
+/****************************************************************************
+ * StemFilter
+ ****************************************************************************/
+#define StemFilt(filter) ((StemFilter *)(filter))
+/*
+ * destroy_i callback for stem filters: deletes the stemmer, frees the
+ * stored algorithm and encoding names, then runs the generic filter
+ * teardown.
+ */
+void stemf_destroy_i(TokenStream *ts)
+{
+    StemFilter *stemf = StemFilt(ts);
+    sb_stemmer_delete(stemf->stemmer);
+    free(stemf->algorithm);
+    free(stemf->charenc);
+    filter_destroy_i(ts);
+}
+/*
+ * "next" callback of the stem filter: replaces the text of the next
+ * sub-stream token with its stem. A stem of MAX_WORD_SIZE bytes or more is
+ * cut to MAX_WORD_SIZE - 1 bytes. Returns NULL when the sub-stream is
+ * exhausted.
+ */
+Token *stemf_next(TokenStream *ts)
+{
+    struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
+    TokenStream *sub = TkFilt(ts)->sub_ts;
+    Token *tk = sub->next(sub);
+    const sb_symbol *stem;
+    int stem_len;
+    if (tk == NULL) {
+        return tk;
+    }
+    stem = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
+    stem_len = sb_stemmer_length(stemmer);
+    if (stem_len >= MAX_WORD_SIZE) {
+        stem_len = MAX_WORD_SIZE - 1;
+    }
+    memcpy(tk->text, stem, stem_len);
+    tk->text[stem_len] = '\0';
+    tk->len = stem_len;
+    return tk;
+}
+/*
+ * clone_i callback for stem filters. The clone gets its own stemmer,
+ * created from the original's algorithm and encoding names, and its own
+ * copies of those names (NULL stays NULL).
+ */
+TokenStream *stemf_clone_i(TokenStream *orig_ts)
+{
+    TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
+    StemFilter *src     = StemFilt(orig_ts);
+    StemFilter *dst     = StemFilt(new_ts);
+    dst->stemmer = sb_stemmer_new(src->algorithm, src->charenc);
+    if (src->algorithm) {
+        dst->algorithm = estrdup(src->algorithm);
+    }
+    else {
+        dst->algorithm = NULL;
+    }
+    if (src->charenc) {
+        dst->charenc = estrdup(src->charenc);
+    }
+    else {
+        dst->charenc = NULL;
+    }
+    return new_ts;
+}
+/*
+ * Creates a stem filter over +ts+ using the stemmer identified by
+ * +algorithm+ and +charenc+. Both names are copied into the filter when
+ * they are not NULL.
+ */
+TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
+                             const char *charenc)
+{
+    TokenStream *tf   = tf_new(StemFilter, ts);
+    StemFilter *stemf = StemFilt(tf);
+    stemf->stemmer = sb_stemmer_new(algorithm, charenc);
+    if (algorithm) {
+        stemf->algorithm = estrdup(algorithm);
+    }
+    else {
+        stemf->algorithm = NULL;
+    }
+    if (charenc) {
+        stemf->charenc = estrdup(charenc);
+    }
+    else {
+        stemf->charenc = NULL;
+    }
+    tf->clone_i   = &stemf_clone_i;
+    tf->destroy_i = &stemf_destroy_i;
+    tf->next      = &stemf_next;
+    return tf;
+}
+/****************************************************************************
+ *
+ * Analyzers
+ *
+ ****************************************************************************/
+/****************************************************************************
+ * Standard
+ ****************************************************************************/
+/*
+ * Builds a standard analyzer: standard tokenizer, optional lowercase
+ * filter, stop filter built from the first +len+ entries of +words+, and a
+ * hyphen filter on the outside.
+ */
+Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
+                                               bool lowercase)
+{
+    TokenStream *ts = standard_tokenizer_new();
+    if (lowercase) {
+        ts = lowercase_filter_new(ts);
+    }
+    ts = stop_filter_new_with_words_len(ts, words, len);
+    ts = hyphen_filter_new(ts);
+    return analyzer_new(ts, NULL, NULL);
+}
+/*
+ * Builds a standard analyzer: standard tokenizer, optional lowercase
+ * filter, stop filter built from the NULL-terminated +words+ array, and a
+ * hyphen filter on the outside.
+ */
+Analyzer *standard_analyzer_new_with_words(const char **words,
+                                           bool lowercase)
+{
+    TokenStream *ts = standard_tokenizer_new();
+    if (lowercase) {
+        ts = lowercase_filter_new(ts);
+    }
+    ts = stop_filter_new_with_words(ts, words);
+    ts = hyphen_filter_new(ts);
+    return analyzer_new(ts, NULL, NULL);
+}
+/*
+ * Builds a multi-byte standard analyzer: multi-byte standard tokenizer,
+ * optional multi-byte lowercase filter, stop filter built from the first
+ * +len+ entries of +words+, and a hyphen filter on the outside.
+ */
+Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
+                                                  int len, bool lowercase)
+{
+    TokenStream *ts = mb_standard_tokenizer_new();
+    if (lowercase) {
+        ts = mb_lowercase_filter_new(ts);
+    }
+    ts = stop_filter_new_with_words_len(ts, words, len);
+    ts = hyphen_filter_new(ts);
+    return analyzer_new(ts, NULL, NULL);
+}
+/*
+ * Builds a multi-byte standard analyzer: multi-byte standard tokenizer,
+ * optional multi-byte lowercase filter, stop filter built from the
+ * NULL-terminated +words+ array, and a hyphen filter on the outside.
+ */
+Analyzer *mb_standard_analyzer_new_with_words(const char **words,
+                                              bool lowercase)
+{
+    TokenStream *ts = mb_standard_tokenizer_new();
+    if (lowercase) {
+        ts = mb_lowercase_filter_new(ts);
+    }
+    ts = stop_filter_new_with_words(ts, words);
+    ts = hyphen_filter_new(ts);
+    return analyzer_new(ts, NULL, NULL);
+}
+/*
+ * Builds a standard analyzer that uses the FULL_ENGLISH_STOP_WORDS list.
+ */
+Analyzer *standard_analyzer_new(bool lowercase)
+{
+    Analyzer *a =
+        standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
+    return a;
+}
+/*
+ * Builds a multi-byte standard analyzer that uses the
+ * FULL_ENGLISH_STOP_WORDS list.
+ */
+Analyzer *mb_standard_analyzer_new(bool lowercase)
+{
+    Analyzer *a =
+        mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                            lowercase);
+    return a;
+}
+/****************************************************************************
+ *
+ * PerFieldAnalyzer
+ *
+ ****************************************************************************/
+#define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
+/*
+ * destroy_i callback for per-field analyzers: destroys the field table,
+ * releases the default analyzer and frees the analyzer itself.
+ */
+void pfa_destroy_i(Analyzer *self)
+{
+    PerFieldAnalyzer *pfa = PFA(self);
+    h_destroy(pfa->dict);
+    a_deref(pfa->default_a);
+    free(self);
+}
+/*
+ * get_ts callback for per-field analyzers: asks the analyzer registered
+ * for +field+ for a token stream over +text+, falling back to the default
+ * analyzer when the field has no entry.
+ */
+TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
+{
+    Analyzer *chosen = h_get(PFA(self)->dict, field);
+    if (NULL == chosen) {
+        chosen = PFA(self)->default_a;
+    }
+    return a_get_ts(chosen, field, text);
+}
+/*
+ * Value destructor for the per-field table: releases the analyzer stored
+ * under a field name.
+ */
+void pfa_sub_a_destroy_i(void *p)
+{
+    a_deref((Analyzer *) p);
+}
+/*
+ * Registers +analyzer+ for +field+. The field name is duplicated for use
+ * as the table key; the analyzer pointer is stored as given.
+ */
+void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
+{
+    char *key = estrdup(field);
+    h_set(PFA(self)->dict, key, analyzer);
+}
+/*
+ * Creates a per-field analyzer that falls back to +default_a+. The field
+ * table frees its keys with free and releases its analyzers through
+ * pfa_sub_a_destroy_i. The new analyzer starts with a reference count of 1.
+ */
+Analyzer *per_field_analyzer_new(Analyzer *default_a)
+{
+    Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
+    a->ref_cnt   = 1;
+    a->get_ts    = pfa_get_ts;
+    a->destroy_i = &pfa_destroy_i;
+    PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
+    PFA(a)->default_a = default_a;
+    return a;
+}
+#ifdef ALONE
+/*
+ * Stand-alone driver, built only when ALONE is defined. Reads lines from
+ * stdin, runs each through a lowercasing standard analyzer and prints every
+ * token as <text:start:end>.
+ */
+int main(int argc, char **argv)
+{
+    char buf[10000];
+    Analyzer *a = standard_analyzer_new(true);
+    TokenStream *ts;
+    Token *tk;
+    (void)argc;
+    (void)argv;
+    while (fgets(buf, 9999, stdin) != NULL) {
+        ts = a_get_ts(a, "hello", buf);
+        while ((tk = ts->next(ts)) != NULL) {
+            /* the offsets are stored as off_t by the tokenizer; cast so the
+             * arguments really match the %ld conversions */
+            printf("<%s:%ld:%ld> ", tk->text, (long)tk->start, (long)tk->end);
+        }
+        printf("\n");
+        ts_deref(ts);
+    }
+    return 0;
+}
+#endif