jk-ferret 0.11.8.2

Files changed (228)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/r_analysis.c ADDED
@@ -0,0 +1,2626 @@
+ #include "lang.h"
+ #ifdef FRT_RUBY_VERSION_1_9
+ # include <ruby/re.h>
+ #else
+ # include <regex.h>
+ #endif
+ #include <locale.h>
+ #ifdef FRT_RUBY_VERSION_1_9
+ # include <ruby/st.h>
+ #else
+ # include <st.h>
+ #endif
+ #include "ferret.h"
+ #include "analysis.h"
+
+ static char *frb_locale = NULL;
+
+ static VALUE mAnalysis;
+
+ static VALUE cToken;
+ static VALUE cAsciiLetterTokenizer;
+ static VALUE cLetterTokenizer;
+ static VALUE cAsciiWhiteSpaceTokenizer;
+ static VALUE cWhiteSpaceTokenizer;
+ static VALUE cAsciiStandardTokenizer;
+ static VALUE cStandardTokenizer;
+ static VALUE cRegExpTokenizer;
+
+ static VALUE cAsciiLowerCaseFilter;
+ static VALUE cLowerCaseFilter;
+ static VALUE cStopFilter;
+ static VALUE cMappingFilter;
+ static VALUE cHyphenFilter;
+ static VALUE cStemFilter;
+
+ static VALUE cAnalyzer;
+ static VALUE cAsciiLetterAnalyzer;
+ static VALUE cLetterAnalyzer;
+ static VALUE cAsciiWhiteSpaceAnalyzer;
+ static VALUE cWhiteSpaceAnalyzer;
+ static VALUE cAsciiStandardAnalyzer;
+ static VALUE cStandardAnalyzer;
+ static VALUE cPerFieldAnalyzer;
+ static VALUE cRegExpAnalyzer;
+
+ static VALUE cTokenStream;
+
+ /* TokenStream Methods */
+ static ID id_next;
+ static ID id_reset;
+ static ID id_clone;
+ static ID id_text;
+
+ /* Analyzer Methods */
+ static ID id_token_stream;
+
+ static VALUE object_space;
+
+ #ifndef FRT_RUBY_VERSION_1_9
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
+                           int, struct re_registers *);
+ #endif
+
+ int
+ frb_rb_hash_size(VALUE hash)
+ {
+ #ifdef FRT_RUBY_VERSION_1_9
+     return RHASH(hash)->ntbl->num_entries;
+ #else
+     return RHASH(hash)->tbl->num_entries;
+ #endif
+ }
+
+ /****************************************************************************
+  *
+  * Utility Methods
+  *
+  ****************************************************************************/
+
+ static char **
+ get_stopwords(VALUE rstop_words)
+ {
+     char **stop_words;
+     int i, len;
+     VALUE rstr;
+     Check_Type(rstop_words, T_ARRAY);
+     len = RARRAY_LEN(rstop_words);
+     stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
+     stop_words[len] = NULL;
+     for (i = 0; i < len; i++) {
+         rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
+         stop_words[i] = rs2s(rstr);
+     }
+     return stop_words;
+ }
+
+ /****************************************************************************
+  *
+  * token methods
+  *
+  ****************************************************************************/
+
+ typedef struct RToken {
+     VALUE text;
+     int start;
+     int end;
+     int pos_inc;
+ } RToken;
+
+ static void
+ frb_token_free(void *p)
+ {
+     free(p);
+ }
+
+ static void
+ frb_token_mark(void *p)
+ {
+     RToken *token = (RToken *)p;
+     rb_gc_mark(token->text);
+ }
+
+ static VALUE
+ frb_token_alloc(VALUE klass)
+ {
+     return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
+                             ALLOC(RToken));
+ }
+
+ static VALUE
+ get_token(Token *tk)
+ {
+     RToken *token = ALLOC(RToken);
+
+     token->text = rb_str_new2(tk->text);
+     token->start = tk->start;
+     token->end = tk->end;
+     token->pos_inc = tk->pos_inc;
+     return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
+ }
+
+ Token *
+ frb_set_token(Token *tk, VALUE rt)
+ {
+     RToken *rtk;
+
+     if (rt == Qnil) return NULL;
+
+     Data_Get_Struct(rt, RToken, rtk);
+     tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
+            rtk->start, rtk->end, rtk->pos_inc);
+     return tk;
+ }
+
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
+
+ /*
+  * call-seq:
+  *    Token.new(text, start, end, pos_inc = 1) -> new Token
+  *
+  * Creates a new token, setting the text, the start and end offsets of the
+  * token and the position increment for the token.
+  *
+  * The position increment is usually set to 1 but you can set it to other
+  * values as needed. For example, if you have a stop word filter you will be
+  * skipping tokens. Let's say you have the stop words "the" and "and" and you
+  * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
+  * "Sea" will have the position increments 2, 1 and 3 respectively.
+  *
+  * Another reason you might want to vary the position increment is if you are
+  * adding synonyms to the index. For example, let's say you have the synonym
+  * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
+  * speedy delivery", you'll add "speedy" first with a position increment of 1
+  * and then "fast" and "quick" with position increments of 0 since they are
+  * represented in the same position.
+  *
+  * The offset values +start+ and +end+ should be byte offsets, not
+  * character offsets. This makes it easy to use those offsets to quickly
+  * access the token in the input string and also to insert highlighting tags
+  * when necessary.
+  *
+  * text::    the main text for the token.
+  * start::   the start offset of the token in bytes.
+  * end::     the end offset of the token in bytes.
+  * pos_inc:: the position increment of a token. See above.
+  * return::  a newly created and assigned Token object
+  */
+ static VALUE
+ frb_token_init(int argc, VALUE *argv, VALUE self)
+ {
+     RToken *token;
+     VALUE rtext, rstart, rend, rpos_inc, rtype;
+     GET_TK(token, self);
+     token->pos_inc = 1;
+     switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
+                          &rend, &rpos_inc, &rtype)) {
+         case 5: /* type gets ignored at this stage */
+         case 4: token->pos_inc = FIX2INT(rpos_inc);
+     }
+     token->text = rb_obj_as_string(rtext);
+     token->start = FIX2INT(rstart);
+     token->end = FIX2INT(rend);
+     return self;
+ }
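+
+ /* A minimal sketch of the Token API documented above (text and byte
+  * offsets are illustrative, taken from the phrase "Next day speedy
+  * delivery"):
+  *
+  *    t = Token.new("speedy", 9, 15)      # pos_inc defaults to 1
+  *    t = Token.new("fast", 9, 15, 0)     # synonym in the same position
+  *    t.text                              #=> "fast"
+  */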
+
+ /*
+  * call-seq:
+  *    token.cmp(other_token) -> bool
+  *
+  * Used to compare two tokens. Token is extended by Comparable so you can
+  * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
+  *
+  * Tokens are sorted by the position in the text at which they occur, i.e.
+  * the start offset. If two tokens have the same start offset (see
+  * pos_inc=), they are sorted by the end offset and then lexically by the
+  * token text.
+  */
+ static VALUE
+ frb_token_cmp(VALUE self, VALUE rother)
+ {
+     RToken *token, *other;
+     int cmp;
+     GET_TK(token, self);
+     GET_TK(other, rother);
+     if (token->start > other->start) {
+         cmp = 1;
+     } else if (token->start < other->start) {
+         cmp = -1;
+     } else {
+         if (token->end > other->end) {
+             cmp = 1;
+         } else if (token->end < other->end) {
+             cmp = -1;
+         } else {
+             cmp = strcmp(rs2s(token->text), rs2s(other->text));
+         }
+     }
+     return INT2FIX(cmp);
+ }
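+
+ /* For example (offsets are illustrative), tokens compare by start offset
+  * first, so:
+  *
+  *    Token.new("old", 4, 7) < Token.new("man", 8, 11)    #=> true
+  */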
+
+ /*
+  * call-seq:
+  *    token.text -> text
+  *
+  * Returns the text that this token represents
+  */
+ static VALUE
+ frb_token_get_text(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return token->text;
+ }
+
+ /*
+  * call-seq:
+  *    token.text = text -> text
+  *
+  * Set the text for this token.
+  */
+ static VALUE
+ frb_token_set_text(VALUE self, VALUE rtext)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->text = rtext;
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token.start -> integer
+  *
+  * Start byte-position of this token
+  */
+ static VALUE
+ frb_token_get_start_offset(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->start);
+ }
+
+ /*
+  * call-seq:
+  *    token.end -> integer
+  *
+  * End byte-position of this token
+  */
+ static VALUE
+ frb_token_get_end_offset(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->end);
+ }
+
+ /*
+  * call-seq:
+  *    token.pos_inc -> integer
+  *
+  * Position Increment for this token
+  */
+ static VALUE
+ frb_token_get_pos_inc(VALUE self)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     return INT2FIX(token->pos_inc);
+ }
+
+ /*
+  * call-seq:
+  *    token.start = start -> integer
+  *
+  * Set start byte-position of this token
+  */
+ static VALUE
+ frb_token_set_start_offset(VALUE self, VALUE rstart)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->start = FIX2INT(rstart);
+     return rstart;
+ }
+
+ /*
+  * call-seq:
+  *    token.end = end -> integer
+  *
+  * Set end byte-position of this token
+  */
+ static VALUE
+ frb_token_set_end_offset(VALUE self, VALUE rend)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->end = FIX2INT(rend);
+     return rend;
+ }
+
+ /*
+  * call-seq:
+  *    token.pos_inc = pos_inc -> integer
+  *
+  * Set the position increment. This determines the position of this token
+  * relative to the previous Token in a TokenStream, used in phrase
+  * searching.
+  *
+  * The default value is 1.
+  *
+  * Some common uses for this are:
+  *
+  * * Set it to zero to put multiple terms in the same position. This is
+  *   useful if, e.g., a word has multiple stems. Searches for phrases
+  *   including either stem will match. In this case, all but the first
+  *   stem's increment should be set to zero: the increment of the first
+  *   instance should be one. Repeating a token with an increment of zero
+  *   can also be used to boost the scores of matches on that token.
+  *
+  * * Set it to values greater than one to inhibit exact phrase matches.
+  *   If, for example, one does not want phrases to match across removed
+  *   stop words, then one could build a stop word filter that removes stop
+  *   words and also sets the increment to the number of stop words removed
+  *   before each non-stop word. Then exact phrase queries will only match
+  *   when the terms occur with no intervening stop words.
+  */
+ static VALUE
+ frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
+ {
+     RToken *token;
+     GET_TK(token, self);
+     token->pos_inc = FIX2INT(rpos_inc);
+     return rpos_inc;
+ }
+
+ /*
+  * call-seq:
+  *    token.to_s -> token_str
+  *
+  * Return a string representation of the token
+  */
+ static VALUE
+ frb_token_to_s(VALUE self)
+ {
+     RToken *token;
+     char *buf;
+     GET_TK(token, self);
+     buf = alloca(RSTRING_LEN(token->text) + 80);
+     sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
+             token->start, token->end, token->pos_inc);
+     return rb_str_new2(buf);
+ }
+
+ /****************************************************************************
+  *
+  * TokenStream Methods
+  *
+  ****************************************************************************/
+
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
+
+ static void
+ frb_ts_mark(void *p)
+ {
+     TokenStream *ts = (TokenStream *)p;
+     if (ts->text) frb_gc_mark(&ts->text);
+ }
+
+ static void
+ frb_ts_free(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+ static void frb_rets_free(TokenStream *ts);
+ static void frb_rets_mark(TokenStream *ts);
+ static Token *rets_next(TokenStream *ts);
+
+ static VALUE
+ get_rb_token_stream(TokenStream *ts)
+ {
+     VALUE rts = object_get(ts);
+     if (rts == Qnil) {
+         if (ts->next == &rets_next) {
+             rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
+                                    &frb_rets_free, ts);
+         } else {
+             rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
+                                    &frb_ts_free, ts);
+         }
+         object_add(ts, rts);
+     }
+     return rts;
+ }
+
+ static INLINE VALUE
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
+ {
+     StringValue(rstr);
+     ts->reset(ts, rs2s(rstr));
+     Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
+     object_add(&ts->text, rstr);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.text = text -> text
+  *
+  * Set the text attribute of the TokenStream to the text you wish to be
+  * tokenized. For example, you may do this:
+  *
+  *    token_stream.text = File.read(file_name)
+  */
+ static VALUE
+ frb_ts_set_text(VALUE self, VALUE rtext)
+ {
+     TokenStream *ts;
+     Data_Get_Struct(self, TokenStream, ts);
+     StringValue(rtext);
+     ts->reset(ts, rs2s(rtext));
+
+     /* prevent garbage collection */
+     rb_ivar_set(self, id_text, rtext);
+
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.text -> text
+  *
+  * Return the text that the TokenStream is tokenizing
+  */
+ static VALUE
+ frb_ts_get_text(VALUE self)
+ {
+     VALUE rtext = Qnil;
+     TokenStream *ts;
+     Data_Get_Struct(self, TokenStream, ts);
+     if ((rtext = object_get(&ts->text)) == Qnil) {
+         if (ts->text) {
+             rtext = rb_str_new2(ts->text);
+             object_set(&ts->text, rtext);
+         }
+     }
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    token_stream.next -> token
+  *
+  * Return the next token from the TokenStream or nil if there are no more
+  * tokens.
+  */
+ static VALUE
+ frb_ts_next(VALUE self)
+ {
+     TokenStream *ts;
+     Token *next;
+     GET_TS(ts, self);
+     next = ts->next(ts);
+     if (next == NULL) {
+         return Qnil;
+     }
+
+     return get_token(next);
+ }
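+
+ /* A typical consumption loop for any TokenStream, since #next returns nil
+  * at the end of the stream (tokenizer and input are illustrative):
+  *
+  *    ts = StandardTokenizer.new("Send dave@gmail.com a note")
+  *    while token = ts.next
+  *      puts token.text
+  *    end
+  */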
+
+ /****************************************************************************
+  * TokenFilter
+  ****************************************************************************/
+
+ #define TkFilt(filter) ((TokenFilter *)(filter))
+
+ static void
+ frb_tf_mark(void *p)
+ {
+     TokenStream *ts = (TokenStream *)p;
+     if (TkFilt(ts)->sub_ts) {
+         frb_gc_mark(&TkFilt(ts)->sub_ts);
+     }
+ }
+
+ static void
+ frb_tf_free(TokenStream *ts)
+ {
+     if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
+         object_del(&TkFilt(ts)->sub_ts);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+
+ /****************************************************************************
+  * CWrappedTokenStream
+  ****************************************************************************/
+
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
+
+ typedef struct CWrappedTokenStream {
+     CachedTokenStream super;
+     VALUE rts;
+ } CWrappedTokenStream;
+
+ static void
+ cwrts_destroy_i(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     rb_hash_delete(object_space, ((VALUE)ts)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(ts);
+ }
+
+ static Token *
+ cwrts_next(TokenStream *ts)
+ {
+     VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
+     return frb_set_token(&(CachedTS(ts)->token), rtoken);
+ }
+
+ static TokenStream *
+ cwrts_reset(TokenStream *ts, char *text)
+ {
+     ts->t = ts->text = text;
+     rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
+     return ts;
+ }
+
+ static TokenStream *
+ cwrts_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
+     VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
+     rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
+     return new_ts;
+ }
+
+ static TokenStream *
+ frb_get_cwrapped_rts(VALUE rts)
+ {
+     TokenStream *ts;
+     if (frb_is_cclass(rts) && DATA_PTR(rts)) {
+         GET_TS(ts, rts);
+         REF(ts);
+     }
+     else {
+         ts = ts_new(CWrappedTokenStream);
+         CWTS(ts)->rts = rts;
+         ts->next = &cwrts_next;
+         ts->reset = &cwrts_reset;
+         ts->clone_i = &cwrts_clone_i;
+         ts->destroy_i = &cwrts_destroy_i;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rts);
+         ts->ref_cnt = 1;
+     }
+     return ts;
+ }
+
+ /****************************************************************************
+  * RegExpTokenStream
+  ****************************************************************************/
+
+ #define P "[_\\/.,-]"
+ #define HASDIGIT "\\w*\\d\\w*"
+ #define ALPHA "[-_[:alpha:]]"
+ #define ALNUM "[-_[:alnum:]]"
+
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
+
+ static const char *TOKEN_RE =
+     ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
+     "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
+     "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
+     "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
+     "|(\\.\\w+)+"
+     "|"
+     ")";
+ static VALUE rtoken_re;
+
+ typedef struct RegExpTokenStream {
+     CachedTokenStream super;
+     VALUE rtext;
+     VALUE regex;
+     VALUE proc;
+     long curr_ind;
+ } RegExpTokenStream;
+
+ static void
+ rets_destroy_i(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     rb_hash_delete(object_space, ((VALUE)ts)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(ts);
+ }
+
+ static void
+ frb_rets_free(TokenStream *ts)
+ {
+     if (object_get(&ts->text) != Qnil) {
+         object_del(&ts->text);
+     }
+     object_del(ts);
+     ts_deref(ts);
+ }
+
+ static void
+ frb_rets_mark(TokenStream *ts)
+ {
+     if (ts->text) frb_gc_mark(&ts->text);
+     rb_gc_mark(RETS(ts)->rtext);
+     rb_gc_mark(RETS(ts)->regex);
+     rb_gc_mark(RETS(ts)->proc);
+ }
+
+ /*
+  * call-seq:
+  *    tokenizer.text = text -> text
+  *
+  * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
+  * tokenize the text from the beginning.
+  */
+ static VALUE
+ frb_rets_set_text(VALUE self, VALUE rtext)
+ {
+     TokenStream *ts;
+     GET_TS(ts, self);
+
+     rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     StringValue(rtext);
+     RETS(ts)->rtext = rtext;
+     RETS(ts)->curr_ind = 0;
+
+     return rtext;
+ }
+
+ /*
+  * call-seq:
+  *    tokenizer.text -> text
+  *
+  * Get the text being tokenized by the tokenizer.
+  */
+ static VALUE
+ frb_rets_get_text(VALUE self)
+ {
+     TokenStream *ts;
+     GET_TS(ts, self);
+     return RETS(ts)->rtext;
+ }
+
+ #ifdef FRT_RUBY_VERSION_1_9
+
+ // partly lifted from ruby 1.9 string.c
+ #include <ruby/encoding.h>
+ #define BEG(no) regs->beg[no]
+ #define END(no) regs->end[no]
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
+ static VALUE
+ scan_once(VALUE str, VALUE pat, long *start)
+ {
+     VALUE match;
+     struct re_registers *regs;
+
+     if (rb_reg_search(pat, str, *start, 0) >= 0) {
+         match = rb_backref_get();
+         regs = RMATCH_REGS(match);
+         if (BEG(0) == END(0)) {
+             rb_encoding *enc = STR_ENC_GET(str);
+             /*
+              * Always consume at least one character of the input string
+              */
+             if (RSTRING_LEN(str) > END(0))
+                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
+                                               RSTRING_END(str), enc);
+             else
+                 *start = END(0)+1;
+         }
+         else {
+             *start = END(0);
+         }
+         return rb_reg_nth_match(0, match);
+     }
+     return Qnil;
+ }
+ //
+
+ static Token *
+ rets_next(TokenStream *ts)
+ {
+     VALUE ret;
+     long rtok_len;
+     int beg, end;
+     Check_Type(RETS(ts)->regex, T_REGEXP);
+     ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
+     if (NIL_P(ret)) return NULL;
+
+     Check_Type(ret, T_STRING);
+     rtok_len = RSTRING_LEN(ret);
+     beg = RETS(ts)->curr_ind - rtok_len;
+     end = RETS(ts)->curr_ind;
+
+     if (NIL_P(RETS(ts)->proc)) {
+         return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
+                       beg, end, 1);
+     } else {
+         VALUE rtok;
+         rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
+         return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
+                       RSTRING_LEN(rtok), beg, end, 1);
+     }
+ }
+
+ #else
+
+ static Token *
+ rets_next(TokenStream *ts)
+ {
+     static struct re_registers regs;
+     int ret, beg, end;
+     long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
+     char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
+     Check_Type(RETS(ts)->regex, T_REGEXP);
+     ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
+                          rtext_ptr, rtext_len,
+                          RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
+                          &regs);
+
+     if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
+     if (ret < 0) return NULL; /* not matched */
+
+     beg = regs.beg[0];
+     RETS(ts)->curr_ind = end = regs.end[0];
+     if (NIL_P(RETS(ts)->proc)) {
+         return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
+                       beg, end, 1);
+     } else {
+         VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
+         rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
+         return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
+                       RSTRING_LEN(rtok), beg, end, 1);
+     }
+ }
+
+ #endif
+
+ static TokenStream *
+ rets_reset(TokenStream *ts, char *text)
+ {
+     RETS(ts)->rtext = rb_str_new2(text);
+     RETS(ts)->curr_ind = 0;
+     return ts;
+ }
+
+ static TokenStream *
+ rets_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
+     return ts;
+ }
+
+ static TokenStream *
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
+ {
+     TokenStream *ts = ts_new(RegExpTokenStream);
+
+     if (rtext != Qnil) {
+         rtext = StringValue(rtext);
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     }
+     ts->reset = &rets_reset;
+     ts->next = &rets_next;
+     ts->clone_i = &rets_clone_i;
+     ts->destroy_i = &rets_destroy_i;
+
+     RETS(ts)->curr_ind = 0;
+     RETS(ts)->rtext = rtext;
+     RETS(ts)->proc = proc;
+
+     if (NIL_P(regex)) {
+         RETS(ts)->regex = rtoken_re;
+     } else {
+         Check_Type(regex, T_REGEXP);
+         RETS(ts)->regex = regex;
+     }
+
+     return ts;
+ }
+
+ /*
+  * call-seq:
+  *    RegExpTokenizer.new(input, /[[:alpha:]]+/)
+  *
+  * Create a new tokenizer based on a regular expression
+  *
+  * input::  text to tokenize
+  * regexp:: regular expression used to recognize tokens in the input
+  */
+ static VALUE
+ frb_rets_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rtext, regex, proc;
+     TokenStream *ts;
+
+     rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
+
+     ts = rets_new(rtext, regex, proc);
+
+     Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
+     object_add(ts, self);
+     return self;
+ }
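+
+ /* For example (pattern and input are illustrative), tokenizing on runs of
+  * letters only:
+  *
+  *    ts = RegExpTokenizer.new("Ferret 0.11, now with bzip2!", /[[:alpha:]]+/)
+  *    ts.next.text    #=> "Ferret"
+  */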
+
+ /****************************************************************************
+  * Tokenizers
+  ****************************************************************************/
+
+ #define TS_ARGS(dflt) \
+     bool lower;\
+     VALUE rlower, rstr;\
+     rb_scan_args(argc, argv, "11", &rstr, &rlower);\
+     lower = (argc ? RTEST(rlower) : dflt)
+
+ /*
+  * call-seq:
+  *    AsciiLetterTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiLetterTokenizer
+  */
+ static VALUE
+ frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, letter_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    LetterTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
+  * is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
+ {
+     TS_ARGS(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
+ }
+
+ /*
+  * call-seq:
+  *    AsciiWhiteSpaceTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiWhiteSpaceTokenizer
+  */
+ static VALUE
+ frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    WhiteSpaceTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
+  * Downcasing is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
+ {
+     TS_ARGS(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
+ }
+
+ /*
+  * call-seq:
+  *    AsciiStandardTokenizer.new() -> tokenizer
+  *
+  * Create a new AsciiStandardTokenizer
+  */
+ static VALUE
+ frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
+ {
+     return get_wrapped_ts(self, rstr, standard_tokenizer_new());
+ }
+
+ /*
+  * call-seq:
+  *    StandardTokenizer.new(lower = true) -> tokenizer
+  *
+  * Create a new StandardTokenizer which optionally downcases tokens.
+  * Downcasing is done according to the current locale.
+  *
+  * lower:: set to false if you don't wish to downcase tokens
+  */
+ static VALUE
+ frb_standard_tokenizer_init(VALUE self, VALUE rstr)
+ {
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
+ }
+
+ /****************************************************************************
+  * Filters
+  ****************************************************************************/
+
+
+ /*
+  * call-seq:
+  *    AsciiLowerCaseFilter.new(token_stream) -> token_stream
+  *
+  * Create an AsciiLowerCaseFilter which normalizes a token's text to
+  * lowercase but only for ASCII characters. For other characters use
+  * LowerCaseFilter.
+  */
+ static VALUE
+ frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = lowercase_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    LowerCaseFilter.new(token_stream) -> token_stream
+  *
+  * Create a LowerCaseFilter which normalizes a token's text to
+  * lowercase based on the current locale.
+  */
+ static VALUE
+ frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     ts = mb_lowercase_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    HyphenFilter.new(token_stream) -> token_stream
+  *
+  * Create a HyphenFilter which filters hyphenated words by adding both the
+  * word concatenated into a single word and the word split into multiple
+  * words, i.e. "e-mail" becomes "email" and "e mail". This way searches for
+  * "e-mail", "email" and "mail" will all match. This filter is used by
+  * default by the StandardAnalyzer.
+  */
+ static VALUE
+ frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = hyphen_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StopFilter.new(token_stream) -> token_stream
+  *    StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
+  *
+  * Create a StopFilter which removes *stop-words* from a TokenStream. You can
+  * optionally specify the stop-words you wish to have removed.
+  *
+  * token_stream:: TokenStream to be filtered
+  * stop_words::   Array of *stop-words* you wish to be filtered out. This
+  *                defaults to a list of English stop-words. The
+  *                Ferret::Analysis module contains a number of stop-word
+  *                lists.
+  */
+ static VALUE
+ frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rsub_ts, rstop_words;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         ts = stop_filter_new_with_words(ts, (const char **)stop_words);
+
+         free(stop_words);
+     } else {
+         ts = stop_filter_new(ts);
+     }
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
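+
+ /* A short sketch of a StopFilter in a filter chain (stop words and input
+  * are illustrative):
+  *
+  *    ts = StopFilter.new(LetterTokenizer.new("the quick brown fox"),
+  *                        ["the", "and"])
+  *    ts.next.text    #=> "quick"
+  */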
+
+ static INLINE void frb_add_mapping_i(TokenStream *mf, VALUE from,
+                                      const char *to)
+ {
+     switch (TYPE(from)) {
+         case T_STRING:
+             mapping_filter_add(mf, rs2s(from), to);
+             break;
+         case T_SYMBOL:
+             mapping_filter_add(mf, rb_id2name(SYM2ID(from)), to);
+             break;
+         default:
+             rb_raise(rb_eArgError,
+                      "cannot map from %s with MappingFilter",
+                      rs2s(rb_obj_as_string(from)));
+             break;
+     }
+ }
+
+ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
+ {
+     if (key == Qundef) {
+         return ST_CONTINUE;
+     } else {
+         TokenStream *mf = (TokenStream *)arg;
+         const char *to;
+         switch (TYPE(value)) {
+             case T_STRING:
+                 to = rs2s(value);
+                 break;
+             case T_SYMBOL:
+                 to = rb_id2name(SYM2ID(value));
+                 break;
+             default:
+                 rb_raise(rb_eArgError,
+                          "cannot map to %s with MappingFilter",
+                          rs2s(rb_obj_as_string(key)));
+                 break;
+         }
+         if (TYPE(key) == T_ARRAY) {
+             int i;
+             for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
+                 frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
+             }
+         }
+         else {
+             frb_add_mapping_i(mf, key, to);
+         }
+     }
+     return ST_CONTINUE;
+ }
+
+
+ /*
+  * call-seq:
+  *    MappingFilter.new(token_stream, mapping) -> token_stream
+  *
+  * Create a MappingFilter which maps strings in tokens. This is usually used
+  * to map UTF-8 characters to ASCII characters for easier searching and
+  * better search recall. The mapping is compiled into a Deterministic Finite
+  * Automaton so it is super fast. This filter can therefore be used for
+  * indexing very large datasets. Currently regular expressions are not
+  * supported. If you are really interested in the feature, please contact me
+  * at dbalmain@gmail.com.
+  *
+  * token_stream:: TokenStream to be filtered
+  * mapping::      Hash of mappings to apply to tokens. The key can be a
+  *                String or an Array of Strings. The value must be a String.
+  *
+  * == Example
+  *
+  *    filt = MappingFilter.new(token_stream,
+  *                             {
+  *                               ['à','á','â','ã','ä','å'] => 'a',
+  *                               ['è','é','ê','ë','ē','ę'] => 'e'
+  *                             })
+  */
+ static VALUE
+ frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
+ {
+     TokenStream *ts;
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     ts = mapping_filter_new(ts);
+     rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
+     mulmap_compile(((MappingFilter *)ts)->mapper);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StemFilter.new(token_stream) -> token_stream
+  *    StemFilter.new(token_stream,
+  *                   algorithm = "english",
+  *                   encoding = "UTF-8") -> token_stream
+  *
+  * Create a StemFilter which uses a snowball stemmer (thank you Martin
+  * Porter) to stem words. You can optionally specify the algorithm (default:
+  * "english") and encoding (default: "UTF-8").
+  *
+  * token_stream:: TokenStream to be filtered
+  * algorithm::    The algorithm (or language) to use
+  * encoding::     The encoding of the data (default: "UTF-8")
+  */
+ static VALUE
+ frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE rsub_ts, ralgorithm, rcharenc;
+     char *algorithm = "english";
+     char *charenc = NULL;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
+     ts = frb_get_cwrapped_rts(rsub_ts);
+     switch (argc) {
+         case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
+         case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
+     }
+     ts = stem_filter_new(ts, algorithm, charenc);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
+     object_add(ts, self);
+     if (((StemFilter *)ts)->stemmer == NULL) {
+         rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
+                                "%s and the language %s", charenc, algorithm);
+     }
+     return self;
+ }
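+
+ /* A short sketch of a StemFilter (input is illustrative); the algorithm
+  * defaults to "english" and the encoding to "UTF-8":
+  *
+  *    ts = StemFilter.new(LetterTokenizer.new("dancing dancers dance"))
+  *    ts.next.text    #=> "danc"
+  */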
+
+ /****************************************************************************
+  *
+  * Analyzer Methods
+  *
+  ****************************************************************************/
+
+ /****************************************************************************
+  * CWrappedAnalyzer Methods
+  ****************************************************************************/
+
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
+
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
+ typedef struct CWrappedAnalyzer
+ {
+     Analyzer super;
+     VALUE ranalyzer;
+ } CWrappedAnalyzer;
+
+ static void
+ cwa_destroy_i(Analyzer *a)
+ {
+     rb_hash_delete(object_space, ((VALUE)a)|1);
+     /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
+     free(a);
+ }
+
+ static TokenStream *
+ cwa_get_ts(Analyzer *a, Symbol field, char *text)
+ {
+     VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                            FSYM2SYM(field), rb_str_new2(text));
+     return frb_get_cwrapped_rts(rts);
+ }
+
+ Analyzer *
+ frb_get_cwrapped_analyzer(VALUE ranalyzer)
+ {
+     Analyzer *a = NULL;
+     if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
+         Data_Get_Struct(ranalyzer, Analyzer, a);
+         REF(a);
+     }
+     else {
+         a = (Analyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
+         a->destroy_i = &cwa_destroy_i;
+         a->get_ts = &cwa_get_ts;
+         a->ref_cnt = 1;
+         ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
+     }
+     return a;
+ }
+
+ static void
+ frb_analyzer_free(Analyzer *a)
+ {
+     object_del(a);
+     a_deref(a);
+ }
+
+ VALUE
+ frb_get_analyzer(Analyzer *a)
+ {
+     VALUE self = Qnil;
+     if (a) {
+         self = object_get(a);
+         if (self == Qnil) {
+             self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
+             REF(a);
+             object_add(a, self);
+         }
+     }
+     return self;
+ }
+
+ INLINE VALUE
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+ {
+     TokenStream *ts = a_get_ts(a, frb_field(rfield), rs2s(rstring));
+
+     /* Make sure that there is no entry already */
+     object_set(&ts->text, rstring);
+     return get_rb_token_stream(ts);
+ }
+
+ /*
+  * call-seq:
+  *    analyzer.token_stream(field_name, input) -> token_stream
+  *
+  * Create a new TokenStream to tokenize +input+. The TokenStream created may
+  * also depend on the +field_name+, although this parameter is typically
+  * ignored.
+  *
+  * field_name:: name of the field to be tokenized
+  * input::      data from the field to be tokenized
+  */
+ static VALUE
+ frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+ {
+     /* NOTE: Any changes made to this method may also need to be applied to
+      * frb_re_analyzer_token_stream */
+     Analyzer *a;
+     GET_A(a, self);
+
+     StringValue(rstring);
+
+     return get_rb_ts_from_a(a, rfield, rstring);
+ }
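+
+ /* For example (field name and input are illustrative):
+  *
+  *    analyzer = StandardAnalyzer.new
+  *    ts = analyzer.token_stream(:title, "The Quick Brown Fox")
+  *    while t = ts.next
+  *      puts t.text
+  *    end
+  */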
+
+ #define GET_LOWER(dflt) \
+     bool lower;\
+     VALUE rlower;\
+     rb_scan_args(argc, argv, "01", &rlower);\
+     lower = (argc ? RTEST(rlower) : dflt)
+
+ /*
+  * call-seq:
+  *    AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
+  *
+  * Create a new AsciiWhiteSpaceAnalyzer which leaves a token's case as is
+  * by default but can optionally downcase tokens. Lowercasing will only be
+  * done to ASCII characters.
+  *
+  * lower:: set to true if you want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(false);
+     a = whitespace_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    WhiteSpaceAnalyzer.new(lower = false) -> analyzer
+  *
+  * Create a new WhiteSpaceAnalyzer which leaves a token's case as is by
+  * default but can optionally downcase tokens. Lowercasing will be done
+  * based on the current locale.
+  *
+  * lower:: set to true if you want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(false);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     a = mb_whitespace_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    AsciiLetterAnalyzer.new(lower = true) -> analyzer
+  *
+  * Create a new AsciiLetterAnalyzer which downcases tokens by default
+  * but can optionally leave case as is. Lowercasing will only be done to
+  * ASCII characters.
+  *
+  * lower:: set to false if you don't want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(true);
+     a = letter_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    LetterAnalyzer.new(lower = true) -> analyzer
+  *
+  * Create a new LetterAnalyzer which downcases tokens by default but can
+  * optionally leave case as is. Lowercasing will be done based on the current
+  * locale.
+  *
+  * lower:: set to false if you don't want the field's tokens to be downcased
+  */
+ static VALUE
+ frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     Analyzer *a;
+     GET_LOWER(true);
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     a = mb_letter_analyzer_new(lower);
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ static VALUE
+ get_rstopwords(const char **stop_words)
+ {
+     char **w = (char **)stop_words;
+     VALUE rstopwords = rb_ary_new();
+
+     while (*w) {
+         rb_ary_push(rstopwords, rb_str_new2(*w));
+         w++;
+     }
+     return rstopwords;
+ }
+
+ /*
+  * call-seq:
+  *    AsciiStandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+  *        -> analyzer
+  *
+  * Create a new AsciiStandardAnalyzer which downcases tokens by default but
+  * can optionally leave case as is. Lowercasing will only be done to ASCII
+  * characters. You can also set the list of stop-words to be used by the
+  * StopFilter.
+  *
+  * lower::      set to false if you don't want the field's tokens to be downcased
+  * stop_words:: list of stop-words to pass to the StopFilter
+  */
+ static VALUE
+ frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     bool lower;
+     VALUE rlower, rstop_words;
+     Analyzer *a;
+     rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
+     lower = ((rlower == Qnil) ? true : RTEST(rlower));
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         a = standard_analyzer_new_with_words((const char **)stop_words, lower);
+         free(stop_words);
+     } else {
+         a = standard_analyzer_new(lower);
+     }
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    StandardAnalyzer.new(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+  *        -> analyzer
+  *
+  * Create a new StandardAnalyzer which downcases tokens by default but can
+  * optionally leave case as is. Lowercasing will be done based on the current
+  * locale. You can also set the list of stop-words to be used by the
+  * StopFilter.
+  *
+  * lower::      set to false if you don't want the field's tokens to be downcased
+  * stop_words:: list of stop-words to pass to the StopFilter
+  */
+ static VALUE
+ frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     bool lower;
+     VALUE rlower, rstop_words;
+     Analyzer *a;
+ #ifndef POSH_OS_WIN32
+     if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
+ #endif
+     rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
+     lower = ((rlower == Qnil) ? true : RTEST(rlower));
+     if (rstop_words != Qnil) {
+         char **stop_words = get_stopwords(rstop_words);
+         a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
+         free(stop_words);
+     } else {
+         a = mb_standard_analyzer_new(lower);
+     }
+     Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ static void
+ frb_h_mark_values_i(void *key, void *value, void *arg)
+ {
+     frb_gc_mark(value);
+ }
+
+ static void
+ frb_pfa_mark(void *p)
+ {
+     frb_gc_mark(PFA(p)->default_a);
+     h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
+ }
+
+ /*** PerFieldAnalyzer ***/
+
+ /*
+  * call-seq:
+  *    PerFieldAnalyzer.new(default_analyzer) -> analyzer
+  *
+  * Create a new PerFieldAnalyzer specifying the default analyzer to use on
+  * all fields that aren't set specifically.
+  *
+  * default_analyzer:: analyzer to be used on fields that aren't otherwise
+  *                    specified
+  */
+ static VALUE
+ frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
+ {
+     Analyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
+     Analyzer *a = per_field_analyzer_new(def);
+     Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+  * call-seq:
+  *    per_field_analyzer.add_field(field_name, analyzer) -> self
+  *    per_field_analyzer[field_name] = analyzer -> self
+  *
+  * Set the analyzer to be used on field +field_name+. Note that +field_name+
+  * should be a symbol.
+  *
+  * field_name:: field we wish to set the analyzer for
+  * analyzer::   analyzer to be used on +field_name+
+  */
+ static VALUE
+ frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
+ {
+     Analyzer *pfa, *a;
+     Data_Get_Struct(self, Analyzer, pfa);
+     a = frb_get_cwrapped_analyzer(ranalyzer);
+
+     pfa_add_field(pfa, frb_field(rfield), a);
+     return self;
+ }
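+
+ /* A short sketch of PerFieldAnalyzer usage (field names are illustrative):
+  *
+  *    pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+  *    pfa[:title] = WhiteSpaceAnalyzer.new(false)
+  *    pfa.add_field(:content, LetterAnalyzer.new)
+  */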
1551
+
1552
+ /*
1553
+ * call-seq:
1554
+ * analyzer.token_stream(field_name, input) -> token_stream
1555
+ *
1556
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
1557
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
1558
+ *
1559
+ * field_name:: name of the field to be tokenized
1560
+ * input:: data from the field to be tokenized
+ */
+ static VALUE
+ frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+ {
+     Analyzer *pfa, *a;
+     Symbol field = frb_field(rfield);
+     GET_A(pfa, self);
+
+     StringValue(rstring);
+     a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+     if (a == NULL) {
+         a = PFA(pfa)->default_a;
+     }
+     if (a->get_ts == cwa_get_ts) {
+         return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                           FSYM2SYM(field), rb_str_new2(rs2s(rstring)));
+     }
+     else {
+         return get_rb_ts_from_a(a, rfield, rstring);
+     }
+ }
+
+ /*** RegExpAnalyzer ***/
+
+ static void
+ frb_re_analyzer_mark(Analyzer *a)
+ {
+     frb_gc_mark(a->current_ts);
+ }
+
+ static void
+ re_analyzer_destroy_i(Analyzer *a)
+ {
+     ts_deref(a->current_ts);
+     free(a);
+ }
+
+ /*
+ * call-seq:
+ *    RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
+ *
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
+ * given regular expression, lowercasing the tokens if required.
+ *
+ * reg_exp:: the token matcher for the tokenizer to use
+ * lower:: set to false if you don't want to downcase the tokens
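+ *
+ * === Example
+ *
+ * A minimal usage sketch:
+ *
+ *   analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/, false)
+ *   ts = analyzer.token_stream(:field, "Dave's résumé")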
+ */
+ static VALUE
+ frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+     VALUE lower, rets, regex, proc;
+     Analyzer *a;
+     TokenStream *ts;
+     rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
+
+     ts = rets_new(Qnil, regex, proc);
+     rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
+     object_add(ts, rets);
+
+     if (lower != Qfalse) {
+         rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
+         ts = DATA_PTR(rets);
+     }
+     REF(ts);
+
+     a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
+     Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
+     object_add(a, self);
+     return self;
+ }
+
+ /*
+ * call-seq:
+ *    analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
+ * also depend on the +field_name+, although this parameter is typically
+ * ignored.
+ *
+ * field_name:: name of the field to be tokenized
+ * input:: data from the field to be tokenized
+ */
+ static VALUE
+ frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
+ {
+     TokenStream *ts;
+     Analyzer *a;
+     GET_A(a, self);
+
+     StringValue(rtext);
+
+     ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));
+
+     /* Make sure that there is no entry already */
+     object_set(&ts->text, rtext);
+     if (ts->next == &rets_next) {
+         RETS(ts)->rtext = rtext;
+         rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
+     }
+     else {
+         RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
+         rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
+     }
+     return get_rb_token_stream(ts);
+ }
+
+ /****************************************************************************
+ *
+ * Locale stuff
+ *
+ ****************************************************************************/
+
+ /*
+ * call-seq:
+ *    Ferret.locale -> locale_str
+ *
+ * Returns a string corresponding to the currently set locale. For example;
+ *
+ *   puts Ferret.locale #=> "en_US.UTF-8"
+ */
+ static VALUE frb_get_locale(VALUE self)
+ {
+     return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
+ }
+
+ /*
+ * call-seq:
+ *    Ferret.locale = "en_US.UTF-8"
+ *
+ * Set the global locale. You should use this method to set different locales
+ * when indexing documents with different encodings.
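+ *
+ * === Example
+ *
+ * A minimal sketch; the locale name must be available on your system:
+ *
+ *   Ferret.locale = "de_DE.UTF-8"
+ *   puts Ferret.locale #=> "de_DE.UTF-8"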
+ */
+ static VALUE frb_set_locale(VALUE self, VALUE locale)
+ {
+     char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
+     frb_locale = setlocale(LC_CTYPE, l);
+     return frb_locale ? rb_str_new2(frb_locale) : Qnil;
+ }
+
+ /****************************************************************************
+ *
+ * Init Functions
+ *
+ ****************************************************************************/
+
+ /*
+ * Document-class: Ferret::Analysis::Token
+ *
+ * == Summary
+ *
+ * A Token is an occurrence of a term from the text of a field. It consists
+ * of a term's text and the start and end offset of the term in the text of
+ * the field.
+ *
+ * The start and end offsets permit applications to re-associate a token with
+ * its source text, e.g., to display highlighted query terms in a document
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ * display, etc.
+ *
+ * === Attributes
+ *
+ * text:: the term's text, which may have been modified by a TokenFilter or
+ *        Tokenizer from the text originally found in the document
+ * start:: the position of the first character corresponding to this token
+ *         in the source text
+ * end:: one greater than the position of the last character corresponding
+ *       to this token. Note that the difference between @end_offset and
+ *       @start_offset may not be equal to @text.length(), as the term text
+ *       may have been altered by a stemmer or some other filter.
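+ *
+ * === Example
+ *
+ * A minimal sketch; "begin" stands for the stemmed form of "Beginning"
+ * found at offsets 10 to 19 in the source text:
+ *
+ *   token = Token.new("begin", 10, 19)
+ *   token.end - token.start #=> 9, which need not equal token.text.length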
+ */
+ static void Init_Token(void)
+ {
+     cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
+     rb_define_alloc_func(cToken, frb_token_alloc);
+     rb_include_module(cToken, rb_mComparable);
+
+     rb_define_method(cToken, "initialize", frb_token_init, -1);
+     rb_define_method(cToken, "<=>", frb_token_cmp, 1);
+     rb_define_method(cToken, "text", frb_token_get_text, 0);
+     rb_define_method(cToken, "text=", frb_token_set_text, 1);
+     rb_define_method(cToken, "start", frb_token_get_start_offset, 0);
+     rb_define_method(cToken, "start=", frb_token_set_start_offset, 1);
+     rb_define_method(cToken, "end", frb_token_get_end_offset, 0);
+     rb_define_method(cToken, "end=", frb_token_set_end_offset, 1);
+     rb_define_method(cToken, "pos_inc", frb_token_get_pos_inc, 0);
+     rb_define_method(cToken, "pos_inc=", frb_token_set_pos_inc, 1);
+     rb_define_method(cToken, "to_s", frb_token_to_s, 0);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::TokenStream
+ *
+ * == Summary
+ *
+ * A TokenStream enumerates the sequence of tokens, either from
+ * fields of a document or from query text.
+ *
+ * This is an abstract class. Concrete subclasses are:
+ *
+ * Tokenizer:: a TokenStream whose input is a string
+ * TokenFilter:: a TokenStream whose input is another TokenStream
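+ *
+ * === Example
+ *
+ * A minimal sketch of enumerating a stream; #next returns nil once the
+ * stream is exhausted:
+ *
+ *   ts = StandardTokenizer.new("Dave's résumé")
+ *   while token = ts.next
+ *     puts token.text
+ *   end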
+ */
+ static void Init_TokenStream(void)
+ {
+     cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+     frb_mark_cclass(cTokenStream);
+     rb_define_method(cTokenStream, "next", frb_ts_next, 0);
+     rb_define_method(cTokenStream, "text=", frb_ts_set_text, 1);
+     rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
+ *
+ * == Summary
+ *
+ * An AsciiLetterTokenizer is a tokenizer that divides text at non-letters,
+ * treating anything outside of A-Za-z as a separator. That is to say, it
+ * defines tokens as maximal strings of adjacent ASCII letters, as defined
+ * by the regular expression _/[A-Za-z]+/_.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_AsciiLetterTokenizer(void)
+ {
+     cAsciiLetterTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+     frb_mark_cclass(cAsciiLetterTokenizer);
+     rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiLetterTokenizer, "initialize",
+                      frb_a_letter_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterTokenizer
+ *
+ * == Summary
+ *
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
+ * to say, it defines tokens as maximal strings of adjacent letters, as
+ * defined by the regular expression _/[[:alpha:]]+/_, where [[:alpha:]]
+ * matches all letter characters in the current locale.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_LetterTokenizer(void)
+ {
+     cLetterTokenizer =
+         rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+     frb_mark_cclass(cLetterTokenizer);
+     rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
+     rb_define_method(cLetterTokenizer, "initialize",
+                      frb_letter_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
+ *
+ * == Summary
+ *
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiWhiteSpaceTokenizer(void)
+ {
+     cAsciiWhiteSpaceTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+                               cTokenStream);
+     frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
+     rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
+                      frb_a_whitespace_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
+ *
+ * == Summary
+ *
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_WhiteSpaceTokenizer(void)
+ {
+     cWhiteSpaceTokenizer =
+         rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+     frb_mark_cclass(cWhiteSpaceTokenizer);
+     rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
+     rb_define_method(cWhiteSpaceTokenizer, "initialize",
+                      frb_whitespace_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
+ *
+ * == Summary
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiStandardTokenizer(void)
+ {
+     cAsciiStandardTokenizer =
+         rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+     frb_mark_cclass(cAsciiStandardTokenizer);
+     rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
+     rb_define_method(cAsciiStandardTokenizer, "initialize",
+                      frb_a_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardTokenizer
+ *
+ * == Summary
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_StandardTokenizer(void)
+ {
+     cStandardTokenizer =
+         rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+     frb_mark_cclass(cStandardTokenizer);
+     rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
+     rb_define_method(cStandardTokenizer, "initialize",
+                      frb_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpTokenizer
+ *
+ * == Summary
+ *
+ * A tokenizer that recognizes tokens based on a regular expression passed to
+ * the constructor. Most tokenizers you are likely to need can be built with
+ * this class.
+ *
+ * === Example
+ *
+ * Below is an example of a simple implementation of a LetterTokenizer using
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
+ * characters separated by one or more non-alphabetic characters.
+ *
+ *   # of course you would add more than just é
+ *   RegExpTokenizer.new(input, /[[:alpha:]é]+/)
+ *
+ *   "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ *   => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_RegExpTokenizer(void)
+ {
+     cRegExpTokenizer =
+         rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+     frb_mark_cclass(cRegExpTokenizer);
+     rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
+     rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
+     rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
+     rb_define_method(cRegExpTokenizer, "initialize",
+                      frb_rets_init, -1);
+     rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
+     rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
+ }
+
+ /***************/
+ /*** Filters ***/
+ /***************/
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
+ *
+ * == Summary
+ *
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
+ * ASCII characters. For other characters use LowerCaseFilter.
+ *
+ * === Example
+ *
+ *   ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
+ *
+ */
+ static void Init_AsciiLowerCaseFilter(void)
+ {
+     cAsciiLowerCaseFilter =
+         rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+     frb_mark_cclass(cAsciiLowerCaseFilter);
+     rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
+     rb_define_method(cAsciiLowerCaseFilter, "initialize",
+                      frb_a_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LowerCaseFilter
+ *
+ * == Summary
+ *
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
+ * current locale.
+ *
+ * === Example
+ *
+ *   ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
+ *
+ */
+ static void Init_LowerCaseFilter(void)
+ {
+     cLowerCaseFilter =
+         rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+     frb_mark_cclass(cLowerCaseFilter);
+     rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
+     rb_define_method(cLowerCaseFilter, "initialize",
+                      frb_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::HyphenFilter
+ *
+ * == Summary
+ *
+ * HyphenFilter filters hyphenated words by adding both the concatenated form
+ * and the split form of each word, i.e. "e-mail" becomes "email" and
+ * "e mail". This way searches for "e-mail", "email" and "mail" will all
+ * match. This filter is used by default by the StandardAnalyzer.
+ *
+ * === Example
+ *
+ *   ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+ static void Init_HyphenFilter(void)
+ {
+     cHyphenFilter =
+         rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+     frb_mark_cclass(cHyphenFilter);
+     rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
+     rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::MappingFilter
+ *
+ * == Summary
+ *
+ * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
+ * characters to ASCII characters for easier searching and better search
+ * recall. The mapping is compiled into a Deterministic Finite Automaton so
+ * it is super fast. This Filter can therefore be used for indexing very
+ * large datasets. Currently regular expressions are not supported. If you
+ * are really interested in the feature, please contact me at
+ * dbalmain@gmail.com.
+ *
+ * == Example
+ *
+ *   mapping = {
+ *     ['à','á','â','ã','ä','å','ā','ă'] => 'a',
+ *     'æ' => 'ae',
+ *     ['ď','đ'] => 'd',
+ *     ['ç','ć','č','ĉ','ċ'] => 'c',
+ *     ['è','é','ê','ë','ē','ę','ě','ĕ','ė'] => 'e',
+ *     ['ƒ'] => 'f',
+ *     ['ĝ','ğ','ġ','ģ'] => 'g',
+ *     ['ĥ','ħ'] => 'h',
+ *     ['ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
+ *     ['į','ı','ij','ĵ'] => 'j',
+ *     ['ķ','ĸ'] => 'k',
+ *     ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
+ *     ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
+ *     ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ'] => 'o',
+ *     ['œ'] => 'oe',
+ *     ['ą'] => 'a',
+ *     ['ŕ','ř','ŗ'] => 'r',
+ *     ['ś','š','ş','ŝ','ș'] => 's',
+ *     ['ť','ţ','ŧ','ț'] => 't',
+ *     ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
+ *     ['ŵ'] => 'w',
+ *     ['ý','ÿ','ŷ'] => 'y',
+ *     ['ž','ż','ź'] => 'z'
+ *   }
+ *   filt = MappingFilter.new(token_stream, mapping)
+ */
+ static void Init_MappingFilter(void)
+ {
+     cMappingFilter =
+         rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
+     frb_mark_cclass(cMappingFilter);
+     rb_define_alloc_func(cMappingFilter, frb_data_alloc);
+     rb_define_method(cMappingFilter, "initialize",
+                      frb_mapping_filter_init, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StopFilter
+ *
+ * == Summary
+ *
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
+ * that you don't wish to be indexed. Usually they will be common words like
+ * "the" and "and", although you can specify whichever words you want.
+ *
+ * === Example
+ *
+ *   ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
+ */
+ static void Init_StopFilter(void)
+ {
+     cStopFilter =
+         rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+     frb_mark_cclass(cStopFilter);
+     rb_define_alloc_func(cStopFilter, frb_data_alloc);
+     rb_define_method(cStopFilter, "initialize",
+                      frb_stop_filter_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StemFilter
+ *
+ * == Summary
+ *
+ * A StemFilter takes a term and transforms the term as per the Snowball
+ * stemming algorithm. Note: the input to the stemming filter must already
+ * be in lower case, so you will need to use a LowerCaseFilter or a
+ * lowercasing Tokenizer further down the TokenStream chain in order for
+ * this to work properly!
+ *
+ * === Available algorithms and encodings
+ *
+ *   Algorithm    | Algorithm Pseudonyms     | Encoding
+ *   -------------|--------------------------|-----------------------
+ *   "danish"     | "da", "dan"              | "ISO_8859_1", "UTF_8"
+ *   "dutch"      | "dut", "nld"             | "ISO_8859_1", "UTF_8"
+ *   "english"    | "en", "eng"              | "ISO_8859_1", "UTF_8"
+ *   "finnish"    | "fi", "fin"              | "ISO_8859_1", "UTF_8"
+ *   "french"     | "fr", "fra", "fre"       | "ISO_8859_1", "UTF_8"
+ *   "german"     | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
+ *   "hungarian"  | "hu", "hun"              | "ISO_8859_1", "UTF_8"
+ *   "italian"    | "it", "ita"              | "ISO_8859_1", "UTF_8"
+ *   "norwegian"  | "no", "nor"              | "ISO_8859_1", "UTF_8"
+ *   "porter"     |                          | "ISO_8859_1", "UTF_8"
+ *   "portuguese" | "por", "pt"              | "ISO_8859_1", "UTF_8"
+ *   "romanian"   | "ro", "ron", "rum"       | "ISO_8859_2", "UTF_8"
+ *   "russian"    | "ru", "rus"              | "KOI8_R", "UTF_8"
+ *   "spanish"    | "es", "esl"              | "ISO_8859_1", "UTF_8"
+ *   "swedish"    | "sv", "swe"              | "ISO_8859_1", "UTF_8"
+ *   "turkish"    | "tr", "tur"              | "UTF_8"
+ *
+ * === New Stemmers
+ *
+ * The following stemmers have recently been added. Please try them out;
+ *
+ * * Hungarian
+ * * Romanian
+ * * Turkish
+ *
+ * === Example
+ *
+ * To use this filter with other analyzers, you'll want to write an Analyzer
+ * class that sets up the TokenStream chain as you want it. To use this with
+ * a lowercasing Tokenizer, for example, you'd write an analyzer like this:
+ *
+ *   class MyAnalyzer < Analyzer
+ *     def token_stream(field, str)
+ *       return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ *     end
+ *   end
+ *
+ *   "debate debates debated debating debater"
+ *   => ["debat", "debat", "debat", "debat", "debat"]
+ *
+ * === Attributes
+ *
+ * token_stream:: TokenStream to be filtered
+ * algorithm:: The algorithm (or language) to use (default: "english")
+ * encoding:: The encoding of the data (default: "UTF-8")
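+ *
+ * A minimal construction sketch; +ts+ is assumed to be an existing
+ * lowercased TokenStream:
+ *
+ *   stemmed_ts = StemFilter.new(ts, "spanish", "UTF_8")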
+ */
+ static void Init_StemFilter(void)
+ {
+     cStemFilter =
+         rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+     frb_mark_cclass(cStemFilter);
+     rb_define_alloc_func(cStemFilter, frb_data_alloc);
+     rb_define_method(cStemFilter, "initialize",
+                      frb_stem_filter_init, -1);
+ }
+
+ /*************************/
+ /*** * * Analyzers * * ***/
+ /*************************/
+
+ /*
+ * Document-class: Ferret::Analysis::Analyzer
+ *
+ * == Summary
+ *
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
+ * a policy for extracting index terms from text.
+ *
+ * Typical implementations first build a Tokenizer, which breaks the stream
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
+ * may then be applied to the output of the Tokenizer.
+ *
+ * The default Analyzer just creates a LowerCaseTokenizer which converts
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
+ *
+ * === Example
+ *
+ * To create your own custom Analyzer you simply need to implement a
+ * token_stream method which takes the field name and the data to be
+ * tokenized as parameters and returns a TokenStream. Most analyzers
+ * typically ignore the field name.
+ *
+ * Here we'll create a StemmingAnalyzer;
+ *
+ *   class MyAnalyzer < Analyzer
+ *     def token_stream(field, str)
+ *       return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ *     end
+ *   end
+ */
+ static void Init_Analyzer(void)
+ {
+     cAnalyzer =
+         rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+     frb_mark_cclass(cAnalyzer);
+     rb_define_alloc_func(cAnalyzer, frb_data_alloc);
+     rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
+     rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
+ *
+ * == Summary
+ *
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of ASCII letters. If implemented in Ruby it would look
+ * like;
+ *
+ *   class AsciiLetterAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
+ *       else
+ *         return AsciiLetterTokenizer.new(str)
+ *       end
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiLetterTokenizer and
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
+ * characters so you should use the LetterAnalyzer if you want to analyze
+ * multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiLetterAnalyzer(void)
+ {
+     cAsciiLetterAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiLetterAnalyzer);
+     rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiLetterAnalyzer, "initialize",
+                      frb_a_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterAnalyzer
+ *
+ * == Summary
+ *
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of letters, as recognized by the current locale. If
+ * implemented in Ruby it would look like;
+ *
+ *   class LetterAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       return LetterTokenizer.new(str, @lower)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the LetterTokenizer.
+ */
+ static void Init_LetterAnalyzer(void)
+ {
+     cLetterAnalyzer =
+         rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+     frb_mark_cclass(cLetterAnalyzer);
+     rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
+     rb_define_method(cLetterAnalyzer, "initialize",
+                      frb_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the
+ * AsciiWhiteSpaceAnalyzer would look like;
+ *
+ *   class AsciiWhiteSpaceAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
+ *       else
+ *         return AsciiWhiteSpaceTokenizer.new(str)
+ *       end
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
+ * as "UTF-8".
+ */
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
+ {
+     cAsciiWhiteSpaceAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
+     rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
+                      frb_a_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
+ * would look like;
+ *
+ *   class WhiteSpaceAnalyzer
+ *     def initialize(lower = true)
+ *       @lower = lower
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       return WhiteSpaceTokenizer.new(str, @lower)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the WhiteSpaceTokenizer.
+ */
+ static void Init_WhiteSpaceAnalyzer(void)
+ {
+     cWhiteSpaceAnalyzer =
+         rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+     frb_mark_cclass(cWhiteSpaceAnalyzer);
+     rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
+     rb_define_method(cWhiteSpaceAnalyzer, "initialize",
+                      frb_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiStandardAnalyzer is the most advanced of the available
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
+ *
+ *   class AsciiStandardAnalyzer
+ *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+ *       @lower = lower
+ *       @stop_words = stop_words
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       ts = AsciiStandardTokenizer.new(str)
+ *       ts = AsciiLowerCaseFilter.new(ts) if @lower
+ *       ts = StopFilter.new(ts, @stop_words)
+ *       ts = HyphenFilter.new(ts)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
+ * add your own list of stop-words if you wish. Note that this tokenizer
+ * won't recognize non-ASCII characters so you should use the
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiStandardAnalyzer(void)
+ {
+     cAsciiStandardAnalyzer =
+         rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+     frb_mark_cclass(cAsciiStandardAnalyzer);
+     rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
+     rb_define_method(cAsciiStandardAnalyzer, "initialize",
+                      frb_a_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardAnalyzer
+ *
+ * == Summary
+ *
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
+ * it were implemented in Ruby it would look like this;
+ *
+ *   class StandardAnalyzer
+ *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
+ *       @lower = lower
+ *       @stop_words = stop_words
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       ts = StandardTokenizer.new(str)
+ *       ts = LowerCaseFilter.new(ts) if @lower
+ *       ts = StopFilter.new(ts, @stop_words)
+ *       ts = HyphenFilter.new(ts)
+ *     end
+ *   end
+ *
+ * As you can see it makes use of the StandardTokenizer and you can also add
+ * your own list of stop-words if you wish.
+ */
+ static void Init_StandardAnalyzer(void)
+ {
+     cStandardAnalyzer =
+         rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+     frb_mark_cclass(cStandardAnalyzer);
+     rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
+     rb_define_method(cStandardAnalyzer, "initialize",
+                      frb_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
+ *
+ * == Summary
+ *
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
+ * you want each field analyzed.
+ *
+ * === Example
+ *
+ *   # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
+ *   pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
+ *
+ *   # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
+ *   pfa[:title] = WhiteSpaceAnalyzer.new(false)
+ *
+ *   # Use a custom analyzer on the :created_at field
+ *   pfa[:created_at] = DateAnalyzer.new
+ */
+ static void Init_PerFieldAnalyzer(void)
+ {
+     cPerFieldAnalyzer =
+         rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+     frb_mark_cclass(cPerFieldAnalyzer);
+     rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
+     rb_define_method(cPerFieldAnalyzer, "initialize",
+                      frb_per_field_analyzer_init, 1);
+     rb_define_method(cPerFieldAnalyzer, "add_field",
+                      frb_per_field_analyzer_add_field, 2);
+     rb_define_method(cPerFieldAnalyzer, "[]=",
+                      frb_per_field_analyzer_add_field, 2);
+     rb_define_method(cPerFieldAnalyzer, "token_stream",
+                      frb_pfa_analyzer_token_stream, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
+ *
+ * == Summary
+ *
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
+ * implemented in Ruby it would look like this;
+ *
+ *   class RegExpAnalyzer
+ *     def initialize(reg_exp, lower = true)
+ *       @lower = lower
+ *       @reg_exp = reg_exp
+ *     end
+ *
+ *     def token_stream(field, str)
+ *       if @lower
+ *         return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
+ *       else
+ *         return RegExpTokenizer.new(str, @reg_exp)
+ *       end
+ *     end
+ *   end
+ *
+ * === Example
+ *
+ *   csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
+ */
+ static void Init_RegExpAnalyzer(void)
+ {
+     cRegExpAnalyzer =
+         rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+     frb_mark_cclass(cRegExpAnalyzer);
+     rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
+     rb_define_method(cRegExpAnalyzer, "initialize",
+                      frb_re_analyzer_init, -1);
+     rb_define_method(cRegExpAnalyzer, "token_stream",
+                      frb_re_analyzer_token_stream, 2);
+ }
+
+ /* rdoc hack
+ extern VALUE mFerret = rb_define_module("Ferret");
+ */
+
+ /*
+ * Document-module: Ferret::Analysis
+ *
+ * == Summary
+ *
+ * The Analysis module contains all the classes used to analyze and tokenize
+ * the data to be indexed. There are three main classes you need to know
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
+ *
+ * == Classes
+ *
+ * === Analyzer
+ *
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
+ * indexing class when you create it and it will create the TokenStreams
+ * necessary to tokenize the fields in the documents. Most of the time you
+ * won't need to worry about TokenStreams and Tokens; one of the Analyzers
+ * distributed with Ferret will usually do exactly what you need. Otherwise
+ * you'll need to implement a custom analyzer.
+ *
+ * === TokenStream
+ *
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
+ * and post-processes the Tokens. You can chain as many TokenFilters together
+ * as you like, but the chain always needs to finish with a Tokenizer.
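+ *
+ * A minimal sketch of such a chain, with the Tokenizer at its source:
+ *
+ *   ts = StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))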
+ *
+ * === Token
+ *
+ * A Token is a single term from a document field. A token contains the text
+ * representing the term as well as the start and end offset of the token.
+ * The start and end offsets will represent the token as it appears in the
+ * source field. Some TokenFilters may change the text in the Token but the
+ * start and end offsets should stay the same, so (end - start) won't
+ * necessarily be equal to the length of the text in the token. For example,
+ * using a stemming TokenFilter, the term "Beginning" might have start and
+ * end offsets of 10 and 19 respectively ("Beginning".length == 9) but
+ * Token#text might be "begin" (after stemming).
+ */
+ void
+ Init_Analysis(void)
+ {
+     mAnalysis = rb_define_module_under(mFerret, "Analysis");
+
+     /* TokenStream Methods */
+     id_next = rb_intern("next");
+     id_reset = rb_intern("text=");
+     id_clone = rb_intern("clone");
+     id_text = rb_intern("@text");
+
+     /* Analyzer Methods */
+     id_token_stream = rb_intern("token_stream");
+
+     object_space = rb_hash_new();
+     rb_define_const(mFerret, "OBJECT_SPACE", object_space);
+
+     /*** * * Locale stuff * * ***/
+     rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
+     rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);
+
+     rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
+                     get_rstopwords(ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
+                     get_rstopwords(FULL_ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
+                     get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
+                     get_rstopwords(FULL_FRENCH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
+                     get_rstopwords(FULL_SPANISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
+                     get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
+                     get_rstopwords(FULL_ITALIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
+                     get_rstopwords(FULL_GERMAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
+                     get_rstopwords(FULL_DUTCH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
+                     get_rstopwords(FULL_SWEDISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
+                     get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
+                     get_rstopwords(FULL_DANISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
+                     get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
+                     get_rstopwords(FULL_FINNISH_STOP_WORDS));
+     rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
+                     get_rstopwords(FULL_HUNGARIAN_STOP_WORDS));
+
+     Init_Token();
+     Init_TokenStream();
+
+     Init_AsciiLetterTokenizer();
+     Init_LetterTokenizer();
+
+     Init_AsciiWhiteSpaceTokenizer();
+     Init_WhiteSpaceTokenizer();
+
+     Init_AsciiStandardTokenizer();
+     Init_StandardTokenizer();
+
+     Init_RegExpTokenizer();
+
+     Init_AsciiLowerCaseFilter();
+     Init_LowerCaseFilter();
+     Init_HyphenFilter();
+     Init_StopFilter();
+     Init_MappingFilter();
+     Init_StemFilter();
+
+     Init_Analyzer();
+     Init_AsciiLetterAnalyzer();
+     Init_LetterAnalyzer();
+     Init_AsciiWhiteSpaceAnalyzer();
+     Init_WhiteSpaceAnalyzer();
+     Init_AsciiStandardAnalyzer();
+     Init_StandardAnalyzer();
+     Init_PerFieldAnalyzer();
+     Init_RegExpAnalyzer();
+ }