ferret 0.10.2 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +31 -36
- data/ext/analysis.c +97 -37
- data/ext/analysis.h +11 -0
- data/ext/ferret.c +10 -0
- data/ext/ferret.h +2 -0
- data/ext/inc/lang.h +1 -0
- data/ext/index.c +2 -2
- data/ext/lang.h +1 -0
- data/ext/q_parser.c +25 -5
- data/ext/r_analysis.c +97 -53
- data/ext/r_index.c +0 -1
- data/ext/r_search.c +1 -1
- data/ext/search.c +7 -3
- data/ext/term_vectors.c +1 -1
- data/lib/ferret/index.rb +94 -48
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_analyzer.rb +24 -8
- data/test/unit/analysis/tc_token_stream.rb +7 -0
- data/test/unit/index/tc_index.rb +2 -2
- data/test/unit/query_parser/tc_query_parser.rb +3 -3
- metadata +12 -7
- data/ext/tags +0 -7841
data/ext/r_analysis.c
CHANGED
@@ -18,6 +18,7 @@ static VALUE cRegExpTokenizer;
 static VALUE cAsciiLowerCaseFilter;
 static VALUE cLowerCaseFilter;
 static VALUE cStopFilter;
+static VALUE cHyphenFilter;
 static VALUE cStemFilter;
 
 static VALUE cAnalyzer;
@@ -568,22 +569,20 @@ static TokenStream *
 frt_get_cwrapped_rts(VALUE rts)
 {
    TokenStream *ts;
-    switch (TYPE(rts)) {
-        case T_DATA:
-            GET_TS(ts, rts);
-            REF(ts);
-            break;
-        default:
-            ts = ts_new(CWrappedTokenStream);
-            CWTS(ts)->rts = rts;
-            ts->next = &cwrts_next;
-            ts->reset = &cwrts_reset;
-            ts->clone_i = &cwrts_clone_i;
-            ts->destroy_i = &cwrts_destroy_i;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(rts), rts);
-            ts->ref_cnt = 1;
-            break;
+    if (rb_ivar_get(CLASS_OF(rts), id_cclass) == Qtrue) {
+        GET_TS(ts, rts);
+        REF(ts);
+    }
+    else {
+        ts = ts_new(CWrappedTokenStream);
+        CWTS(ts)->rts = rts;
+        ts->next = &cwrts_next;
+        ts->reset = &cwrts_reset;
+        ts->clone_i = &cwrts_clone_i;
+        ts->destroy_i = &cwrts_destroy_i;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(rts), rts);
+        ts->ref_cnt = 1;
    }
    return ts;
 }
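The else branch is what a token stream written in pure Ruby now goes through: anything whose class was not registered as a C-backed class (the new id_cclass flag) gets wrapped in a CWrappedTokenStream whose callbacks dispatch back into Ruby. A minimal sketch of such a stream; the class name and tokenization are made up, and the callback protocol (a #next returning a Token or nil) is assumed from the wrapper functions above:

    require 'ferret'
    include Ferret::Analysis

    # A hypothetical pure-Ruby token stream. Its class carries no C-class
    # flag, so frt_get_cwrapped_rts takes the else branch, wraps it in a
    # CWrappedTokenStream and pins it in object_space so the GC cannot
    # collect it while C code holds the pointer.
    class EveryWordTokenStream
      def initialize(text)
        @text = text
        @offset = 0
      end

      # Return the next Token, or nil when the stream is exhausted.
      def next
        start = @text.index(/\S/, @offset) or return nil
        stop = @text.index(/\s/, start) || @text.length
        @offset = stop
        Token.new(@text[start...stop], start, stop)
      end
    end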
@@ -911,6 +910,28 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
    return self;
 }
 
+/*
+ *  call-seq:
+ *     HyphenFilter.new(token_stream) -> token_stream
+ *
+ *  Create a HyphenFilter which filters hyphenated words. The way it works is
+ *  by adding both the word concatenated into a single word and split into
+ *  multiple words, i.e. "e-mail" becomes "email" and "e mail". This way a
+ *  search for "e-mail", "email" and "mail" will all match. This filter is
+ *  used by default by the StandardAnalyzer.
+ */
+static VALUE
+frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+{
+    TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+    ts = hyphen_filter_new(ts);
+    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+    Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+    object_add(ts, self);
+    return self;
+}
+
 /*
  *  call-seq:
  *     StopFilter.new(token_stream) -> token_stream
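At the Ruby level the new filter wraps any other token stream. A small sketch, assuming the usual 0.10 stream API where #next returns Token objects until the stream is exhausted:

    require 'ferret'
    include Ferret::Analysis

    # HyphenFilter adds the concatenated word as well as the parts, so
    # "e-mail" should yield "email", "e" and "mail".
    ts = HyphenFilter.new(StandardTokenizer.new("e-mail set-up"))
    while token = ts.next
      puts token.text
    end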
@@ -1021,20 +1042,18 @@ Analyzer *
 frt_get_cwrapped_analyzer(VALUE ranalyzer)
 {
    Analyzer *a = NULL;
-    switch (TYPE(ranalyzer)) {
-        case T_DATA:
-            Data_Get_Struct(ranalyzer, Analyzer, a);
-            REF(a);
-            break;
-        default:
-            a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
-            a->destroy_i = &cwa_destroy_i;
-            a->get_ts = &cwa_get_ts;
-            a->ref_cnt = 1;
-            ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
-            break;
+    if (rb_ivar_get(CLASS_OF(ranalyzer), id_cclass) == Qtrue) {
+        Data_Get_Struct(ranalyzer, Analyzer, a);
+        REF(a);
+    }
+    else {
+        a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+        a->destroy_i = &cwa_destroy_i;
+        a->get_ts = &cwa_get_ts;
+        a->ref_cnt = 1;
+        ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
    }
    return a;
 }
@@ -1350,11 +1369,14 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
-    REF(ts);
    /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
    object_add(ts, rets);
 
-    if (lower != Qfalse)
+    if (lower != Qfalse) {
+        rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
+        ts = DATA_PTR(rets);
+    }
+    REF(ts);
 
    a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
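The reordered REF(ts) matters on the lowercasing path: the analyzer now takes its reference on the stream it actually keeps, the LowerCaseFilter wrapper, rather than on the bare tokenizer. A sketch of the construction that exercises this path; the regexp and input are arbitrary:

    require 'ferret'
    include Ferret::Analysis

    # lower = true wraps the RegExpTokenizer in a LowerCaseFilter, so the
    # analyzer's token streams come out lowercased.
    analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/, true)
    ts = analyzer.token_stream(:content, "One-Two THREE")
    while token = ts.next
      puts token.text   # expect "one", "two", "three"
    end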
@@ -1461,7 +1483,7 @@ static void Init_Token(void)
  */
 static void Init_TokenStream(void)
 {
-    cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+    cTokenStream = frt_define_class_under(mAnalysis, "TokenStream", rb_cObject);
    rb_define_method(cTokenStream, "next", frt_ts_next, 0);
    rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
    rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
@@ -1482,7 +1504,7 @@ static void Init_TokenStream(void)
 static void Init_AsciiLetterTokenizer(void)
 {
    cAsciiLetterTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
    rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
    rb_define_method(cAsciiLetterTokenizer, "initialize",
                     frt_a_letter_tokenizer_init, 1);
@@ -1504,7 +1526,7 @@ static void Init_AsciiLetterTokenizer(void)
 static void Init_LetterTokenizer(void)
 {
    cLetterTokenizer =
-        rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
    rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
    rb_define_method(cLetterTokenizer, "initialize",
                     frt_letter_tokenizer_init, -1);
@@ -1524,7 +1546,7 @@ static void Init_LetterTokenizer(void)
 static void Init_AsciiWhiteSpaceTokenizer(void)
 {
    cAsciiWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
                               cTokenStream);
    rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
    rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
@@ -1545,7 +1567,7 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
 static void Init_WhiteSpaceTokenizer(void)
 {
    cWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
    rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
    rb_define_method(cWhiteSpaceTokenizer, "initialize",
                     frt_whitespace_tokenizer_init, -1);
@@ -1566,7 +1588,7 @@ static void Init_WhiteSpaceTokenizer(void)
 static void Init_AsciiStandardTokenizer(void)
 {
    cAsciiStandardTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
    rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
    rb_define_method(cAsciiStandardTokenizer, "initialize",
                     frt_a_standard_tokenizer_init, 1);
@@ -1587,7 +1609,7 @@ static void Init_AsciiStandardTokenizer(void)
 static void Init_StandardTokenizer(void)
 {
    cStandardTokenizer =
-        rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
    rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
    rb_define_method(cStandardTokenizer, "initialize",
                     frt_standard_tokenizer_init, 1);
@@ -1614,7 +1636,7 @@ static void Init_StandardTokenizer(void)
 static void Init_RegExpTokenizer(void)
 {
    cRegExpTokenizer =
-        rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
    rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
    rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
    rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
@@ -1642,7 +1664,7 @@ static void Init_RegExpTokenizer(void)
 static void Init_AsciiLowerCaseFilter(void)
 {
    cAsciiLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
    rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
    rb_define_method(cAsciiLowerCaseFilter, "initialize",
                     frt_a_lowercase_filter_init, 1);
@@ -1662,12 +1684,33 @@ static void Init_AsciiLowerCaseFilter(void)
 static void Init_LowerCaseFilter(void)
 {
    cLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
    rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
    rb_define_method(cLowerCaseFilter, "initialize",
                     frt_lowercase_filter_init, 1);
 }
 
+/*
+ *  Document-class: Ferret::Analysis::HyphenFilter
+ *
+ *  HyphenFilter filters hyphenated words by adding both the word concatenated
+ *  into a single word and split into multiple words, i.e. "e-mail" becomes
+ *  "email" and "e mail". This way a search for "e-mail", "email" and "mail"
+ *  will all match. This filter is used by default by the StandardAnalyzer.
+ *
+ *  === Example
+ *
+ *    ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+static void Init_HyphenFilter(void)
+{
+    cHyphenFilter =
+        frt_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+    rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
+    rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
+}
+
 /*
  * Document-class: Ferret::Analysis::StopFilter
  *
@@ -1682,7 +1725,7 @@ static void Init_LowerCaseFilter(void)
 static void Init_StopFilter(void)
 {
    cStopFilter =
-        rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StopFilter", cTokenStream);
    rb_define_alloc_func(cStopFilter, frt_data_alloc);
    rb_define_method(cStopFilter, "initialize",
                     frt_stop_filter_init, -1);
@@ -1741,7 +1784,7 @@ static void Init_StopFilter(void)
 static void Init_StemFilter(void)
 {
    cStemFilter =
-        rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StemFilter", cTokenStream);
    rb_define_alloc_func(cStemFilter, frt_data_alloc);
    rb_define_method(cStemFilter, "initialize",
                     frt_stem_filter_init, -1);
@@ -1784,7 +1827,7 @@ static void Init_StemFilter(void)
 static void Init_Analyzer(void)
 {
    cAnalyzer =
-        rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+        frt_define_class_under(mAnalysis, "Analyzer", rb_cObject);
    rb_define_alloc_func(cAnalyzer, frt_data_alloc);
    rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
    rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
@@ -1821,7 +1864,7 @@ static void Init_Analyzer(void)
 static void Init_AsciiLetterAnalyzer(void)
 {
    cAsciiLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiLetterAnalyzer, "initialize",
                     frt_a_letter_analyzer_init, -1);
@@ -1851,7 +1894,7 @@ static void Init_AsciiLetterAnalyzer(void)
 static void Init_LetterAnalyzer(void)
 {
    cLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
    rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
    rb_define_method(cLetterAnalyzer, "initialize",
                     frt_letter_analyzer_init, -1);
@@ -1887,7 +1930,7 @@ static void Init_LetterAnalyzer(void)
 static void Init_AsciiWhiteSpaceAnalyzer(void)
 {
    cAsciiWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
                     frt_a_white_space_analyzer_init, -1);
@@ -1917,7 +1960,7 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
 static void Init_WhiteSpaceAnalyzer(void)
 {
    cWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
    rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
    rb_define_method(cWhiteSpaceAnalyzer, "initialize",
                     frt_white_space_analyzer_init, -1);
@@ -1955,7 +1998,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 static void Init_AsciiStandardAnalyzer(void)
 {
    cAsciiStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiStandardAnalyzer, "initialize",
                     frt_a_standard_analyzer_init, -1);
@@ -1986,7 +2029,7 @@ static void Init_AsciiStandardAnalyzer(void)
 static void Init_StandardAnalyzer(void)
 {
    cStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
    rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
    rb_define_method(cStandardAnalyzer, "initialize",
                     frt_standard_analyzer_init, -1);
@@ -2015,7 +2058,7 @@ static void Init_StandardAnalyzer(void)
 static void Init_PerFieldAnalyzer(void)
 {
    cPerFieldAnalyzer =
-        rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
    rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
    rb_define_method(cPerFieldAnalyzer, "initialize",
                     frt_per_field_analyzer_init, 1);
@@ -2055,7 +2098,7 @@ static void Init_PerFieldAnalyzer(void)
 static void Init_RegExpAnalyzer(void)
 {
    cRegExpAnalyzer =
-        rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
    rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
    rb_define_method(cRegExpAnalyzer, "initialize",
                     frt_re_analyzer_init, -1);
@@ -2171,6 +2214,7 @@ Init_Analysis(void)
 
    Init_AsciiLowerCaseFilter();
    Init_LowerCaseFilter();
+    Init_HyphenFilter();
    Init_StopFilter();
    Init_StemFilter();
 
data/ext/r_index.c
CHANGED
@@ -1875,7 +1875,6 @@ frt_ir_init(VALUE self, VALUE rdir)
    VALUE rfield_num_map = rb_hash_new();
 
    if (TYPE(rdir) == T_ARRAY) {
-        VALUE rreader;
        VALUE rdirs = rdir;
        const int reader_cnt = RARRAY(rdir)->len;
        IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
data/ext/r_search.c
CHANGED
@@ -2181,7 +2181,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
 *     searcher.search(query, options = {}) -> TopDocs
 *
 *  Run a query through the Searcher on the index. A TopDocs object is
- *  returned with the relevant results. The +query+ is
+ *  returned with the relevant results. The +query+ is a built in Query
 *  object. Here are the options;
 *
 *  === Options
data/ext/search.c
CHANGED
@@ -741,19 +741,23 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
    for (i = e->start; i <= e->end; i++) {
        MatchRange *mr = mv->matches + i;
        len = mr->start_offset - last_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
        e_ptr += len;
        memcpy(e_ptr, pre_tag, pre_tag_len);
        e_ptr += pre_tag_len;
        len = mr->end_offset - mr->start_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
        e_ptr += len;
        memcpy(e_ptr, post_tag, post_tag_len);
        e_ptr += post_tag_len;
        last_offset = mr->end_offset;
    }
+    if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
+        /* no point using ellipsis if it takes up more space */
+        e->end_offset = lazy_df->len;
+    }
    len = e->end_offset - last_offset;
-    lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+    if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
    e_ptr += len;
    if (e->end_offset < lazy_df->len) {
        memcpy(e_ptr, ellipsis, ellipsis_len);
data/ext/term_vectors.c
CHANGED
data/lib/ferret/index.rb
CHANGED
@@ -146,25 +146,25 @@ module Ferret::Index
    #
    # === Options
    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
+    # field::          Default: @options[:default_field]. The default_field
+    #                  is the field that is usually highlighted but you can
+    #                  specify which field you want to highlight here. If
+    #                  you want to highlight multiple fields then you will
+    #                  need to call this method multiple times.
+    # excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
+    #                  terms will be in the centre of the excerpt.
+    # num_excerpts::   Default: 2. Number of excerpts to return.
+    # pre_tag::        Default: "<b>". Tag to place to the left of the
+    #                  match. You'll probably want to change this to a
+    #                  "<span>" tag with a class "\033[7m" for use in a
+    #                  terminal.
+    # post_tag::       Default: "</b>". This tag should close the
+    #                  +:pre_tag+. Try tag "\033[m" in the terminal.
+    # ellipsis::       Default: "...". This is the string that is appended
+    #                  at the beginning and end of excerpts (unless the
+    #                  excerpt hits the start or end of the field). You'll
+    #                  probably want to change this to a Unicode ellipsis
+    #                  character.
    def highlight(query, doc_id, options = {})
      ensure_searcher_open()
      @searcher.highlight(process_query(query),
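A sketch of these options together; the index contents and query are made up, and highlight is assumed to return an array of excerpt strings:

    require 'ferret'

    index = Ferret::Index::Index.new(:default_field => :content)
    index << {:content => "Send me an e-mail about the set-up meeting."}

    # Reverse-video match markers and a short excerpt, for terminal output.
    index.highlight("email", 0,
                    :excerpt_length => 30,
                    :num_excerpts   => 1,
                    :pre_tag        => "\033[7m",
                    :post_tag       => "\033[m").each do |excerpt|
      puts excerpt
    end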
@@ -270,33 +270,79 @@ module Ferret::Index
    end
    alias :<< :add_document
 
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a built in Query
+    # object or a query string that can be parsed by the Ferret::QueryParser.
+    # Here are the options;
+    #
+    # === Options
+    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
    def search(query, options = {})
      @dir.synchronize do
        return do_search(query, options)
      end
    end
 
-    #
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a Query object or a
+    # query string that can be validly parsed by the Ferret::QueryParser. The
+    # Searcher#search_each method yields the internal document id (used to
+    # reference documents in the Searcher object like this;
+    # +searcher[doc_id]+) and the search score for that document. It is
+    # possible for the score to be greater than 1.0 for some queries, taking
+    # boosts into account. This method will also normalize scores to
+    # the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
+    # options;
+    #
+    # === Options
    #
-    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
+    #
+    # returns:: The total number of hits.
+    #
+    # === Example
    # eg.
-    #   index.search_each() do |doc, score|
+    #   index.search_each(query, options = {}) do |doc, score|
    #     puts "hit document number #{doc} with a score of #{score}"
    #   end
    #
-    # returns:: The total number of hits.
    def search_each(query, options = {}) # :yield: doc, score
      @dir.synchronize do
        ensure_searcher_open()
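A sketch of search_each with paging and sorting; the documents, fields and query are illustrative:

    require 'ferret'

    index = Ferret::Index::Index.new
    index << {:title => "ferret", :rating => "5"}
    index << {:title => "ferret tutorial", :rating => "4"}

    # First page of up to 10 hits, best rating first.
    total = index.search_each("title:ferret",
                              :offset => 0,
                              :limit  => 10,
                              :sort   => "rating DESC, title") do |doc_id, score|
      puts "hit document number #{doc_id} with a score of #{score}"
    end
    puts "#{total} hits"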
@@ -571,6 +617,19 @@ module Ferret::Index
      end
    end
 
+    # Turn a query string into a Query object with the Index's QueryParser
+    def process_query(query)
+      if query.is_a?(String)
+        if @qp.nil?
+          @qp = Ferret::QueryParser.new(@options)
+        end
+        # we need to set this every time, in case a new field has been added
+        @qp.fields = @reader.field_names
+        query = @qp.parse(query)
+      end
+      return query
+    end
+
    protected
      def ensure_writer_open()
        raise "tried to use a closed index" if not @open
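Because search and search_each both pass their argument through process_query, a query string and a Query object are interchangeable. A small sketch; TermQuery and total_hits are the standard 0.10 search API, assumed here:

    require 'ferret'

    index = Ferret::Index::Index.new
    index << {:title => "the pickaxe book"}

    # String form: parsed on demand by the QueryParser that process_query
    # lazily builds, and whose field list it refreshes on every call.
    puts index.search("title:pickaxe").total_hits

    # Equivalent Query object: bypasses the QueryParser entirely.
    query = Ferret::Search::TermQuery.new(:title, "pickaxe")
    puts index.search(query).total_hits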
@@ -623,19 +682,6 @@ module Ferret::Index
 
        return @searcher.search(query, options)
      end
-
-      def process_query(query)
-        if query.is_a?(String)
-          if @qp.nil?
-            @qp = Ferret::QueryParser.new(@options)
-          end
-          # we need to set this every time, in case a new field has been added
-          @qp.fields = @reader.field_names
-          query = @qp.parse(query)
-        end
-        return query
-      end
-
  end
 end
 