RubyGems - ferret - Versions diffs - 0.10.4 → 0.10.5 - Mend

ferret 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

data/Rakefile +1 -1
data/ext/analysis.c +7 -1
data/ext/bitvector.c +5 -2
data/ext/bitvector.h +1 -0
data/ext/ferret.c +55 -8
data/ext/ferret.h +8 -2
data/ext/index.c +34 -43
data/ext/index.h +1 -1
data/ext/q_boolean.c +1 -1
data/ext/q_multi_term.c +13 -1
data/ext/q_parser.c +33 -18
data/ext/r_analysis.c +68 -45
data/ext/r_index.c +64 -10
data/ext/r_search.c +145 -10
data/ext/search.c +71 -12
data/lib/ferret/index.rb +42 -28
data/lib/ferret_version.rb +1 -1
data/test/unit/analysis/tc_analyzer.rb +1 -1
data/test/unit/analysis/tc_token_stream.rb +0 -1
data/test/unit/index/tc_index.rb +3 -3
data/test/unit/index/tc_index_reader.rb +5 -0
data/test/unit/search/tc_filter.rb +15 -0
data/test/unit/search/tm_searcher.rb +13 -2
metadata +2 -2

data/ext/r_search.c CHANGED

@@ -36,12 +36,13 @@ static VALUE cSpanOrQuery;
 static VALUE cSpanNotQuery;
 /* Filters */
+static ID id_bits;
 static VALUE cFilter;
 static VALUE cRangeFilter;
 static VALUE cQueryFilter;
 /* MultiTermQuery */
-static VALUE id_default_max_terms;
+static ID id_default_max_terms;
 static VALUE sym_max_terms;
 static VALUE sym_min_score;
@@ -72,8 +73,8 @@ static VALUE sym_in_order;
 static VALUE sym_clauses;
 /* Class variable ids */
-static VALUE id_default_min_similarity;
-static VALUE id_default_prefix_length;
+static ID id_default_min_similarity;
+static ID id_default_prefix_length;
 /** Sort **/
@@ -93,6 +94,15 @@ static VALUE sym_type;
 static VALUE sym_reverse;
 static VALUE sym_comparator;
+/* Hits */
+static ID id_doc;
+static ID id_score;
+/* TopDocs */
+static ID id_hits;
+static ID id_total_hits;
+static ID id_max_score;
 /* Search */
 static VALUE sym_offset;
 static VALUE sym_limit;
@@ -113,7 +123,6 @@ extern void frt_ir_mark(void *p);
 extern void frt_set_term(VALUE rterm, Term *t);
-extern Term *frt_get_term(VALUE rterm);
 extern VALUE frt_get_analyzer(Analyzer *a);
 extern HashSet *frt_get_fields(VALUE rfields);
 extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
@@ -161,6 +170,35 @@ frt_get_td(TopDocs *td)
     return rtop_docs;
 }
+static VALUE
+frt_td_to_s(VALUE self)
+{
+    int i;
+    VALUE rhits = rb_funcall(self, id_hits, 0);
+    const int len = RARRAY(rhits)->len;
+    char *str = ALLOC_N(char, len * 64 + 100);
+    char *s = str;
+    VALUE rstr;
+    sprintf(s, "TopDocs: totalhits = %d, max_score = %f [\n",
+            FIX2INT(rb_funcall(self, id_total_hits, 0)),
+            NUM2DBL(rb_funcall(self, id_max_score, 0)));
+    s += strlen(s);
+    for (i = 0; i < len; i++) {
+        VALUE rhit = RARRAY(rhits)->ptr[i];
+        sprintf(s, "\t%d: %f\n",
+                FIX2INT(rb_funcall(rhit, id_doc, 0)),
+                NUM2DBL(rb_funcall(rhit, id_score, 0)));
+        s += strlen(s);
+    }
+    sprintf(s, "]\n");
+    rstr = rb_str_new2(str);
+    free(str);
+    return rstr;
+}
 /****************************************************************************
  *
  * Explanation Methods
@@ -319,6 +357,34 @@ frt_q_eql(VALUE self, VALUE other)
     return q->eq(q, oq) ? Qtrue : Qfalse;
 }
+/*
+ *  call-seq:
+ *     query.terms(searcher) -> term_array
+ *
+ *  Returns an array of terms searched for by this query. This can be used for
+ *  implementing an external query highlighter for example. You must supply a
+ *  searcher so that the query can be rewritten and optimized like it would be
+ *  in a real search.
+ */
+static VALUE
+frt_q_get_terms(VALUE self, VALUE searcher)
+{
+    int i;
+    VALUE rterms = rb_ary_new();
+    HashSet *terms = term_set_new();
+    GET_Q();
+    Searcher *sea = (Searcher *)DATA_PTR(searcher);
+    Query *rq = sea->rewrite(sea, q);
+    rq->extract_terms(rq, terms);
+    q_deref(rq);
+    for (i = 0; i < terms->size; i++) {
+        Term *term = (Term *)terms->elems[i];
+        rb_ary_push(rterms, frt_get_term(term->field, term->text));
+    }
+    hs_destroy(terms);
+    return rterms;
+}
 #define MK_QUERY(klass, q) Data_Wrap_Struct(klass, NULL, &frt_q_free, q)
 VALUE
 frt_get_q(Query *q)
@@ -2130,6 +2196,53 @@ call_filter_proc(int doc_id, float score, Searcher *self)
                             object_get(self)));
 }
+typedef struct CWrappedFilter
+{
+    Filter super;
+    VALUE  rfilter;
+} CWrappedFilter;
+#define CWF(filt) ((CWrappedFilter *)(filt))
+static ulong
+cwfilt_hash(Filter *filt)
+{
+    return NUM2ULONG(rb_funcall(CWF(filt)->rfilter, id_hash, 0));
+}
+static int
+cwfilt_eq(Filter *filt, Filter *o)
+{
+    return RTEST(rb_funcall(CWF(filt)->rfilter, id_eql, 1, CWF(o)->rfilter));
+}
+static BitVector *
+cwfilt_get_bv_i(Filter *filt, IndexReader *ir)
+{
+    VALUE rbv = rb_funcall(CWF(filt)->rfilter, id_bits, 1, object_get(ir));
+    BitVector *bv;
+    Data_Get_Struct(rbv, BitVector, bv);
+    REF(bv);
+    return bv;
+}
+Filter *
+frt_get_cwrapped_filter(VALUE rval)
+{
+    Filter *filter;
+    if (frt_is_cclass(rval) && DATA_PTR(rval)) {
+        Data_Get_Struct(rval, Filter, filter);
+        REF(filter);
+    }
+    else {
+        filter = filt_create(sizeof(CWrappedFilter), "CWrappedFilter");
+        filter->hash     = &cwfilt_hash;
+        filter->eq       = &cwfilt_eq;
+        filter->get_bv_i = &cwfilt_get_bv_i;
+        CWF(filter)->rfilter = rval;
+    }
+    return filter;
+}
 static TopDocs *
 frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
 {
@@ -2137,6 +2250,8 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
     int offset = 0, limit = 10;
     Filter *filter = NULL;
     Sort *sort = NULL;
+    TopDocs *td;
     filter_ft filter_func = NULL;
     if (Qnil != roptions) {
@@ -2159,7 +2274,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
             }
         }
         if (Qnil != (rval = rb_hash_aref(roptions, sym_filter))) {
-            Data_Get_Struct(rval, Filter, filter);
+            filter = frt_get_cwrapped_filter(rval);
         }
         if (Qnil != (rval = rb_hash_aref(roptions, sym_filter_proc))) {
             filter_func = &call_filter_proc;
@@ -2173,7 +2288,9 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
         }
     }
-    return sea->search(sea, query, offset, limit, filter, sort, filter_func, 0);
+    td = sea->search(sea, query, offset, limit, filter, sort, filter_func, 0);
+    if (filter) filt_deref(filter);
+    return td;
 }
 /*
@@ -2317,7 +2434,8 @@ frt_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
  *  === Options
  *
  *  :excerpt_length::   Default: 150. Length of excerpt to show. Highlighted
- *                      terms will be in the centre of the excerpt.
+ *                      terms will be in the centre of the excerpt. Set to
+ *                      :all to highlight the entire field.
  *  :num_excerpts::     Default: 2. Number of excerpts to return.
  *  :pre_tag::          Default: "<b>". Tag to place to the left of the match.
  *                      You'll probably want to change this to a "<span>" tag
@@ -2344,12 +2462,18 @@ frt_sea_highlight(int argc, VALUE *argv, VALUE self)
     rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
     Data_Get_Struct(rquery, Query, query);
-    if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
-        excerpt_length = FIX2INT(v);
-    }
     if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
         num_excerpts =  FIX2INT(v);
     }
+    if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
+        if (v == sym_all) {
+            num_excerpts = 1;
+            excerpt_length = INT_MAX/2;
+        }
+        else {
+            excerpt_length = FIX2INT(v);
+        }
+    }
     if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
         pre_tag = RSTRING(rb_obj_as_string(v))->ptr;
     }
@@ -2539,6 +2663,8 @@ Init_Hit(void)
     cHit = rb_struct_define(hit_class, "doc", "score", NULL);
     rb_set_class_path(cHit, mSearch, hit_class);
     rb_const_set(mSearch, rb_intern(hit_class), cHit);
+    id_doc = rb_intern("doc");
+    id_score = rb_intern("score");
 }
 /*
@@ -2570,6 +2696,10 @@ Init_TopDocs(void)
                                 NULL);
     rb_set_class_path(cTopDocs, mSearch, td_class);
     rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
+    rb_define_method(cTopDocs, "to_s", frt_td_to_s, 0);
+    id_hits = rb_intern("hits");
+    id_total_hits = rb_intern("total_hits");
+    id_max_score = rb_intern("max_score");
 }
 /*
@@ -2646,6 +2776,7 @@ Init_Query(void)
     rb_define_method(cQuery, "eql?", frt_q_eql, 1);
     rb_define_method(cQuery, "==", frt_q_eql, 1);
     rb_define_method(cQuery, "hash", frt_q_hash, 0);
+    rb_define_method(cQuery, "terms", frt_q_get_terms, 1);
 }
 /*
@@ -3326,6 +3457,7 @@ static void
 Init_RangeFilter(void)
 {
     cRangeFilter = rb_define_class_under(mSearch, "RangeFilter", cFilter);
+    frt_mark_cclass(cRangeFilter);
     rb_define_alloc_func(cRangeFilter, frt_data_alloc);
     rb_define_method(cRangeFilter, "initialize", frt_rf_init, 2);
@@ -3360,6 +3492,7 @@ static void
 Init_QueryFilter(void)
 {
     cQueryFilter = rb_define_class_under(mSearch, "QueryFilter", cFilter);
+    frt_mark_cclass(cQueryFilter);
     rb_define_alloc_func(cQueryFilter, frt_data_alloc);
     rb_define_method(cQueryFilter, "initialize", frt_qf_init, 1);
@@ -3383,7 +3516,9 @@ Init_QueryFilter(void)
 static void
 Init_Filter(void)
 {
+    id_bits = rb_intern("bits");
     cFilter = rb_define_class_under(mSearch, "Filter", rb_cObject);
+    frt_mark_cclass(cFilter);
     rb_define_alloc_func(cConstantScoreQuery, frt_data_alloc);
     rb_define_method(cFilter, "to_s", frt_f_to_s, 0);

data/ext/search.c CHANGED

@@ -741,13 +741,17 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
     for (i = e->start; i <= e->end; i++) {
         MatchRange *mr = mv->matches + i;
         len = mr->start_offset - last_offset;
-        if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
-        e_ptr += len;
+        if (len) {
+            lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+            e_ptr += len;
+        }
         memcpy(e_ptr, pre_tag, pre_tag_len);
         e_ptr += pre_tag_len;
         len = mr->end_offset - mr->start_offset;
-        if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
-        e_ptr += len;
+        if (len) {
+            lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+            e_ptr += len;
+        }
         memcpy(e_ptr, post_tag, post_tag_len);
         e_ptr += post_tag_len;
         last_offset = mr->end_offset;
@@ -757,8 +761,10 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
         e->end_offset = lazy_df->len;
     }
     len = e->end_offset - last_offset;
-    if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
-    e_ptr += len;
+    if (len) {
+        lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+        e_ptr += len;
+    }
     if (e->end_offset < lazy_df->len) {
         memcpy(e_ptr, ellipsis, ellipsis_len);
         e_ptr += ellipsis_len;
@@ -767,6 +773,54 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
     return excerpt_str;
 }
+static char *highlight_field(MatchVector *mv,
+                             LazyDocField *lazy_df,
+                             TermVector *tv,
+                             const char *pre_tag,
+                             const char *post_tag)
+{
+    const int pre_len = (int)strlen(pre_tag);
+    const int post_len = (int)strlen(post_tag);
+    char *excerpt_str =
+        ALLOC_N(char, 10 + lazy_df->len + (mv->size * (pre_len + post_len)));
+    if (mv->size > 0) {
+        int last_offset = 0;
+        int i, len;
+        char *e_ptr = excerpt_str;
+        matchv_compact_with_breaks(mv);
+        matchv_set_offsets(mv, tv->offsets);
+        for (i = 0; i < mv->size; i++) {
+            MatchRange *mr = mv->matches + i;
+            len = mr->start_offset - last_offset;
+            if (len) {
+                lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+                e_ptr += len;
+            }
+            memcpy(e_ptr, pre_tag, pre_len);
+            e_ptr += pre_len;
+            len = mr->end_offset - mr->start_offset;
+            if (len) {
+                lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+                e_ptr += len;
+            }
+            memcpy(e_ptr, post_tag, post_len);
+            e_ptr += post_len;
+            last_offset = mr->end_offset;
+        }
+        len = lazy_df->len - last_offset;
+        if (len) {
+            lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+            e_ptr += len;
+        }
+        *e_ptr = '\0';
+    }
+    else {
+        lazy_df_get_bytes(lazy_df, excerpt_str, 0, lazy_df->len);
+        excerpt_str[lazy_df->len] = '\0';
+    }
+    return excerpt_str;
+}
 char **searcher_highlight(Searcher *self,
                           Query *query,
                           const int doc_num,
@@ -789,7 +843,12 @@ char **searcher_highlight(Searcher *self,
         MatchVector *mv;
         query = self->rewrite(self, query);
         mv = query->get_matchv_i(query, matchv_new(), tv);
-        if (mv->size > 0) {
+        if (lazy_df->len < (excerpt_len * num_excerpts)) {
+            excerpt_strs = ary_new_type_capa(char *, 1);
+            ary_push(excerpt_strs,
+                     highlight_field(mv, lazy_df, tv, pre_tag, post_tag));
+        }
+        else if (mv->size > 0) {
             Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
             int e_start, e_end, i, j;
             MatchRange *matches = mv->matches;
@@ -802,12 +861,12 @@ char **searcher_highlight(Searcher *self,
             excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
             /* add all possible excerpts to the priority queue */
-            for (e_start = 0, e_end = 1; e_start < mv->size; e_start++) {
+            for (e_start = e_end = 0; e_start < mv->size; e_start++) {
                 const int start_offset = matches[e_start].start_offset;
-                if (e_start >= e_end) {
-                    e_end = e_start + 1;
+                if (e_start > e_end) {
+                    running_score = 0.0;
+                    e_end = e_start;
                 }
-                running_score += matches[e_start].score;
                 while (e_end < mv->size && (matches[e_end].end_offset
                                              <= start_offset + excerpt_len)) {
                     running_score += matches[e_end].score;
@@ -883,8 +942,8 @@ char **searcher_highlight(Searcher *self,
             }
             free(excerpts);
             pq_destroy(excerpt_pq);
-            matchv_destroy(mv);
         }
+        matchv_destroy(mv);
         q_deref(query);
     }
     if (tv) tv_destroy(tv);

data/lib/ferret/index.rb CHANGED

@@ -152,22 +152,23 @@ module Ferret::Index
     #                    you want to highlight multiple fields then you will
     #                    need to call this method multiple times.
     # excerpt_length::   Default: 150. Length of excerpt to show. Highlighted
-    #                    terms will be in the centre of the excerpt.
+    #                    terms will be in the centre of the excerpt. Set to
+    #                    :all to highlight the entire field.
     # num_excerpts::     Default: 2. Number of excerpts to return.
     # pre_tag::          Default: "<b>". Tag to place to the left of the
     #                    match.  You'll probably want to change this to a
-    #                    "<span>" tag with a class "\033[7m" for use in a
+    #                    "<span>" tag with a class "\033[36m" for use in a
     #                    terminal.
     # post_tag::         Default: "</b>". This tag should close the
     #                    +:pre_tag+.  Try tag "\033[m" in the terminal.
     # ellipsis::         Default: "...". This is the string that is appended
     #                    at the beginning and end of excerpts (unless the
-    #                    excerpt hits the start or end of the field. You'll
-    #                    probably want to change this so a Unicode elipsis
-    #                    character.
+    #                    excerpt hits the start or end of the field.
+    #                    Alternatively you may want to use the HTML entity
+    #                    &#8230; or the UTF-8 string "\342\200\246".
     def highlight(query, doc_id, options = {})
       ensure_searcher_open()
-      @searcher.highlight(process_query(query),
+      @searcher.highlight(do_process_query(query),
                           doc_id,
                           options[:field]||@options[:default_field],
                           options)
@@ -346,7 +347,7 @@ module Ferret::Index
     def search_each(query, options = {}) # :yield: doc, score
       @dir.synchronize do
         ensure_searcher_open()
-        query = process_query(query)
+        query = do_process_query(query)
         @searcher.search_each(query, options) do |doc, score|
           yield doc, score
@@ -359,20 +360,16 @@ module Ferret::Index
     #
     # id:: The number of the document to retrieve, or the term used as the :id
     #      for the document we wish to retrieve
-    def doc(id)
+    def doc(*args)
       @dir.synchronize do
         ensure_reader_open()
+        id = args[0]
         if id.kind_of?(String) or id.kind_of?(Symbol)
           term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
-          id = term_doc_enum.next? ? term_doc_enum.doc : nil
-        end
-        return @reader[id] if id.is_a? Integer
-        if id
-          raise(ArgumentError, "key to Index to access a document must be " +
-                "an Integer or a String")
+          return term_doc_enum.next? ? @reader[term_doc_enum.doc] : nil
         end
+        return @reader[*args]
       end
-      return nil
     end
     alias :[] :doc
@@ -405,7 +402,7 @@ module Ferret::Index
     def query_delete(query)
       @dir.synchronize do
         ensure_searcher_open()
-        query = process_query(query)
+        query = do_process_query(query)
         @searcher.search_each(query) do |doc, score|
           @reader.delete(doc)
         end
@@ -470,7 +467,7 @@ module Ferret::Index
       @dir.synchronize do
         ensure_searcher_open()
         docs_to_add = []
-        query = process_query(query)
+        query = do_process_query(query)
         @searcher.search_each(query) do |id, score|
           document = @searcher[id].load
           if new_val.is_a?(Hash)
@@ -609,9 +606,9 @@ module Ferret::Index
     # Computing an explanation is as expensive as executing the query over the
     # entire index.
     def explain(query, doc)
-      synchronize do
+      @dir.synchronize do
         ensure_searcher_open()
-        query = process_query(query)
+        query = do_process_query(query)
         return @searcher.explain(query, doc)
       end
@@ -619,17 +616,22 @@ module Ferret::Index
     # Turn a query string into a Query object with the Index's QueryParser
     def process_query(query)
-      if query.is_a?(String)
-        if @qp.nil?
-          @qp = Ferret::QueryParser.new(@options)
-        end
-        # we need to set this ever time, in case a new field has been added
-        @qp.fields = @reader.field_names
-        query = @qp.parse(query)
+      @dir.synchronize do
+        ensure_searcher_open()
+        return do_process_query(query)
       end
-      return query
     end
+    # Returns the field_infos object so that you can add new fields to the
+    # index.
+    def field_infos
+      @dir.synchronize do
+        ensure_writer_open()
+        return @writer.field_infos
+      end
+    end
     protected
       def ensure_writer_open()
         raise "tried to use a closed index" if not @open
@@ -676,9 +678,21 @@ module Ferret::Index
       end
     private
+      def do_process_query(query)
+        if query.is_a?(String)
+          if @qp.nil?
+            @qp = Ferret::QueryParser.new(@options)
+          end
+          # we need to set this ever time, in case a new field has been added
+          @qp.fields = @reader.field_names
+          query = @qp.parse(query)
+        end
+        return query
+      end
       def do_search(query, options)
         ensure_searcher_open()
-        query = process_query(query)
+        query = do_process_query(query)
         return @searcher.search(query, options)
       end