RubyGems - ferret - Versions diffs - 0.10.1 → 0.10.2 - Mend

ferret 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

data/Rakefile +7 -1
data/ext/analysis.c +21 -13
data/ext/array.c +1 -1
data/ext/bitvector.c +2 -2
data/ext/defines.h +0 -6
data/ext/except.c +6 -6
data/ext/except.h +12 -8
data/ext/extconf.rb +1 -0
data/ext/ferret.c +4 -0
data/ext/ferret.h +1 -0
data/ext/fs_store.c +18 -4
data/ext/global.c +18 -16
data/ext/global.h +7 -2
data/ext/hash.c +1 -1
data/ext/helper.c +1 -1
data/ext/helper.h +1 -1
data/ext/inc/lang.h +7 -1
data/ext/ind.c +4 -4
data/ext/ind.h +3 -3
data/ext/index.c +33 -26
data/ext/index.h +1 -1
data/ext/lang.h +7 -1
data/ext/mem_pool.c +1 -1
data/ext/mem_pool.h +1 -1
data/ext/q_fuzzy.c +2 -2
data/ext/q_match_all.c +2 -2
data/ext/q_multi_term.c +1 -1
data/ext/q_parser.c +60 -52
data/ext/r_analysis.c +6 -4
data/ext/r_index.c +57 -4
data/ext/r_search.c +1 -1
data/ext/r_utils.c +1 -1
data/ext/ram_store.c +1 -1
data/ext/search.c +4 -4
data/ext/sort.c +3 -3
data/ext/store.c +9 -9
data/ext/store.h +4 -4
data/ext/tags +7841 -0
data/ext/term_vectors.c +3 -3
data/lib/ferret/index.rb +69 -7
data/test/test_helper.rb +3 -2
data/test/unit/analysis/tc_token_stream.rb +1 -0
data/test/unit/index/tc_index.rb +157 -2
data/test/unit/index/tc_index_reader.rb +108 -5
data/test/unit/query_parser/tc_query_parser.rb +2 -1
data/test/unit/search/tc_index_searcher.rb +1 -1
data/test/unit/search/tc_multi_searcher.rb +2 -1
data/test/unit/search/tc_spans.rb +1 -1
data/test/unit/store/tc_fs_store.rb +6 -3
data/test/unit/ts_analysis.rb +1 -1
data/test/unit/ts_utils.rb +1 -1
data/test/unit/utils/tc_number_tools.rb +1 -1
metadata +138 -137

data/ext/term_vectors.c CHANGED Viewed

@@ -250,10 +250,10 @@ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
     tvw->fis = fis;
     tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
-    sprintf(file_name, "%s.tvx", segment);
+    snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvx", segment);
     tvw->tvx_out = store->new_output(store, file_name);
-    sprintf(file_name, "%s.tvd", segment);
+    snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvd", segment);
     tvw->tvd_out = store->new_output(store, file_name);
     return tvw;
@@ -335,7 +335,7 @@ void tvw_add_postings(TermVectorsWriter *tvw,
     if (fi_store_offsets(fi)) {
         /* use delta encoding for offsets */
-        int last_end = 0;
+        int last_end = 0;
         os_write_vint(tvd_out, offset_count);  /* write shared prefix length */
         for (i = 0; i < offset_count; i++) {
             int start = offsets[i].start;

data/lib/ferret/index.rb CHANGED Viewed

@@ -10,6 +10,7 @@ module Ferret::Index
     include Ferret::Search
     attr_reader :options
     # If you create an Index without any options, it'll simply create an index
     # in memory. But this class is highly configurable and every option that
     # you can supply to IndexWriter and QueryParser, you can also set here.
@@ -52,6 +53,10 @@ module Ferret::Index
     #                         concerned about performance. In that case you
     #                         should think about setting up a DRb indexing
     #                         service.
+    # lock_retry_time::       Default: 2 seconds. This parameter specifies how
+    #                         long to wait before retrying to obtain the
+    #                         commit lock when detecting if the IndexReader is
+    #                         at the latest version.
     #
     # Some examples;
     #
@@ -64,8 +69,14 @@ module Ferret::Index
     #   index = Index::Index.new(:dir => directory,
     #                            :default_slop => 2,
     #                            :handle_parse_errors => false)
-    #
-    def initialize(options = {})
+    #
+    # You can also pass a block if you like. The index will be yielded and
+    # closed at the end of the block. For example;
+    #
+    #   Ferret::I.new() do |index|
+    #     # do stuff with index. Most of your actions will be cached.
+    #   end
+    def initialize(options = {}, &block)
       super()
       if options[:key]
@@ -92,14 +103,19 @@ module Ferret::Index
       end
       options[:dir] = @dir
+      options[:lock_retry_time]||= 2
       @dir.extend(MonitorMixin)
       @dir.synchronize do
         @options = options
-        @writer = IndexWriter.new(options) # create the index if need be
-        options[:analyzer] = @analyzer = @writer.analyzer
-        @writer.close
+        if (!@dir.exists?("segments")) || options[:create]
+          IndexWriter.new(options).close
+        end
+        options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
+        @searcher = nil
         @writer = nil
         @reader = nil
         @options.delete(:create) # only want to create the first time if at all
         @auto_flush = @options[:auto_flush] || false
         if (@options[:id_field].nil? and
@@ -117,13 +133,51 @@ module Ferret::Index
         @open = true
         @qp = nil
       end
+      if block
+        yield self
+        self.close
+      end
+    end
+    # Returns an array of strings with the matches highlighted. The +query+ can
+    # be either a query String or a Ferret::Search::Query object. The doc_id is
+    # the id of the document you want to highlight (usually returned by the
+    # search methods). There are also a number of options you can pass;
+    #
+    # === Options
+    #
+    # :field::            Default: @options[:default_field]. The default_field
+    #                     is the field that is usually highlighted but you can
+    #                     specify which field you want to highlight here. If
+    #                     you want to highlight multiple fields then you will
+    #                     need to call this method multiple times.
+    # :excerpt_length::   Default: 150. Length of excerpt to show. Highlighted
+    #                     terms will be in the centre of the excerpt.
+    # :num_excerpts::     Default: 2. Number of excerpts to return.
+    # :pre_tag::          Default: "<b>". Tag to place to the left of the
+    #                     match.  You'll probably want to change this to a
+    #                     "<span>" tag with a class. Try "\033[7m" for use
+    #                     in a terminal.
+    # :post_tag::         Default: "</b>". This tag should close the
+    #                     +:pre_tag+.  Try tag "\033[m" in the terminal.
+    # :ellipsis::         Default: "...". This is the string that is appended
+    #                     at the beginning and end of excerpts (unless the
+    #                     excerpt hits the start or end of the field). You'll
+    #                     probably want to change this to a Unicode ellipsis
+    #                     character.
+    def highlight(query, doc_id, options = {})
+      ensure_searcher_open()
+      @searcher.highlight(process_query(query),
+                          doc_id,
+                          options[:field]||@options[:default_field],
+                          options)
     end
     # Closes this index by closing its associated reader and writer objects.
     def close
       @dir.synchronize do
         if not @open
-          raise "tried to close an already closed directory"
+          raise(StandardError, "tried to close an already closed directory")
         end
         @searcher.close() if @searcher
         @reader.close() if @reader
@@ -534,7 +588,15 @@ module Ferret::Index
       def ensure_reader_open()
         raise "tried to use a closed index" if not @open
         if @reader
-          if not @reader.latest?
+          latest = false
+          begin
+            latest = @reader.latest?
+          rescue LockException => le
+            sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
+            latest = @reader.latest?
+          end
+          if not latest
+            @reader.close
             return @reader = IndexReader.new(@dir)
           end
         else

data/test/test_helper.rb CHANGED Viewed

@@ -3,9 +3,10 @@ $:.unshift File.join(File.dirname(__FILE__), '../lib')
 $:.unshift File.join(File.dirname(__FILE__), '../ext')
 class Float
-  def =~(o)
-    return (1 - self/o).abs < 0.00001
+  def approx_eql?(o)
+    return (1 - self/o).abs < 0.0001
   end
+  alias :=~ :approx_eql?
 end
 require 'test/unit'

data/test/unit/analysis/tc_token_stream.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require File.dirname(__FILE__) + "/../../test_helper"
+puts "Loading once"
 class TokenTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_token

data/test/unit/index/tc_index.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class IndexTest < Test::Unit::TestCase
   def setup()
   end
-  def tear_down()
+  def teardown()
   end
   def check_results(index, query, expected)
@@ -345,9 +345,10 @@ class IndexTest < Test::Unit::TestCase
     assert_equal(2, index2.size)
     assert_equal(2, index.size)
     top_docs = index.search("content3")
     assert_equal(0, top_docs.hits.size)
-    iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new())
+    iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new)
     iw << {:f, "content3"}
     iw.close()
@@ -355,6 +356,7 @@ class IndexTest < Test::Unit::TestCase
     assert_equal(1, top_docs.hits.size)
     assert_equal(3, index.size)
     assert_equal("content3", index[2][:f])
+    index2.close
     index.close
   end
@@ -556,6 +558,7 @@ class IndexTest < Test::Unit::TestCase
     data = %q(one two three four five six seven eight nine ten eleven twelve)
     index1 = Index.new(:path => fs_path, :auto_flush => true, :key => :id)
+    index1 << "zero"
     index2 = Index.new(:path => fs_path, :auto_flush => true)
     begin
       data.each do |datum|
@@ -611,4 +614,156 @@ class IndexTest < Test::Unit::TestCase
     hits = i.search 'move or shake'
     assert_equal 1, hits.total_hits # fails when id field is present
   end
+  def test_threading
+    path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
+    index = Ferret::Index::Index.new(:path => path, :create => true)
+    100.times do |i|
+      buf = ''
+      doc = {}
+      doc[:id] = i
+      doc[:foo] = "foo #{i}"
+      index << doc
+    end
+    threads = []
+    4.times do
+      threads << Thread.new(index) do |index|
+        result = index.search('id:42')
+        assert_equal(1, result.total_hits)
+      end
+    end
+    threads.each{|t| t.join }
+  end
+  def test_wildcard
+    i = nil
+    Ferret::I.new do |i|
+      i << "one"
+      assert_equal(1, i.search("*").total_hits)
+      i << "two"
+      assert_equal(2, i.search("*").total_hits)
+      i << {:content => "three"}
+      assert_equal(3, i.search("*").total_hits)
+      assert_equal(3, i.search("id:*").total_hits)
+      assert_equal(2, i.search('id:?*').total_hits)
+    end
+    assert_raise(StandardError) {i.close}
+  end
+  def test_highlighter()
+    index = Ferret::I.new(:default_field => :field,
+                          :default_input_field => :field,
+                          :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new)
+    [
+      "the words we are searching for are one and two also " +
+      "sometimes looking for them as a phrase like this; one " +
+      "two lets see how it goes"
+    ].each {|doc| index << doc }
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 1)
+    assert_equal(1, highlights.size)
+    assert_equal("...are <b>one</b>...", highlights[0])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 2)
+    assert_equal(2, highlights.size)
+    assert_equal("...are <b>one</b>...", highlights[0])
+    assert_equal("...this; <b>one</b>...", highlights[1])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 3)
+    assert_equal(3, highlights.size)
+    assert_equal("the words...", highlights[0])
+    assert_equal("...are <b>one</b>...", highlights[1])
+    assert_equal("...this; <b>one</b>...", highlights[2])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 4)
+    assert_equal(3, highlights.size)
+    assert_equal("the words we are...", highlights[0])
+    assert_equal("...are <b>one</b>...", highlights[1])
+    assert_equal("...this; <b>one</b>...", highlights[2])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 5)
+    assert_equal(2, highlights.size)
+    assert_equal("the words we are searching for are <b>one</b>...", highlights[0])
+    assert_equal("...this; <b>one</b>...", highlights[1])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 20)
+    assert_equal(1, highlights.size)
+    assert_equal("the words we are searching for are <b>one</b> and two also " +
+            "sometimes looking for them as a phrase like this; <b>one</b> " +
+            "two lets see how it goes", highlights[0])
+    highlights = index.highlight("one", 0,
+                                 :excerpt_length => 1000,
+                                 :num_excerpts => 1)
+    assert_equal(1, highlights.size)
+    assert_equal("the words we are searching for are <b>one</b> and two also " +
+            "sometimes looking for them as a phrase like this; <b>one</b> " +
+            "two lets see how it goes", highlights[0])
+    highlights = index.highlight("(one two)", 0,
+                                 :excerpt_length => 15,
+                                 :num_excerpts => 2)
+    assert_equal(2, highlights.size)
+    assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
+    assert_equal("...this; <b>one</b> <b>two</b>...", highlights[1])
+    highlights = index.highlight('one two "one two"', 0,
+                                 :excerpt_length => 15,
+                                 :num_excerpts => 2)
+    assert_equal(2, highlights.size)
+    assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
+    assert_equal("...this; <b>one two</b>...", highlights[1])
+    highlights = index.highlight('"one two"', 0,
+                                 :excerpt_length => 15,
+                                 :num_excerpts => 1)
+    assert_equal(1, highlights.size)
+    # should have a higher priority since it is the merger of three matches
+    assert_equal("...this; <b>one two</b>...", highlights[0])
+    highlights = index.highlight('"one two"', 0, :field => :not_a_field,
+                                 :excerpt_length => 15,
+                                 :num_excerpts => 1)
+    assert_nil(highlights)
+    highlights = index.highlight("wrong_field:one", 0, :field => :wrong_field,
+                                 :excerpt_length => 15,
+                                 :num_excerpts => 1)
+    assert_nil(highlights)
+    highlights = index.highlight('"the words" "for are one and two" ' +
+                                 'words one two', 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 1)
+    assert_equal(1, highlights.size)
+    assert_equal("...<b>for are one and two</b>...", highlights[0])
+    highlights = index.highlight('"the words" "for are one and two" ' +
+                                 'words one two', 0,
+                                 :excerpt_length => 10,
+                                 :num_excerpts => 2)
+    assert_equal(2, highlights.size)
+    assert_equal("<b>the words</b>...", highlights[0])
+    assert_equal("...<b>for are one and two</b>...", highlights[1])
+    index.close
+  end
 end

data/test/unit/index/tc_index_reader.rb CHANGED Viewed

@@ -63,6 +63,13 @@ module IndexReaderCommon
     assert_equal(1, te.doc_freq)
     assert(!te.next?)
+    expected = %w{is 1 more 1 not 1 skip 42 stored 1 text 1 which 1}
+    te = @ir.terms(:text)
+    te.each do |term, doc_freq|
+      assert_equal(expected.shift, term)
+      assert_equal(expected.shift.to_i, doc_freq)
+    end
     te = @ir.terms_from(:body, "Not")
     assert_equal("Not", te.term)
     assert_equal(1, te.doc_freq)
@@ -177,7 +184,7 @@ module IndexReaderCommon
   def do_test_get_doc()
     doc = @ir.get_document(3)
-    assert_equal([:year, :body, :title, :author], doc.fields)
+    [:author, :body, :title, :year].each {|fn| assert(doc.fields.include?(fn))}
     assert_equal(4, doc.fields.size)
     assert_equal(0, doc.size)
     assert_equal([], doc.keys)
@@ -296,6 +303,7 @@ module IndexReaderCommon
     assert_equal(doc_count, ir2.max_doc())
     assert_equal(doc_count, ir2.num_docs())
+    ir2.close
     ir2 = ir_new()
     assert(ir2.has_deletions?())
     assert_equal(doc_count, ir2.max_doc())
@@ -325,6 +333,7 @@ module IndexReaderCommon
     assert_equal(doc_count - 6, ir3.max_doc())
     assert_equal(doc_count - 6, ir3.num_docs())
+    ir2.close()
     ir3.close()
   end
 end
@@ -358,7 +367,7 @@ class MultiReaderTest < Test::Unit::TestCase
     @ir = ir_new()
   end
-  def tear_down()
+  def teardown()
     @ir.close()
     @dir.close()
   end
@@ -406,9 +415,102 @@ class MultiExternalReaderTest < Test::Unit::TestCase
     @ir = ir_new
   end
-  def tear_down()
+  def teardown()
+    @ir.close()
+    @dirs.each {|dir| dir.close}
+  end
+end
+class MultiExternalReaderDirTest < Test::Unit::TestCase
+  include IndexReaderCommon
+  def ir_new
+    IndexReader.new(@dirs)
+  end
+  def iw_optimize
+    @dirs.each do |dir|
+      iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
+      iw.optimize()
+      iw.close()
+    end
+  end
+  def setup()
+    @dirs = []
+    [
+      [0, 10],
+      [10, 30],
+      [30, IndexTestHelper::INDEX_TEST_DOCS.size]
+    ].each do |start, finish|
+      dir = Ferret::Store::RAMDirectory.new()
+      @dirs << dir
+      iw = IndexWriter.new(:dir => dir,
+                           :analyzer => WhiteSpaceAnalyzer.new(),
+                           :create => true,
+                           :field_infos => IndexTestHelper::INDEX_TEST_FIS)
+      (start...finish).each do |doc_id|
+        iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
+      end
+      iw.close()
+    end
+    @ir = ir_new
+  end
+  def teardown()
+    @ir.close()
+    @dirs.each {|dir| dir.close}
+  end
+end
+class MultiExternalReaderPathTest < Test::Unit::TestCase
+  include IndexReaderCommon
+  def ir_new
+    IndexReader.new(@paths)
+  end
+  def iw_optimize
+    @paths.each do |path|
+      iw = IndexWriter.new(:path => path, :analyzer => WhiteSpaceAnalyzer.new())
+      iw.optimize()
+      iw.close()
+    end
+  end
+  def setup()
+    base_dir = File.expand_path(File.join(File.dirname(__FILE__),
+                       '../../temp/multidir'))
+    FileUtils.mkdir_p(base_dir)
+    @paths = [
+      File.join(base_dir, "i1"),
+      File.join(base_dir, "i2"),
+      File.join(base_dir, "i3")
+    ]
+    [
+      [0, 10],
+      [10, 30],
+      [30, IndexTestHelper::INDEX_TEST_DOCS.size]
+    ].each_with_index do |(start, finish), i|
+      path = @paths[i]
+      iw = IndexWriter.new(:path => path,
+                           :analyzer => WhiteSpaceAnalyzer.new(),
+                           :create => true,
+                           :field_infos => IndexTestHelper::INDEX_TEST_FIS)
+      (start...finish).each do |doc_id|
+        iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
+      end
+      iw.close()
+    end
+    @ir = ir_new
+  end
+  def teardown()
     @ir.close()
-    @dir.close()
   end
 end
@@ -420,7 +522,7 @@ class IndexReaderTest < Test::Unit::TestCase
     @dir = Ferret::Store::RAMDirectory.new()
   end
-  def tear_down()
+  def teardown()
     @dir.close()
   end
@@ -445,6 +547,7 @@ class IndexReaderTest < Test::Unit::TestCase
     @dir = Ferret::Store::RAMDirectory.new(@fs_dir)
     ir = IndexReader.new(@dir)
     assert_equal(doc, ir.get_document(0).load)
+    ir.close
   end
   def do_test_term_vectors(ir)