RubyGems - ferret - Versions diffs - 0.11.4 → 0.11.5 - Mend

ferret 0.11.4 → 0.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

data/Rakefile +1 -0
data/TUTORIAL +3 -3
data/ext/analysis.c +12 -9
data/ext/array.c +10 -10
data/ext/array.h +8 -1
data/ext/bitvector.c +2 -2
data/ext/except.c +1 -1
data/ext/ferret.c +2 -2
data/ext/ferret.h +1 -1
data/ext/fs_store.c +13 -2
data/ext/global.c +4 -4
data/ext/global.h +6 -0
data/ext/hash.c +1 -1
data/ext/helper.c +1 -1
data/ext/helper.h +1 -1
data/ext/index.c +48 -22
data/ext/index.h +17 -16
data/ext/mempool.c +4 -1
data/ext/mempool.h +1 -1
data/ext/multimapper.c +2 -2
data/ext/q_fuzzy.c +2 -2
data/ext/q_multi_term.c +2 -2
data/ext/q_parser.c +39 -8
data/ext/q_range.c +32 -1
data/ext/r_analysis.c +66 -28
data/ext/r_index.c +18 -19
data/ext/r_qparser.c +21 -6
data/ext/r_search.c +74 -49
data/ext/r_store.c +1 -1
data/ext/r_utils.c +17 -17
data/ext/search.c +10 -5
data/ext/search.h +3 -1
data/ext/sort.c +2 -2
data/ext/stopwords.c +23 -34
data/ext/store.c +9 -9
data/ext/store.h +5 -4
data/lib/ferret/document.rb +2 -2
data/lib/ferret/field_infos.rb +37 -35
data/lib/ferret/index.rb +16 -6
data/lib/ferret/number_tools.rb +2 -2
data/lib/ferret_version.rb +1 -1
data/test/unit/analysis/tc_token_stream.rb +40 -0
data/test/unit/index/tc_index.rb +64 -101
data/test/unit/index/tc_index_reader.rb +13 -0
data/test/unit/largefile/tc_largefile.rb +46 -0
data/test/unit/query_parser/tc_query_parser.rb +17 -1
data/test/unit/search/tc_multiple_search_requests.rb +58 -0
data/test/unit/search/tm_searcher.rb +27 -1
data/test/unit/ts_largefile.rb +4 -0
metadata +147 -144

data/lib/ferret/index.rb CHANGED Viewed

@@ -322,8 +322,13 @@ module Ferret::Index
     # sort::        A Sort object or sort string describing how the field
     #               should be sorted. A sort string is made up of field names
     #               which cannot contain spaces and the word "DESC" if you
-    #               want the field reversed, all seperated by commas. For
-    #               example; "rating DESC, author, title"
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title". Note that Ferret
+    #               will try to determine a field's type by looking at the
+    #               first term in the index and seeing if it can be parsed as
+    #               an integer or a float. Keep this in mind as you may need
+    #               to specify a fields type to sort it correctly. For more
+    #               on this, see the documentation for SortField
     # filter::      a Filter object to filter the search results with
     # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
     #               and the Searcher object as its parameters and returns a
@@ -360,8 +365,13 @@ module Ferret::Index
     # sort::        A Sort object or sort string describing how the field
     #               should be sorted. A sort string is made up of field names
     #               which cannot contain spaces and the word "DESC" if you
-    #               want the field reversed, all seperated by commas. For
-    #               example; "rating DESC, author, title"
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title". Note that Ferret
+    #               will try to determine a field's type by looking at the
+    #               first term in the index and seeing if it can be parsed as
+    #               an integer or a float. Keep this in mind as you may need
+    #               to specify a fields type to sort it correctly. For more
+    #               on this, see the documentation for SortField
     # filter::      a Filter object to filter the search results with
     # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
     #               and the Searcher object as its parameters and returns a
@@ -451,7 +461,7 @@ module Ferret::Index
         ensure_writer_open()
         ensure_searcher_open()
         query = do_process_query(query)
-        @searcher.search_each(query) do |doc, score|
+        @searcher.search_each(query, :limit => :all) do |doc, score|
           @reader.delete(doc)
         end
         flush() if @auto_flush
@@ -623,7 +633,7 @@ module Ferret::Index
     #
     # directory:: This can either be a Store::Directory object or a String
     #             representing the path to the directory where you would
-    #             like to store the the index.
+    #             like to store the index.
     #
     # create::    True if you'd like to create the directory if it doesn't
     #             exist or copy over an existing directory. False if you'd

data/lib/ferret/number_tools.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'time'
 class Float
   # Return true if the float is within +precision+ of the other value +o+. This
-  # is used to accomodate for floating point errors.
+  # is used to accommodate for floating point errors.
   #
   # o::         value to compare with
   # precision:: the precision to use in the comparison.
@@ -49,7 +49,7 @@ class Integer
   # Convert the number to a lexicographically sortable string by padding with
   # 0s. You should make sure that you set the width to a number large enough to
-  # accomodate all possible values. Also note that this method will not work
+  # accommodate all possible values. Also note that this method will not work
   # with negative numbers. That is negative numbers will sort in the opposite
   # direction as positive numbers. If you have very large numbers or a mix of
   # positive and negative numbers you should use the Integer#to_s_lex method

data/lib/ferret_version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Ferret
-  VERSION = '0.11.4'
+  VERSION = '0.11.5'
 end

data/test/unit/analysis/tc_token_stream.rb CHANGED Viewed

@@ -508,6 +508,11 @@ module Ferret::Analysis
       return Token.new(normalize(term), term_start, term_end)
     end
+    def text=(text)
+      @ss = StringScanner.new(text)
+    end
     protected
       # returns the regular expression used to find the next token
       TOKEN_RE = /[[:alpha:]]+/
@@ -521,6 +526,23 @@ module Ferret::Analysis
       def normalize(str) return str end
   end
+  class MyReverseTokenFilter < TokenStream
+    def initialize(token_stream)
+      @token_stream = token_stream
+    end
+    def text=(text)
+      @token_stream.text = text
+    end
+    def next()
+      if token = @token_stream.next
+        token.text = token.text.reverse
+      end
+      token
+    end
+  end
   class MyCSVTokenizer < MyRegExpTokenizer
     protected
       # returns the regular expression used to find the next token
@@ -551,6 +573,24 @@ class CustomTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new("2nd field", 12, 21), t.next)
     assert_equal(Token.new("  p a d d e d  f i e l d  ", 22, 48), t.next)
     assert(! t.next())
+    t = MyReverseTokenFilter.new(
+          AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input)))
+    assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+    assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+    assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+    t.text = "one,TWO,three"
+    assert_equal(Token.new("eno", 0, 3), t.next)
+    assert_equal(Token.new("owt", 4, 7), t.next)
+    assert_equal(Token.new("eerht", 8, 13), t.next)
+    t = AsciiLowerCaseFilter.new(
+          MyReverseTokenFilter.new(MyCSVTokenizer.new(input)))
+    assert_equal(Token.new("dleif tsrif", 0, 11), t.next)
+    assert_equal(Token.new("dleif dn2", 12, 21), t.next)
+    assert_equal(Token.new("  d l e i f  d e d d a p  ", 22, 48), t.next)
+    t.text = "one,TWO,three"
+    assert_equal(Token.new("eno", 0, 3), t.next)
+    assert_equal(Token.new("owt", 4, 7), t.next)
+    assert_equal(Token.new("eerht", 8, 13), t.next)
   end
 end

data/test/unit/index/tc_index.rb CHANGED Viewed

@@ -658,6 +658,19 @@ class IndexTest < Test::Unit::TestCase
     assert_raise(StandardError) {i.close}
   end
+  def check_highlight(index, q, excerpt_length, num_excerpts, expected, field = :field)
+    highlights = index.highlight(q, 0,
+                                 :excerpt_length => excerpt_length,
+                                 :num_excerpts => num_excerpts,
+                                 :field => field)
+    assert_equal(expected, highlights)
+    highlights = index.highlight(q, 1,
+                                 :excerpt_length => excerpt_length,
+                                 :num_excerpts => num_excerpts,
+                                 :field => field)
+    assert_equal(expected, highlights)
+  end
   def test_highlighter()
     index = Ferret::I.new(:default_field => :field,
                           :default_input_field => :field,
@@ -665,109 +678,49 @@ class IndexTest < Test::Unit::TestCase
     [
       "the words we are searching for are one and two also " +
       "sometimes looking for them as a phrase like this; one " +
-      "two lets see how it goes"
+      "two lets see how it goes",
+      [
+        "the words we",
+        "are searching",
+        "for are one",
+        "and two also",
+        "sometimes looking",
+        "for them as a",
+        "phrase like this;",
+        "one two lets see",
+        "how it goes"
+      ]
     ].each {|doc| index << doc }
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 1)
-    assert_equal(1, highlights.size)
-    assert_equal("...are <b>one</b>...", highlights[0])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 2)
-    assert_equal(2, highlights.size)
-    assert_equal("...are <b>one</b>...", highlights[0])
-    assert_equal("...this; <b>one</b>...", highlights[1])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 3)
-    assert_equal(3, highlights.size)
-    assert_equal("the words...", highlights[0])
-    assert_equal("...are <b>one</b>...", highlights[1])
-    assert_equal("...this; <b>one</b>...", highlights[2])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 4)
-    assert_equal(3, highlights.size)
-    assert_equal("the words we are...", highlights[0])
-    assert_equal("...are <b>one</b>...", highlights[1])
-    assert_equal("...this; <b>one</b>...", highlights[2])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 5)
-    assert_equal(2, highlights.size)
-    assert_equal("the words we are searching for are <b>one</b>...", highlights[0])
-    assert_equal("...this; <b>one</b>...", highlights[1])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 20)
-    assert_equal(1, highlights.size)
-    assert_equal("the words we are searching for are <b>one</b> and two also " +
-            "sometimes looking for them as a phrase like this; <b>one</b> " +
-            "two lets see how it goes", highlights[0])
-    highlights = index.highlight("one", 0,
-                                 :excerpt_length => 1000,
-                                 :num_excerpts => 1)
-    assert_equal(1, highlights.size)
-    assert_equal("the words we are searching for are <b>one</b> and two also " +
-            "sometimes looking for them as a phrase like this; <b>one</b> " +
-            "two lets see how it goes", highlights[0])
-    highlights = index.highlight("(one two)", 0,
-                                 :excerpt_length => 15,
-                                 :num_excerpts => 2)
-    assert_equal(2, highlights.size)
-    assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
-    assert_equal("...this; <b>one</b> <b>two</b>...", highlights[1])
-    highlights = index.highlight('one two "one two"', 0,
-                                 :excerpt_length => 15,
-                                 :num_excerpts => 2)
-    assert_equal(2, highlights.size)
-    assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
-    assert_equal("...this; <b>one two</b>...", highlights[1])
-    highlights = index.highlight('"one two"', 0,
-                                 :excerpt_length => 15,
-                                 :num_excerpts => 1)
-    assert_equal(1, highlights.size)
-    # should have a higher priority since it the merger of three matches
-    assert_equal("...this; <b>one two</b>...", highlights[0])
-    highlights = index.highlight('"one two"', 0, :field => :not_a_field,
-                                 :excerpt_length => 15,
-                                 :num_excerpts => 1)
-    assert_nil(highlights)
-    highlights = index.highlight("wrong_field:one", 0, :field => :wrong_field,
-                                 :excerpt_length => 15,
-                                 :num_excerpts => 1)
-    assert_nil(highlights)
-    highlights = index.highlight('"the words" "for are one and two" ' +
-                                 'words one two', 0,
-                                 :excerpt_length => 10,
-                                 :num_excerpts => 1)
-    assert_equal(1, highlights.size)
-    assert_equal("<b>the words</b>...", highlights[0])
-    highlights = index.highlight('"the words" "for are one and two" ' +
-                                 'words one two', 0,
-                                 :excerpt_length => 20,
-                                 :num_excerpts => 2)
-    assert_equal(2, highlights.size)
-    assert_equal("<b>the words</b> we are...", highlights[0])
-    assert_equal("...<b>for are one and two</b>...", highlights[1])
+    check_highlight(index, "one", 10, 1, ["...are <b>one</b>..."])
+    check_highlight(index, "one", 10, 2,
+                    ["...are <b>one</b>...","...this; <b>one</b>..."])
+    check_highlight(index, "one", 10, 3,
+                    ["the words...","...are <b>one</b>...","...this; <b>one</b>..."])
+    check_highlight(index, "one", 10, 4,
+                    ["the words we are...","...are <b>one</b>...","...this; <b>one</b>..."])
+    check_highlight(index, "one", 10, 5,
+                    ["the words we are searching for are <b>one</b>...","...this; <b>one</b>..."])
+    check_highlight(index, "one", 10, 20,
+                    ["the words we are searching for are <b>one</b> and two also " +
+                     "sometimes looking for them as a phrase like this; <b>one</b> " +
+                     "two lets see how it goes"])
+    check_highlight(index, "one", 200, 1,
+                    ["the words we are searching for are <b>one</b> and two also " +
+                     "sometimes looking for them as a phrase like this; <b>one</b> " +
+                     "two lets see how it goes"])
+    check_highlight(index, "(one two)", 15, 2,
+                    ["...<b>one</b> and <b>two</b>...","...this; <b>one</b> <b>two</b>..."])
+    check_highlight(index, 'one two "one two"', 15, 2,
+                    ["...<b>one</b> and <b>two</b>...","...this; <b>one two</b>..."])
+    check_highlight(index, 'one two "one two"', 15, 1,
+                    ["...this; <b>one two</b>..."])
+    check_highlight(index, '"one two"', 15, 1, nil, :not_a_field)
+    check_highlight(index, 'wrong_field:one', 15, 1, nil, :wrong_field)
+    check_highlight(index, '"the words" "for are one and two" words one two', 10, 1,
+                    ["<b>the words</b>..."])
+    check_highlight(index, '"the words" "for are one and two" words one two', 20, 2,
+                    ["<b>the words</b> we are...","...<b>for are one and two</b>..."])
     index.close
   end
@@ -796,4 +749,14 @@ class IndexTest < Test::Unit::TestCase
     assert_equal('[]', index.search("xxx").to_json)
     index.close
   end
+  def test_large_query_delete
+    index = Ferret::I.new
+    20.times do
+      index << {:id => 'one'}
+      index << {:id => 'two'}
+    end
+    index.query_delete('id:one')
+    assert_equal(20, index.size)
+  end
 end

data/test/unit/index/tc_index_reader.rb CHANGED Viewed

@@ -378,6 +378,19 @@ module IndexReaderCommon
     ir2.close()
     ir3.close()
   end
+  def test_latest
+    assert(@ir.latest?)
+    ir2 = ir_new()
+    assert(ir2.latest?)
+    ir2.delete(0)
+    ir2.commit()
+    assert(ir2.latest?)
+    assert(!@ir.latest?)
+    ir2.close()
+  end
 end
 class MultiReaderTest < Test::Unit::TestCase

data/test/unit/largefile/tc_largefile.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+class SampleLargeTest < Test::Unit::TestCase
+  include Ferret::Index
+  include Ferret::Search
+  include Ferret::Store
+  include Ferret::Utils
+  INDEX_DIR = File.dirname(__FILE__) + "/../../temp/largefile"
+  RECORDS = 750
+  RECORD_SIZE = 10e5
+  def setup
+    @index = Index.new(:path => INDEX_DIR, :create_if_missing => true, :key => :id)
+    create_index! if @index.size == 0 or ENV["RELOAD_LARGE_INDEX"]
+  end
+  def test_file_index_created
+    assert @index.size == RECORDS, "Index size should be #{RECORDS}, is #{@index.size}"
+  end
+  def test_keys_work
+    @index << {:content => "foo", :id => RECORDS - 4}
+    assert @index.size == RECORDS, "Index size should be #{RECORDS}, is #{@index.size}"
+  end
+  def test_read_file_after_two_gigs
+    assert @index.reader[RECORDS - 5].load.is_a?Hash
+  end
+  def create_index!
+    @@already_built_large_index ||= false
+    return if @@already_built_large_index
+    @@already_built_large_index = true
+    a = "a"
+    RECORDS.times { |i|
+      seq = (a.succ! + " ") * RECORD_SIZE
+      record = {:id => i, :content => seq}
+    	@index << record
+    	print "i"
+    	STDOUT.flush
+    }
+    puts "o"
+    @index.optimize
+  end
+end

data/test/unit/query_parser/tc_query_parser.rb CHANGED Viewed

@@ -22,7 +22,7 @@ class QueryParserTest < Test::Unit::TestCase
       ['field:"one <> <> <> three <>"', 'field:"one <> <> <> three"'],
       ['field:"one <> 222 <> three|four|five <>"', 'field:"one <> 222 <> three|four|five"'],
       ['field:"on1|tw2 THREE|four|five six|seven"', 'field:"on1|tw2 THREE|four|five six|seven"'],
-      ['field:"testing|trucks"', 'field:testing field:trucks'],
+      ['field:"testing|trucks"', 'field:"testing|trucks"'],
       ['[aaa bbb]', '[aaa bbb]'],
       ['{aaa bbb]', '{aaa bbb]'],
       ['field:[aaa bbb}', 'field:[aaa bbb}'],
@@ -91,6 +91,8 @@ class QueryParserTest < Test::Unit::TestCase
       ['*:"asdf <> xxx|yyy"', '"asdf <> xxx|yyy" field:"asdf <> xxx|yyy" f1:"asdf <> xxx|yyy" f2:"asdf <> xxx|yyy"'],
       ['f1|f2:"asdf <> xxx|yyy"', 'f1:"asdf <> xxx|yyy" f2:"asdf <> xxx|yyy"'],
+      ['f1|f2:"asdf <> do|yyy"', 'f1:"asdf <> yyy" f2:"asdf <> yyy"'],
+      ['f1|f2:"do|cat"', 'f1:cat f2:cat'],
       ['*:[bbb xxx]', '[bbb xxx] field:[bbb xxx] f1:[bbb xxx] f2:[bbb xxx]'],
       ['f1|f2:[bbb xxx]', 'f1:[bbb xxx] f2:[bbb xxx]'],
@@ -219,4 +221,18 @@ class QueryParserTest < Test::Unit::TestCase
       assert_equal(expected, parser.parse(query_str).to_s("xxx"))
     end
   end
+  def test_use_keywords_switch
+    analyzer = LetterAnalyzer.new
+    parser = Ferret::QueryParser.new(:analyzer => analyzer,
+                                     :default_field => "xxx")
+    assert_equal("+www (+xxx +yyy) -zzz",
+                 parser.parse("REQ www (xxx AND yyy) OR NOT zzz").to_s("xxx"))
+    parser = Ferret::QueryParser.new(:analyzer => analyzer,
+                                     :default_field => "xxx",
+                                     :use_keywords => false)
+    assert_equal("req www (xxx and yyy) or not zzz",
+                 parser.parse("REQ www (xxx AND yyy) OR NOT zzz").to_s("xxx"))
+  end
 end

data/test/unit/search/tc_multiple_search_requests.rb ADDED Viewed

@@ -0,0 +1,58 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+class MultipleSearchRequestsTest < Test::Unit::TestCase
+  include Ferret::Search
+  include Ferret::Store
+  include Ferret::Analysis
+  include Ferret::Index
+  def setup()
+    dpath = File.expand_path(File.join(File.dirname(__FILE__),
+                       '../../temp/fsdir'))
+    fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
+    iw = IndexWriter.new(:dir => fs_dir, :create => true, :key => [:id])
+    1000.times do |x|
+      doc = {:id => x}
+      iw << doc
+    end
+    iw.close()
+    fs_dir.close()
+    @ix = Index.new(:path => dpath, :create => true, :key => [:id])
+  end
+  def tear_down()
+    @ix.close
+  end
+  def test_repeated_queries_segmentation_fault
+    1000.times do |x|
+      bq = BooleanQuery.new()
+      tq1 = TermQuery.new(:id, 1)
+      tq2 = TermQuery.new(:another_id, 1)
+      bq.add_query(tq1, :must)
+      bq.add_query(tq2, :must)
+      top_docs = @ix.search(bq)
+    end
+  end
+  def test_repeated_queries_bus_error
+    1000.times do |x|
+      bq = BooleanQuery.new()
+      tq1 = TermQuery.new(:id, '1')
+      tq2 = TermQuery.new(:another_id, '1')
+      tq3 = TermQuery.new(:yet_another_id, '1')
+      tq4 = TermQuery.new(:still_another_id, '1')
+      tq5 = TermQuery.new(:one_more_id, '1')
+      tq6 = TermQuery.new(:and_another_id, '1')
+      bq.add_query(tq1, :must)
+      bq.add_query(tq2, :must)
+      bq.add_query(tq3, :must)
+      bq.add_query(tq4, :must)
+      bq.add_query(tq5, :must)
+      bq.add_query(tq6, :must)
+      top_docs = @ix.search(bq)
+    end
+  end
+end