RubyGems - ferret - Versions diffs - 0.1.3 → 0.1.4 - Mend

ferret 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

data/Rakefile +1 -1
data/TODO +3 -0
data/ext/dummy.exe +0 -0
data/lib/ferret.rb +1 -1
data/lib/ferret/analysis/token.rb +6 -0
data/lib/ferret/analysis/tokenizers.rb +5 -5
data/lib/ferret/document/document.rb +10 -13
data/lib/ferret/index/compound_file_io.rb +12 -9
data/lib/ferret/index/field_infos.rb +0 -6
data/lib/ferret/index/index.rb +220 -102
data/lib/ferret/index/index_reader.rb +22 -2
data/lib/ferret/index/index_writer.rb +55 -14
data/lib/ferret/index/multi_reader.rb +279 -279
data/lib/ferret/index/segment_infos.rb +3 -3
data/lib/ferret/index/segment_merger.rb +7 -6
data/lib/ferret/index/segment_reader.rb +23 -7
data/lib/ferret/index/segment_term_enum.rb +6 -7
data/lib/ferret/index/term_buffer.rb +3 -5
data/lib/ferret/index/term_doc_enum.rb +7 -2
data/lib/ferret/index/term_infos_io.rb +15 -8
data/lib/ferret/query_parser/query_parser.tab.rb +49 -45
data/lib/ferret/search/boolean_query.rb +3 -4
data/lib/ferret/search/boolean_scorer.rb +11 -11
data/lib/ferret/search/caching_wrapper_filter.rb +1 -1
data/lib/ferret/search/disjunction_sum_scorer.rb +9 -7
data/lib/ferret/search/field_cache.rb +1 -2
data/lib/ferret/search/field_sorted_hit_queue.rb +1 -1
data/lib/ferret/search/fuzzy_term_enum.rb +64 -58
data/lib/ferret/search/index_searcher.rb +16 -9
data/lib/ferret/search/prefix_query.rb +7 -0
data/lib/ferret/search/query_filter.rb +1 -1
data/lib/ferret/search/term_scorer.rb +5 -1
data/lib/ferret/search/top_docs.rb +12 -0
data/lib/ferret/store/buffered_index_io.rb +5 -6
data/lib/ferret/store/fs_store.rb +47 -33
data/lib/ferret/store/ram_store.rb +2 -2
data/lib/ferret/utils.rb +1 -0
data/lib/ferret/utils/bit_vector.rb +20 -2
data/lib/ferret/utils/thread_local.rb +28 -0
data/lib/ferret/utils/weak_key_hash.rb +11 -2
data/test/benchmark/tb_rw_vint.rb +1 -1
data/test/functional/thread_safety_index_test.rb +81 -0
data/test/functional/thread_safety_test.rb +137 -0
data/test/test_all.rb +3 -7
data/test/test_helper.rb +2 -1
data/test/unit/index/tc_compound_file_io.rb +2 -2
data/test/unit/index/tc_index.rb +128 -6
data/test/unit/index/tc_index_reader.rb +1 -1
data/test/unit/index/tc_segment_infos.rb +1 -1
data/test/unit/index/th_doc.rb +1 -1
data/test/unit/search/tc_index_searcher.rb +6 -0
data/test/unit/store/tc_fs_store.rb +3 -3
data/test/unit/utils/tc_bit_vector.rb +8 -0
data/test/unit/utils/tc_thread.rb +61 -0
data/test/unit/utils/tc_weak_key_hash.rb +2 -2
data/test/utils/number_to_spoken.rb +132 -0
metadata +7 -2

data/lib/ferret/search/boolean_query.rb CHANGED Viewed

@@ -251,10 +251,9 @@ module Ferret::Search
       return Query.merge_boolean_queries(queries)
     end
-    def clone()
-      clone = super
-      clone.clauses = @clauses.clone
-      return clone
+    def initialize_copy(o)
+      super
+      @clauses = o.clauses.clone
     end
     # Prints a user-readable version of this query.

data/lib/ferret/search/boolean_scorer.rb CHANGED Viewed

@@ -87,11 +87,11 @@ module Ferret::Search
       def next?
         return @scorer.next?
       end
-      def skip_to(doc_nr)
-        return @scorer.skip_to(doc_nr)
+      def skip_to(doc_num)
+        return @scorer.skip_to(doc_num)
       end
-      def explain(doc_nr)
-        return @scorer.explain(doc_nr)
+      def explain(doc_num)
+        return @scorer.explain(doc_num)
       end
     end
@@ -116,13 +116,13 @@ module Ferret::Search
       def initialize(parent_scorer, similarity)
         super(similarity)
         @parent_scorer = parent_scorer
-        @required_nr_matchers = parent_scorer.required_scorers.size
+        @required_num_matchers = parent_scorer.required_scorers.size
         @last_scored_doc = -1
       end
       def score
         if (@parent_scorer.doc() > @last_scored_doc)
           @last_scored_doc = @parent_scorer.doc()
-          @parent_scorer.coordinator.nr_matchers += @required_nr_matchers
+          @parent_scorer.coordinator.nr_matchers += @required_num_matchers
         end
         return super
@@ -132,7 +132,7 @@ module Ferret::Search
     def counting_conjunction_sum_scorer(required_scorers)
       # each scorer from the list counted as a single matcher
-      required_nr_matchers = required_scorers.size
+      required_num_matchers = required_scorers.size
       ccs = CountingConjunctionScorer.new(self, Similarity.default)
       @required_scorers.each do |scorer|
         ccs << scorer
@@ -239,13 +239,13 @@ module Ferret::Search
     # returns:: true if more matching documents may remain.
     def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
       # nil pointer exception when next? was not called before:
-      doc_nr = @counting_sum_scorer.doc()
-      while (doc_nr < max)
-        yield(doc_nr, score())
+      doc_num = @counting_sum_scorer.doc()
+      while (doc_num < max)
+        yield(doc_num, score())
         if not @counting_sum_scorer.next?
           return false
         end
-        doc_nr = @counting_sum_scorer.doc()
+        doc_num = @counting_sum_scorer.doc()
       end
       return true
     end

data/lib/ferret/search/caching_wrapper_filter.rb CHANGED Viewed

@@ -14,7 +14,7 @@ module Ferret::Search
     def bits(reader)
       if (@cache == nil)
-        @cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
+        @cache = Ferret::Utils::WeakKeyHash.new
       end
       @cache.synchronize() do # check cache

data/lib/ferret/search/disjunction_sum_scorer.rb CHANGED Viewed

@@ -8,13 +8,15 @@ module Ferret::Search
     # Construct a +DisjunctionScorer+.
     # sub_scorers:: A collection of at least two subscorers.
     #
-    # minimum_nr_matchers:: The positive minimum number of subscorers that should
-    # match to match this query.
-    # <br>When +@minimum_nr_matchers+ is bigger than
-    # the number of +sub_scorers+,
-    # no matches will be produced.
-    # <br>When @minimum_nr_matchers equals the number of sub_scorers,
-    # it more efficient to use +ConjunctionScorer+.
+    # minimum_nr_matchers:: The positive minimum number of subscorers that
+    #                       should match to match this query.
+    #
+    #                       When +@minimum_nr_matchers+ is bigger than the number
+    #                       of +sub_scorers+, no matches will be produced.
+    #
+    #                       When @minimum_nr_matchers equals the number of
+    #                       sub_scorers, it is more efficient to use
+    #                       +ConjunctionScorer+.
     def initialize(sub_scorers, minimum_nr_matchers = 1)
       super(nil)

data/lib/ferret/search/field_cache.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 module Ferret::Search
-  require 'monitor'
   # Expert: The default cache implementation, storing all values in memory.
   # A WeakKeyHash is used for storage.
@@ -36,7 +35,7 @@ module Ferret::Search
     FLOAT_PARSER = lambda {|i| i.to_f}
     # The internal cache. Maps Entry to array of interpreted term values.
-    @@cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
+    @@cache = Ferret::Utils::WeakKeyHash.new
     # See if an object is in the cache.
     def FieldCache.lookup(reader, field, sort_type)

data/lib/ferret/search/field_sorted_hit_queue.rb CHANGED Viewed

@@ -83,7 +83,7 @@ module Ferret::Search
     # Internal cache of comparators. Similar to FieldCache, only
     # caches comparators instead of term values.
-    @@comparators = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
+    @@comparators = Ferret::Utils::WeakKeyHash.new
     # Returns a comparator if it is in the cache.
     def lookup(reader, field, sort_type, comproc)

data/lib/ferret/search/fuzzy_term_enum.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'monitor'
 module Ferret::Search
   # Subclass of FilteredTermEnum for enumerating all terms that are similar
   # to the specified filter term.
@@ -5,6 +7,8 @@ module Ferret::Search
   # Term enumerations are always ordered by Term.compareTo().  Each term in
   # the enumeration is greater than all that precede it.
   class FuzzyTermEnum < FilteredTermEnum
+    include MonitorMixin
     include Ferret::Index
     attr_reader :end_enum
@@ -144,73 +148,75 @@ module Ferret::Search
     #    than the required threshold and 1.0 indicates that the text and
     #    target are identical
     def similarity(target)
-      m = target.length
-      n = @text.length
+      synchronize do
+        m = target.length
+        n = @text.length
+        if (n == 0)
+          # we don't have anything to compare.  That means if we just add the
+          # letters for m we get the new word
+          return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
+        end
+        if (m == 0)
+          return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
+        end
-      if (n == 0)
-        # we don't have anything to compare.  That means if we just add the
-        # letters for m we get the new word
-        return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
-      end
-      if (m == 0)
-        return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
-      end
+        max_distance = max_distance(m)
-      max_distance = max_distance(m)
-      if (max_distance < (m-n).abs)
-        #just adding the characters of m to n or vice-versa results in
-        #too many edits
-        #for example "pre" length is 3 and "prefixes" length is 8.  We can see that
-        #given this optimal circumstance, the edit distance cannot be less than 5.
-        #which is 8-3 or more precisesly Math.abs(3-8).
-        #if our maximum edit distance is 4, then we can discard this word
-        #without looking at it.
-        return 0.0
-      end
+        if (max_distance < (m-n).abs)
+          #just adding the characters of m to n or vice-versa results in
+          #too many edits
+          #for example "pre" length is 3 and "prefixes" length is 8.  We can see that
+          #given this optimal circumstance, the edit distance cannot be less than 5.
+          #which is 8-3 or more precisely Math.abs(3-8).
+          #if our maximum edit distance is 4, then we can discard this word
+          #without looking at it.
+          return 0.0
+        end
-      #let's make sure we have enough room in our array to do the distance calculations.
-      if (@d[0].length <= m)
-        grow_distance_array(m)
-      end
+        #let's make sure we have enough room in our array to do the distance calculations.
+        if (@d[0].length <= m)
+          grow_distance_array(m)
+        end
-      # init matrix d
-      (n+1).times {|i| @d[i][0] = i}
-      (m+1).times {|j| @d[0][j] = j}
-      # start computing edit distance
-      1.upto(n) do |i|
-        best_possible_edit_distance = m
-        s_i = @text[i-1]
-        1.upto(m) do |j|
-          if (s_i != target[j-1])
-            @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1])+1
-          else
-            @d[i][j] = min(@d[i-1][j]+1, @d[i][j-1]+1, @d[i-1][j-1])
+        # init matrix d
+        (n+1).times {|i| @d[i][0] = i}
+        (m+1).times {|j| @d[0][j] = j}
+        # start computing edit distance
+        1.upto(n) do |i|
+          best_possible_edit_distance = m
+          s_i = @text[i-1]
+          1.upto(m) do |j|
+            if (s_i != target[j-1])
+              @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1])+1
+            else
+              @d[i][j] = min(@d[i-1][j]+1, @d[i][j-1]+1, @d[i-1][j-1])
+            end
+            if @d[i][j] < best_possible_edit_distance
+              best_possible_edit_distance = @d[i][j]
+            end
           end
-          if @d[i][j] < best_possible_edit_distance
-            best_possible_edit_distance = @d[i][j]
+          # After calculating row i, the best possible edit distance can be
+          # found by finding the smallest value in a given column.
+          # If the best_possible_edit_distance is greater than the max distance,
+          # abort.
+          if (i > max_distance and best_possible_edit_distance > max_distance)
+            # equal is okay, but not greater
+            # the closest the target can be to the text is just too far away.
+            # this target is leaving the party early.
+            return 0.0
           end
         end
-        # After calculating row i, the best possible edit distance can be
-        # found by found by finding the smallest value in a given column.
-        # If the best_possible_edit_distance is greater than the max distance,
-        # abort.
-        if (i > max_distance and best_possible_edit_distance > max_distance)
-          # equal is okay, but not greater
-          # the closest the target can be to the text is just too far away.
-          # this target is leaving the party early.
-          return 0.0
-        end
+        # this will return less than 0.0 when the edit distance is
+        # greater than the number of characters in the shorter word.
+        # but this was the formula that was previously used in FuzzyTermEnum,
+        # so it has not been changed (even though minimum_similarity must be
+        # greater than 0.0)
+        return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
       end
-      # this will return less than 0.0 when the edit distance is
-      # greater than the number of characters in the shorter word.
-      # but this was the formula that was previously used in FuzzyTermEnum,
-      # so it has not been changed (even though minimum_similarity must be
-      # greater than 0.0)
-      return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
     end
     # Grow the second dimension of the array, so that we can calculate the

data/lib/ferret/search/index_searcher.rb CHANGED Viewed

@@ -11,13 +11,20 @@ module Ferret::Search
     attr_accessor :similarity, :reader
     # Creates a searcher searching the index in the provided directory.
+    #
+    # You need to pass one argument which should be one of the following:
+    #
+    #   * An index reader which the searcher will search
+    #   * A directory where the searcher will open an index reader to search
+    #   * A string which represents a path to the directory to be searched
+    #
     def initialize(arg)
       if arg.is_a?(IndexReader)
         @reader = arg
       elsif arg.is_a?(Ferret::Store::Directory)
-        @reader = IndexReader.open(arg)
+        @reader = IndexReader.open(arg, false)
       elsif arg.is_a?(String)
-        @dir = Ferret::Store::FSDirectory.new(arg, true)
+        @dir = Ferret::Store::FSDirectory.new(arg, false)
         @reader = IndexReader.open(@dir, true)
       else
         raise ArgumentError, "Unknown argument passed to initialize IndexReader"
@@ -50,10 +57,10 @@ module Ferret::Search
     end
     # Expert: Returns the stored fields of document +i+.
-    # Called by HitCollector implementations.
+    #
     # See IndexReader#get_document
     def doc(i)
-      return @reader.document(i)
+      return @reader.get_document(i)
     end
     # Expert: Returns one greater than the largest possible document number.
@@ -73,12 +80,12 @@ module Ferret::Search
     # pass to this method. You can also pass a hash with one or more of the
     # following; {filter, num_docs, first_doc, sort}
     #
-    # query::    the query to run on the index
-    # filter::   filters docs from the search result
+    # query::     The query to run on the index
+    # filter::    filters docs from the search result
     # first_doc:: The index in the results of the first doc retrieved.
-    #    Default is 0
-    # num_docs:: The number of results returned. Default is 10
-    # sort::     an array of SortFields describing how to sort the results.
+    #             Default is 0
+    # num_docs::  The number of results returned. Default is 10
+    # sort::      An array of SortFields describing how to sort the results.
     def search(query, options = {})
       filter = options[:filter]
       first_doc = options[:first_doc]||0

data/lib/ferret/search/prefix_query.rb CHANGED Viewed

@@ -43,5 +43,12 @@ module Ferret::Search
       return buffer
     end
+    def eql?(o)
+      (@prefix == o.prefix and boost() == o.boost)
+    end
+    def hash()
+      boost().hash ^ @prefix.hash
+    end
   end
 end

data/lib/ferret/search/query_filter.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module Ferret::Search
     def bits(reader)
       if (@cache == nil)
-        @cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
+        @cache = Ferret::Utils::WeakKeyHash.new
       end
       @cache.synchronize() do # check cache

data/lib/ferret/search/term_scorer.rb CHANGED Viewed

@@ -25,6 +25,8 @@ module Ferret::Search
       @weight = weight
       @term_docs = td
       @norms = norms
+      #XXX
+      @norms_size = @norms.size
       @weight_value = weight.value
       SCORE_CACHE_SIZE.times do |i|
@@ -85,8 +87,10 @@ module Ferret::Search
     # Advances to the next document matching the query.
-    # <br>The iterator over the matching documents is buffered using
+    #
+    # The iterator over the matching documents is buffered using
     # TermDocEnum#read(int[],int[]).
+    #
     # returns:: true iff there is another document matching the query.
     def next?()
       @pointer += 1

data/lib/ferret/search/top_docs.rb CHANGED Viewed

@@ -5,6 +5,18 @@ module Ferret::Search
     # Expert: The total number of hits for the query.
     # See Hits#length()
     attr_accessor :score_docs, :total_hits, :fields
+    alias :size :total_hits
+    # iterate through each of the score docs, yielding the document number and
+    # the score. eg:
+    #
+    #   top_docs.each do |doc, score|
+    #     puts "Doc number #{doc} found with score of #{score}"
+    #   end
+    #
+    def each
+      score_docs.each {|sd| yield(sd.doc, sd.score) }
+    end
     # Expert: Constructs a TopDocs.
     def initialize(total_hits, score_docs, fields = SortField::FIELD_SCORE)

data/lib/ferret/store/buffered_index_io.rb CHANGED Viewed

@@ -140,14 +140,13 @@ module Ferret::Store
     # Creates a clone of the BufferedIndexInput. Reading from a
     # BufferedIndexInput should not change the state (read position) in the
     # clone and vice-versa.
-    def clone()
-      bii = super
-      bii.buffer = @buffer.clone if @buffer
-      return bii
+    def initialize_copy(o)
+      super
+      @buffer = o.buffer.clone if o.buffer
     end
-    attr_writer :buffer
-    protected :buffer=
+    attr_reader :buffer
+    protected :buffer
     private

data/lib/ferret/store/fs_store.rb CHANGED Viewed

@@ -31,6 +31,30 @@ module Ferret::Store
     # The lock dir is the directory where the file locks will be stored
     LOCK_DIR = nil
+    # Create a new directory from the path.
+    # path:: the path to the directory.
+    # create:: if true, create, or erase any existing contents.
+    def initialize(path, create)
+      super()
+      if create then FileUtils.mkdir_p(path) end
+      if not File.directory?(path) then
+        raise "There is no directory: #{path}. Use create = true to create one"
+      end
+      @dir = Dir.new(path)
+      # put the lock_dir here as well if no default exists.
+      if LOCK_DIR then
+        @lock_dir = Dir.new(LOCK_DIR)
+      else
+        @lock_dir = Dir.new(path)
+      end
+      @ref_count = 0
+    end
+    class <<FSDirectory
+      alias :allocate :new
+      protected :allocate
+    end
     # Returns the directory instance for the named location.
     #
     # Directories are cached, so that, for a given canonical path, the same
@@ -39,12 +63,12 @@ module Ferret::Store
     #
     # path:: the path to the directory.
     # create:: if true, create, or erase any existing contents.
-    def FSDirectory.get_directory(path, create=false)
+    def FSDirectory.new(path, create = false)
       dir = nil
       @@Directories.synchronize do
         dir = @@Directories[path]
         if not dir then
-          dir = FSDirectory.new(path, create)
+          dir = FSDirectory.allocate(path, create)
           @@Directories[path] = dir
         end
         dir.refresh if create
@@ -76,6 +100,7 @@ module Ferret::Store
     def refresh
       synchronize do
         # delete all the files
+        refresh_dir
         each do |fname|
           File.delete(dir_path(fname))
         end
@@ -133,7 +158,9 @@ module Ferret::Store
     # If a file already exists with the new name, then it is replaced.
     # This replacement should be atomic.
     def rename(from, to)
-      File.rename(dir_path(from), dir_path(to))
+      synchronize do
+        File.rename(dir_path(from), dir_path(to))
+      end
     end
@@ -160,10 +187,12 @@ module Ferret::Store
     # Closes the store.
     def close()
-      @ref_count -= 1
-      if (@ref_count <=0) then
-        @@Directories.synchronize do
-          @@Directories.delete(@dir.path)
+      synchronize do
+        @ref_count -= 1
+        if (@ref_count <= 0) then
+          @@Directories.synchronize do
+            @@Directories.delete(@dir.path)
+          end
         end
       end
     end
@@ -242,29 +271,32 @@ module Ferret::Store
     # A file system input stream extending InputStream to read from the file system
     class FSIndexInput < BufferedIndexInput
-      attr_writer :is_clone
-      attr_reader :length
-      attr_reader :file
+      attr_accessor :is_clone
+      attr_reader   :length, :file
       def initialize(path)
         @file = File.open(path, "rb")
         @file.extend(MonitorMixin)
+        #class <<@file
+        #  attr_accessor :ref_count
+        #end
+        #@file.ref_count = 1
         @length = File.size(path)
         @is_clone = false
         super()
       end
       def close
+        #@file.ref_count -= 1
+        #@file.close if @file.ref_count == 0
         @file.close if not @is_clone
       end
       # We need to record if this is a clone so we know when to close the file.
       # The file should only be closed when the original FSIndexInput is closed.
-      def clone()
-        fsii = super
-        fsii.is_clone = true
-        fsii.file.seek(@file.pos)
-        return fsii
+      def initialize_copy(o)
+        super
+        @is_clone = true
       end
       private
@@ -290,24 +322,6 @@ module Ferret::Store
     end
     private
-      # Create a new directory from the path.
-      # path:: the path to the directory.
-      # create:: if true, create, or erase any existing contents.
-      def initialize(path, create)
-        super()
-        if create then FileUtils.mkdir_p(path) end
-        if not File.directory?(path) then
-          raise "There is no directory: #{path}. Use create = true to create one"
-        end
-        @dir = Dir.new(path)
-        # put the lock_dir here as well if no default exists.
-        if LOCK_DIR then
-          @lock_dir = Dir.new(LOCK_DIR)
-        else
-          @lock_dir = Dir.new(path)
-        end
-        @ref_count = 0
-      end
       # Add the directory path to the file name for opening
       def dir_path(name)