RubyGems - sdsykes_acts_as_ferret - Versions diffs - 0.4.3.1 - Mend

sdsykes_acts_as_ferret 0.4.3.1

Files changed (32) hide show

data/LICENSE +20 -0
data/README +51 -0
data/bin/aaf_install +25 -0
data/config/ferret_server.yml +23 -0
data/doc/README.win32 +23 -0
data/doc/monit-example +22 -0
data/init.rb +22 -0
data/install.rb +18 -0
data/lib/act_methods.rb +254 -0
data/lib/acts_as_ferret.rb +151 -0
data/lib/bulk_indexer.rb +35 -0
data/lib/class_methods.rb +459 -0
data/lib/ferret_cap_tasks.rb +21 -0
data/lib/ferret_extensions.rb +115 -0
data/lib/ferret_result.rb +36 -0
data/lib/ferret_server.rb +203 -0
data/lib/index.rb +31 -0
data/lib/instance_methods.rb +156 -0
data/lib/local_index.rb +211 -0
data/lib/more_like_this.rb +217 -0
data/lib/multi_index.rb +83 -0
data/lib/remote_index.rb +50 -0
data/lib/search_results.rb +53 -0
data/lib/server_manager.rb +46 -0
data/lib/shared_index.rb +14 -0
data/lib/shared_index_class_methods.rb +90 -0
data/lib/unix_daemon.rb +63 -0
data/script/ferret_daemon +94 -0
data/script/ferret_server +10 -0
data/script/ferret_service +178 -0
data/sdsykes_acts_as_ferret.gemspec +21 -0
metadata +99 -0

@@ -0,0 +1,211 @@
+module ActsAsFerret
+  class LocalIndex < AbstractIndex
+    include MoreLikeThis::IndexMethods
+    def initialize(aaf_configuration)
+      super
+      ensure_index_exists
+    end
+    def reopen!
+      if @ferret_index
+        @ferret_index.close
+        @ferret_index = nil
+      end
+      logger.debug "reopening index at #{aaf_configuration[:ferret][:path]}"
+      ferret_index
+    end
+    # The 'real' Ferret Index instance
+    def ferret_index
+      ensure_index_exists
+      (@ferret_index ||= Ferret::Index::Index.new(aaf_configuration[:ferret])).tap do
+        @ferret_index.batch_size = aaf_configuration[:reindex_batch_size]
+        @ferret_index.logger = logger
+      end
+    end
+    # Checks for the presence of a segments file in the index directory
+    # Rebuilds the index if none exists.
+    def ensure_index_exists
+      logger.debug "LocalIndex: ensure_index_exists at #{aaf_configuration[:index_dir]}"
+      unless File.file? "#{aaf_configuration[:index_dir]}/segments"
+        ActsAsFerret::ensure_directory(aaf_configuration[:index_dir])
+        close
+        rebuild_index
+      end
+    end
+    # Closes the underlying index instance
+    def close
+      @ferret_index.close if @ferret_index
+    rescue StandardError
+      # is raised when index already closed
+    ensure
+      @ferret_index = nil
+    end
+    # rebuilds the index from all records of the model class this index belongs
+    # to. Arguments can be given in shared index scenarios to name multiple
+    # model classes to include in the index
+    def rebuild_index(*models)
+      models << aaf_configuration[:class_name] unless models.include?(aaf_configuration[:class_name])
+      models = models.flatten.uniq.map(&:constantize)
+      logger.debug "rebuild index: #{models.inspect}"
+      index = Ferret::Index::Index.new(aaf_configuration[:ferret].dup.update(:auto_flush => false,
+                                                                             :field_infos => ActsAsFerret::field_infos(models),
+                                                                             :create => true))
+      index.batch_size = aaf_configuration[:reindex_batch_size]
+      index.logger = logger
+      index.index_models models
+    end
+    def bulk_index(ids, options)
+      ferret_index.bulk_index(aaf_configuration[:class_name].constantize, ids, options)
+    end
+    # Parses the given query string into a Ferret Query object.
+    def process_query(query)
+      # work around ferret bug in #process_query (doesn't ensure the
+      # reader is open)
+      ferret_index.synchronize do
+        ferret_index.send(:ensure_reader_open)
+        original_query = ferret_index.process_query(query)
+      end
+    end
+    # Total number of hits for the given query.
+    # To count the results of a multi_search query, specify an array of
+    # class names with the :multi option.
+    def total_hits(query, options = {})
+      index = (models = options.delete(:multi)) ? multi_index(models) : ferret_index
+      index.search(query, options).total_hits
+    end
+    def determine_lazy_fields(options = {})
+      stored_fields = options[:lazy]
+      if stored_fields && !(Array === stored_fields)
+        stored_fields = aaf_configuration[:ferret_fields].select { |field, config| config[:store] == :yes }.map(&:first)
+      end
+      logger.debug "stored_fields: #{stored_fields}"
+      return stored_fields
+    end
+    # Queries the Ferret index to retrieve model class, id, score and the
+    # values of any fields stored in the index for each hit.
+    # If a block is given, these are yielded and the number of total hits is
+    # returned. Otherwise [total_hits, result_array] is returned.
+    def find_id_by_contents(query, options = {})
+      result = []
+      index = ferret_index
+      logger.debug "query: #{ferret_index.process_query query}" if logger.debug?
+      lazy_fields = determine_lazy_fields options
+      total_hits = index.search_each(query, options) do |hit, score|
+        doc = index[hit]
+        model = aaf_configuration[:store_class_name] ? doc[:class_name] : aaf_configuration[:class_name]
+        # fetch stored fields if lazy loading
+        data = {}
+        lazy_fields.each { |field| data[field] = doc[field] } if lazy_fields
+        if block_given?
+          yield model, doc[:id], score, data
+        else
+          result << { :model => model, :id => doc[:id], :score => score, :data => data }
+        end
+      end
+      #logger.debug "id_score_model array: #{result.inspect}"
+      return block_given? ? total_hits : [total_hits, result]
+    end
+    # Queries multiple Ferret indexes to retrieve model class, id and score for
+    # each hit. Use the models parameter to give the list of models to search.
+    # If a block is given, model, id and score are yielded and the number of
+    # total hits is returned. Otherwise [total_hits, result_array] is returned.
+    def id_multi_search(query, models, options = {})
+      index = multi_index(models)
+      result = []
+      lazy_fields = determine_lazy_fields options
+      total_hits = index.search_each(query, options) do |hit, score|
+        doc = index[hit]
+        # fetch stored fields if lazy loading
+        data = {}
+        lazy_fields.each { |field| data[field] = doc[field] } if lazy_fields
+        raise "':store_class_name => true' required for multi_search to work" if doc[:class_name].blank?
+        if block_given?
+          yield doc[:class_name], doc[:id], score, doc, data
+        else
+          result << { :model => doc[:class_name], :id => doc[:id], :score => score, :data => data }
+        end
+      end
+      return block_given? ? total_hits : [ total_hits, result ]
+    end
+    ######################################
+    # methods working on a single record
+    # called from instance_methods, here to simplify interfacing with the
+    # remote ferret server
+    # TODO having to pass id and class_name around like this isn't nice
+    ######################################
+    # add record to index
+    # record may be the full AR object, a Ferret document instance or a Hash
+    def add(record)
+      record = record.to_doc unless Hash === record || Ferret::Document === record
+      ferret_index << record
+    end
+    alias << add
+    # delete record from index
+    def remove(id, class_name)
+      ferret_index.query_delete query_for_record(id, class_name)
+    end
+    # highlight search terms for the record with the given id.
+    def highlight(id, class_name, query, options = {})
+      options.reverse_merge! :num_excerpts => 2, :pre_tag => '<em>', :post_tag => '</em>'
+      highlights = []
+      ferret_index.synchronize do
+        doc_num = document_number(id, class_name)
+        if options[:field]
+          highlights << ferret_index.highlight(query, doc_num, options)
+        else
+          query = process_query(query) # process only once
+          aaf_configuration[:ferret_fields].each_pair do |field, config|
+            next if config[:store] == :no || config[:highlight] == :no
+            options[:field] = field
+            highlights << ferret_index.highlight(query, doc_num, options)
+          end
+        end
+      end
+      return highlights.compact.flatten[0..options[:num_excerpts]-1]
+    end
+    # retrieves the ferret document number of the record with the given id.
+    def document_number(id, class_name)
+      hits = ferret_index.search(query_for_record(id, class_name))
+      return hits.hits.first.doc if hits.total_hits == 1
+      raise "cannot determine document number from primary key: #{id}"
+    end
+    # build a ferret query matching only the record with the given id
+    # the class name only needs to be given in case of a shared index configuration
+    def query_for_record(id, class_name = nil)
+      Ferret::Search::TermQuery.new(:id, id.to_s)
+    end
+    protected
+    # returns a MultiIndex instance operating on a MultiReader
+    def multi_index(model_classes)
+      model_classes.map!(&:constantize) if String === model_classes.first
+      model_classes.sort! { |a, b| a.name <=> b.name }
+      key = model_classes.inject("") { |s, clazz| s + clazz.name }
+      multi_config = aaf_configuration[:ferret].dup
+      multi_config.delete :default_field  # we don't want the default field list of *this* class for multi_searching
+      ActsAsFerret::multi_indexes[key] ||= MultiIndex.new(model_classes, multi_config)
+    end
+  end
+end

data/lib/more_like_this.rb ADDED

@@ -0,0 +1,217 @@
+module ActsAsFerret #:nodoc:
+    module MoreLikeThis
+      module InstanceMethods
+        # returns other instances of this class, which have similar contents
+        # like this one. Basically works like this: find out n most interesting
+        # (i.e. characteristic) terms from this document, and then build a
+        # query from those which is run against the whole index. Which terms
+        # are interesting is decided on various criteria which can be
+        # influenced by the given options.
+        #
+        # The algorithm used here is a quite straight port of the MoreLikeThis class
+        # from Apache Lucene.
+        #
+        # options are:
+        # :field_names : Array of field names to use for similarity search (mandatory)
+        # :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
+        # :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
+        # :min_word_length => 0, # Ignore words shorter than this length (longer words tend to
+        #                          be more characteristic for the document they occur in).
+        #                          0 (the default) does not ignore any words.
+        # :max_word_length => 0, # Ignore words if greater than this len. 0 (the default) does not ignore any words.
+        # :max_query_terms => 25,  # maximum number of terms in the query built
+        # :max_num_tokens => 5000, # maximum number of tokens to examine in a single field
+        # :boost => false,         # when true, a boost according to the relative score of
+        #                            a term is applied to this Term's TermQuery.
+        # :similarity => 'DefaultAAFSimilarity'   # the similarity implementation to use (the default
+        #                                           equals Ferret's internal similarity implementation)
+        # :analyzer => 'Ferret::Analysis::StandardAnalyzer' # class name of the analyzer to use
+        # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
+        # ferret_options : Ferret options handed over to find_by_contents (i.e. for limits and sorting)
+        # ar_options : options handed over to find_by_contents for AR scoping
+        def more_like_this(options = {}, ferret_options = {}, ar_options = {})
+          options = {
+            :field_names => nil,  # Default field names
+            :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
+            :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
+            :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
+            :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
+            :max_query_terms => 25,  # maximum number of terms in the query built
+            :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
+            :boost => false,
+            :similarity => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity',  # class name of the similarity implementation to use
+            :analyzer => 'Ferret::Analysis::StandardAnalyzer', # class name of the analyzer to use
+            :append_to_query => nil,
+            :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
+          }.update(options)
+          #index.search_each('id:*') do |doc, score|
+          #  puts "#{doc} == #{index[doc][:description]}"
+          #end
+          clazz = options[:base_class]
+          options[:base_class] = clazz.name
+          query = clazz.aaf_index.build_more_like_this_query(self.id, self.class.name, options)
+          options[:append_to_query].call(query) if options[:append_to_query]
+          clazz.find_by_contents(query, ferret_options, ar_options)
+        end
+      end
+      module IndexMethods
+        # TODO to allow morelikethis for unsaved records, we have to give the
+        # unsaved record's data to this method. check how this will work out
+        # via drb...
+        def build_more_like_this_query(id, class_name, options)
+          [:similarity, :analyzer].each { |sym| options[sym] = options[sym].constantize.new }
+          ferret_index.synchronize do # avoid that concurrent writes close our reader
+            ferret_index.send(:ensure_reader_open)
+            reader = ferret_index.send(:reader)
+            term_freq_map = retrieve_terms(id, class_name, reader, options)
+            priority_queue = create_queue(term_freq_map, reader, options)
+            create_query(id, class_name, priority_queue, options)
+          end
+        end
+        protected
+        def create_query(id, class_name, priority_queue, options={})
+          query = Ferret::Search::BooleanQuery.new
+          qterms = 0
+          best_score = nil
+          while(cur = priority_queue.pop)
+            term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
+            if options[:boost]
+              # boost term according to relative score
+              # TODO untested
+              best_score ||= cur.score
+              term_query.boost = cur.score / best_score
+            end
+            begin
+              query.add_query(term_query, :should)
+            rescue Ferret::Search::BooleanQuery::TooManyClauses
+              break
+            end
+            qterms += 1
+            break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
+          end
+          # exclude the original record
+          query.add_query(query_for_record(id, class_name), :must_not)
+          return query
+        end
+        # creates a term/term_frequency map for terms from the fields
+        # given in options[:field_names]
+        def retrieve_terms(id, class_name, reader, options)
+          raise "more_like_this atm only works on saved records" if id.nil?
+          document_number = document_number(id, class_name) rescue nil
+          field_names = options[:field_names]
+          max_num_tokens = options[:max_num_tokens]
+          term_freq_map = Hash.new(0)
+          doc = nil
+          record = nil
+          field_names.each do |field|
+            #puts "field: #{field}"
+            term_freq_vector = reader.term_vector(document_number, field) if document_number
+            #if false
+            if term_freq_vector
+              # use stored term vector
+              # puts 'using stored term vector'
+              term_freq_vector.terms.each do |term|
+                term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
+              end
+            else
+              # puts 'no stored term vector'
+              # no term vector stored, but we have stored the contents in the index
+              # -> extract terms from there
+              content = nil
+              if document_number
+                doc = reader[document_number]
+                content = doc[field]
+              end
+              unless content
+                # no term vector, no stored content, so try content from this instance
+                record ||= options[:base_class].constantize.find(id)
+                content = record.content_for_field_name(field.to_s)
+              end
+              puts "have doc: #{doc[:id]} with #{field} == #{content}"
+              token_count = 0
+              ts = options[:analyzer].token_stream(field, content)
+              while token = ts.next
+                break if (token_count+=1) > max_num_tokens
+                next if noise_word?(token.text, options)
+                term_freq_map[token.text] += 1
+              end
+            end
+          end
+          term_freq_map
+        end
+        # create an ordered(by score) list of word,fieldname,score
+        # structures
+        def create_queue(term_freq_map, reader, options)
+          pq = Array.new(term_freq_map.size)
+          similarity = options[:similarity]
+          num_docs = reader.num_docs
+          term_freq_map.each_pair do |word, tf|
+            # filter out words that don't occur enough times in the source
+            next if options[:min_term_freq] && tf < options[:min_term_freq]
+            # go through all the fields and find the largest document frequency
+            top_field = options[:field_names].first
+            doc_freq = 0
+            options[:field_names].each do |field_name|
+              freq = reader.doc_freq(field_name, word)
+              if freq > doc_freq
+                top_field = field_name
+                doc_freq = freq
+              end
+            end
+            # filter out words that don't occur in enough docs
+            next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
+            next if doc_freq == 0 # index update problem ?
+            idf = similarity.idf(doc_freq, num_docs)
+            score = tf * idf
+            pq << FrequencyQueueItem.new(word, top_field, score)
+          end
+          pq.compact!
+          pq.sort! { |a,b| a.score<=>b.score }
+          return pq
+        end
+        def noise_word?(text, options)
+          len = text.length
+          (
+            (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
+            (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
+            (options[:stop_words] && options.include?(text))
+          )
+        end
+      end
+      class DefaultAAFSimilarity
+        def idf(doc_freq, num_docs)
+          return 0.0 if num_docs == 0
+          return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
+        end
+      end
+      class FrequencyQueueItem
+        attr_reader :word, :field, :score
+        def initialize(word, field, score)
+          @word = word; @field = field; @score = score
+        end
+      end
+    end
+end

data/lib/multi_index.rb ADDED

@@ -0,0 +1,83 @@
+module ActsAsFerret #:nodoc:
+      # this class is not threadsafe
+      class MultiIndex
+        # model_classes - the model classes whose indexes are searched together
+        # options       - query parser options; :default_field defaults to the
+        #                 combined default fields of all given classes
+        def initialize(model_classes, options = {})
+          @model_classes = model_classes
+          # ensure all models indexes exist
+          @model_classes.each { |m| m.aaf_index.ensure_index_exists }
+          default_fields = @model_classes.inject([]) do |fields, c|
+            fields + [ c.aaf_configuration[:ferret][:default_field] ].flatten
+          end
+          @options = {
+            :default_field => default_fields
+          }.update(options)
+        end
+        # Runs the query (string or query object) against all indexes.
+        def search(query, options={})
+          #puts "querystring: #{query.to_s}"
+          query = process_query(query)
+          #puts "parsed query: #{query.to_s}"
+          searcher.search(query, options)
+        end
+        # Like #search, but hands the given block on to the searcher's
+        # search_each.
+        def search_each(query, options = {}, &block)
+          query = process_query(query)
+          searcher.search_each(query, options, &block)
+        end
+        # checks if all our sub-searchers still are up to date
+        def latest?
+          return false unless @reader
+          # segfaults with 0.10.4 --> TODO report as bug @reader.latest?
+          @sub_readers.each do |r|
+            return false unless r.latest?
+          end
+          true
+        end
+        # Returns the searcher, (re)opening the readers first if necessary.
+        def searcher
+          ensure_searcher
+          @searcher
+        end
+        # Returns the document with the given number.
+        def doc(i)
+          searcher[i]
+        end
+        alias :[] :doc
+        # Query parser built lazily from the options given to the constructor.
+        def query_parser
+          @query_parser ||= Ferret::QueryParser.new(@options)
+        end
+        # Parses query strings; anything else is returned untouched.
+        def process_query(query)
+          query = query_parser.parse(query) if query.is_a?(String)
+          return query
+        end
+        # Closes searcher and reader and forgets about them, so that the next
+        # search opens fresh ones instead of working on closed handles.
+        def close
+          @searcher.close if @searcher
+          @reader.close if @reader
+        ensure
+          @searcher = nil
+          @reader = nil
+        end
+        protected
+          # Opens one reader per model class and a searcher on top of them,
+          # unless the current ones are still up to date.
+          def ensure_searcher
+            unless latest?
+              @sub_readers = @model_classes.map { |clazz|
+                begin
+                  Ferret::Index::IndexReader.new(clazz.aaf_configuration[:index_dir])
+                rescue StandardError => e
+                  # only StandardError is wrapped - signals, exit requests and
+                  # the like must not be turned into a RuntimeError
+                  raise "error opening #{clazz.aaf_configuration[:index_dir]}: #{e}"
+                end
+              }
+              close
+              @reader = Ferret::Index::IndexReader.new(@sub_readers)
+              @searcher = Ferret::Search::Searcher.new(@reader)
+            end
+          end
+      end # of class MultiIndex
+end