RubyGems - acts_as_indexed - Versions diffs - 0.6.2 - Mend

acts_as_indexed 0.6.2

Files changed (23) hide show

data/.gitignore +5 -0
data/CHANGELOG +90 -0
data/MIT-LICENSE +20 -0
data/README.rdoc +137 -0
data/Rakefile +50 -0
data/VERSION +1 -0
data/acts_as_indexed.gemspec +67 -0
data/lib/acts_as_indexed.rb +248 -0
data/lib/acts_as_indexed/configuration.rb +41 -0
data/lib/acts_as_indexed/search_atom.rb +104 -0
data/lib/acts_as_indexed/search_index.rb +325 -0
data/lib/will_paginate_search.rb +29 -0
data/rails/init.rb +2 -0
data/test/abstract_unit.rb +52 -0
data/test/acts_as_indexed_test.rb +133 -0
data/test/configuration_test.rb +57 -0
data/test/database.yml +10 -0
data/test/fixtures/post.rb +5 -0
data/test/fixtures/posts.yml +31 -0
data/test/schema.rb +6 -0
data/test/search_atom_test.rb +98 -0
data/test/search_index_test.rb +50 -0
metadata +94 -0

data/lib/acts_as_indexed/configuration.rb ADDED

@@ -0,0 +1,41 @@
+# ActsAsIndexed
+# Copyright (c) 2007 - 2010 Douglas F Shearer.
+# http://douglasfshearer.com
+# Distributed under the MIT license as included with this plugin.
+module ActsAsIndexed
+  # Used to set up and modify settings for acts_as_indexed.
+  class Configuration
+    # Sets the location for the index. Specify as an array. Heroku, for
+    # example would use RAILS_ROOT/tmp/index, which would be set as
+    # [Rails.root,'tmp','index]
+    attr_accessor :index_file
+    # Tuning value for the index partitioning. Larger values result in quicker
+    # searches, but slower indexing. Default is 3.
+    attr_reader :index_file_depth
+    # Sets the minimum length for a word in a query. Words shorter than this
+    # value are ignored in searches unless preceded by the '+' operator.
+    # Default is 3.
+    attr_reader :min_word_size
+    def initialize
+      @index_file = [Rails.root, 'index']
+      @index_file_depth = 3
+      @min_word_size = 3
+    end
+    def index_file_depth=(val)
+      raise(ArgumentError, 'index_file_depth cannot be less than one (1)') if val < 1
+      @index_file_depth = val
+    end
+    def min_word_size=(val)
+      raise(ArgumentError, 'min_word_size cannot be less than one (1)') if val < 1
+      @min_word_size = val
+    end
+  end
+end

data/lib/acts_as_indexed/search_atom.rb ADDED

@@ -0,0 +1,104 @@
+# ActsAsIndexed
+# Copyright (c) 2007 - 2010 Douglas F Shearer.
+# http://douglasfshearer.com
+# Distributed under the MIT license as included with this plugin.
+module ActsAsIndexed #:nodoc:
+  class SearchAtom
+    # Contains a hash of records.
+    # { 'record_id' => [pos1, pos2, pos] }
+    #--
+    # Weighting:
+    # http://www.perlmonks.com/index.pl?node_id=27509
+    # W(T, D) = tf(T, D) * log ( DN / df(T))
+    # weighting = frequency_in_this_record * log (total_number_of_records / number_of_matching_records)
+    def initialize
+      @records = {}
+    end
+    # Returns true if the given record is present.
+    def include_record?(record_id)
+      @records.include?(record_id)
+    end
+    # Adds +record_id+ to the stored records.
+    def add_record(record_id)
+      @records[record_id] = [] if !include_record?(record_id)
+    end
+    # Adds +pos+ to the array of positions for +record_id+.
+    def add_position(record_id, pos)
+      add_record(record_id)
+      @records[record_id] << pos
+    end
+    # Returns all record IDs stored in this Atom.
+    def record_ids
+      @records.keys
+    end
+    # Returns an array of positions for +record_id+ stored in this Atom.
+    def positions(record_id)
+      @records[record_id]
+    end
+    # Removes +record_id+ from this Atom.
+    def remove_record(record_id)
+      @records.delete(record_id)
+    end
+    # Returns at atom containing the records and positions of +self+ preceded by +former+
+    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
+    def preceded_by(former)
+      matches = SearchAtom.new
+      latter = {}
+      former.record_ids.each do |rid|
+        latter[rid] = @records[rid] if @records[rid]
+      end
+      # Iterate over each record in latter.
+      latter.each do |record_id,pos|
+        # Iterate over each position.
+        pos.each do |p|
+          # Check if previous position is in former.
+          if former.include_position?(record_id,p-1)
+            matches.add_record(record_id) if !matches.include_record?(record_id)
+            matches.add_position(record_id,p)
+          end
+        end
+      end
+      matches
+    end
+    # Returns a hash of record_ids and weightings for each record in the
+    # atom.
+    def weightings(records_size)
+      out = {}
+      @records.each do |r_id, pos|
+        # Fixes a bug when the records_size is zero. i.e. The only record
+        # contaning the word has been deleted.
+        if records_size < 1
+          out[r_id] = 0.0
+          next
+        end
+        # weighting = frequency * log (records.size / records_with_atom)
+        ## parndt 2010/05/03 changed to records_size.to_f to avoid -Infinity Errno::ERANGE exceptions
+        ## which would happen for example Math.log(1 / 20) == -Infinity but Math.log(1.0 / 20) == -2.99573227355399
+        out[r_id] = pos.size * Math.log(records_size.to_f / @records.size)
+      end
+      out
+    end
+    protected
+    def include_position?(record_id,pos)
+      @records[record_id].include?(pos)
+    end
+  end
+end

data/lib/acts_as_indexed/search_index.rb ADDED

@@ -0,0 +1,325 @@
+# ActsAsIndexed
+# Copyright (c) 2007 - 2010 Douglas F Shearer.
+# http://douglasfshearer.com
+# Distributed under the MIT license as included with this plugin.
+module ActsAsIndexed #:nodoc:
+  class SearchIndex
+    # root:: Location of index on filesystem.
+    # index_depth:: Degree of index partitioning.
+    # fields:: Fields or instance methods of ActiveRecord model to be indexed.
+    # min_word_size:: Smallest query term that will be run through search.
+    def initialize(root, index_depth, fields, min_word_size)
+      @root = root
+      @fields = fields
+      @index_depth = index_depth
+      @atoms = {}
+      @min_word_size = min_word_size
+      @records_size = exists? ? load_record_size : 0
+    end
+    # Adds +record+ to the index.
+    def add_record(record)
+      condensed_record = condense_record(record)
+      load_atoms(condensed_record)
+      add_occurences(condensed_record,record.id)
+      @records_size += 1
+    end
+    # Adds multiple records to the index. Accepts an array of +records+.
+    def add_records(records)
+      records.each do |record|
+        add_record(record)
+      end
+    end
+    # Removes +record+ from the index.
+    def remove_record(record)
+      atoms = condense_record(record)
+      load_atoms(atoms)
+      atoms.each do |a|
+        @atoms[a].remove_record(record.id) if @atoms.has_key?(a)
+        @records_size -= 1
+        #p "removing #{record.id} from #{a}"
+      end
+    end
+    def update_record(record_new, record_old)
+      # Work out which atoms have modifications.
+      # Minimises loading and saving of partitions.
+      old_atoms = condense_record(record_old)
+      new_atoms = condense_record(record_new)
+      # Remove the old version from the appropriate atoms.
+      load_atoms(old_atoms)
+      old_atoms.each do |a|
+        @atoms[a].remove_record(record_new.id) if @atoms.has_key?(a)
+      end
+      # Add the new version to the appropriate atoms.
+      load_atoms(new_atoms)
+      # TODO: Make a version of this method that takes the
+      # atomised version of the record.
+      add_occurences(new_atoms, record_new.id)
+    end
+    # Saves the current index partitions to the filesystem.
+    def save
+      prepare
+      atoms_sorted = {}
+      @atoms.each do |atom_name, records|
+        e_p = encoded_prefix(atom_name)
+        atoms_sorted[e_p] = {} if !atoms_sorted.has_key?(e_p)
+        atoms_sorted[e_p][atom_name] = records
+      end
+      atoms_sorted.each do |e_p, atoms|
+        #p "Saving #{e_p}."
+        File.open(File.join(@root + [e_p.to_s]),'w+') do |f|
+          Marshal.dump(atoms,f)
+        end
+      end
+      save_record_size
+    end
+    # Deletes the current model's index from the filesystem.
+    #--
+    # TODO: Write a public method that will delete all indexes.
+    def destroy
+      FileUtils.rm_rf(@root)
+      true
+    end
+    # Returns an array of IDs for records matching +query+.
+    def search(query)
+      return [] if query.nil?
+      load_atoms(cleanup_atoms(query))
+      queries = parse_query(query.dup)
+      positive = run_queries(queries[:positive])
+      positive_quoted = run_quoted_queries(queries[:positive_quoted])
+      negative = run_queries(queries[:negative])
+      negative_quoted = run_quoted_queries(queries[:negative_quoted])
+      if !queries[:positive].empty? && !queries[:positive_quoted].empty?
+        p = positive.delete_if{ |r_id,w| !positive_quoted.include?(r_id) }
+        pq = positive_quoted.delete_if{ |r_id,w| !positive.include?(r_id) }
+        results = p.merge(pq) { |r_id,old_val,new_val| old_val + new_val}
+      elsif !queries[:positive].empty?
+        results = positive
+      else
+        results = positive_quoted
+      end
+      negative_results = (negative.keys + negative_quoted.keys)
+      results.delete_if { |r_id, w| negative_results.include?(r_id) }
+      #p results
+      results
+    end
+    # Returns true if the index root exists on the FS.
+    #--
+    # TODO: Make a private method called 'root_exists?' which checks for the root directory.
+    def exists?
+      File.exists?(File.join(@root + ['size']))
+    end
+    private
+    # Gets the size file from the index.
+    def load_record_size
+      File.open(File.join(@root + ['size'])) do |f|
+        (Marshal.load(f))
+      end
+    end
+    # Saves the size to the size file.
+    def save_record_size
+      File.open(File.join(@root + ['size']),'w+') do |f|
+        Marshal.dump(@records_size,f)
+      end
+    end
+    # Returns true if the given atom is present.
+    def include_atom?(atom)
+      @atoms.has_key?(atom)
+    end
+    # Returns true if all the given atoms are present.
+    def include_atoms?(atoms_arr)
+      atoms_arr.each do |a|
+        return false if !include_atom?(a)
+      end
+      true
+    end
+    # Returns true if the given record is present.
+    def include_record?(record_id)
+      @atoms.each do |atomname, atom|
+        return true if atom.include_record?(record_id)
+      end
+    end
+    def add_atom(atom)
+      @atoms[atom] = SearchAtom.new if !include_atom?(atom)
+    end
+    def add_occurences(condensed_record,record_id)
+      condensed_record.each_with_index do |atom, i|
+        add_atom(atom)
+        @atoms[atom].add_position(record_id, i)
+        #p "adding #{record.id} to #{atom}"
+      end
+    end
+    def encoded_prefix(atom)
+      prefix = atom[0,@index_depth]
+      if !@prefix_cache || !@prefix_cache.has_key?(prefix)
+        @prefix_cache = {} if !@prefix_cache
+        len = atom.length
+        if len > 1
+          @prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_')
+        else
+          @prefix_cache[prefix] = encode_character(atom)
+        end
+      end
+      @prefix_cache[prefix]
+    end
+    # Allows compatibility with 1.8.6 which has no ord method.
+    def encode_character(char)
+      if @@has_ord ||= char.respond_to?(:ord)
+        char.ord.to_s
+      else
+        char[0]
+      end
+    end
+    def parse_query(s)
+      # Find -"foo bar".
+      negative_quoted = []
+      while neg_quoted = s.slice!(/-\"[^\"]*\"/)
+        negative_quoted << cleanup_atoms(neg_quoted)
+      end
+      # Find "foo bar".
+      positive_quoted = []
+      while pos_quoted = s.slice!(/\"[^\"]*\"/)
+        positive_quoted << cleanup_atoms(pos_quoted)
+      end
+      # Find -foo.
+      negative = []
+      while neg = s.slice!(/-[\S]*/)
+        negative << cleanup_atoms(neg).first
+      end
+      # Find +foo
+      positive = []
+      while pos = s.slice!(/\+[\S]*/)
+        positive << cleanup_atoms(pos).first
+      end
+      # Find all other terms.
+      positive += cleanup_atoms(s,true)
+      {:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive}
+    end
+    def run_queries(atoms)
+      results = {}
+      atoms.uniq.each do |atom|
+        interim_results = {}
+        if include_atom?(atom)
+          # Collect all the weightings for the current atom.
+          interim_results = @atoms[atom].weightings(@records_size)
+        end
+        if results.empty?
+          # If first time round, set results with initial weightings.
+          results = interim_results
+        else
+          # If second time round, add weightings together for records
+          # matching both atoms. Any matching only one are discarded.
+          rr = {}
+          interim_results.each do |r,w|
+            rr[r] = w + results[r] if results[r]
+          end
+          results = rr
+        end
+      end
+      #p results
+      results
+    end
+    def run_quoted_queries(quoted_atoms)
+      results = {}
+      quoted_atoms.each do |quoted_atom|
+        interim_results = {}
+        # Check the index contains all the required atoms.
+        # match_atom = first_word_atom
+        # for each of the others
+        #   return atom containing records + positions where current atom is preceded by following atom.
+        # end
+        # return records from final atom.
+        next if !include_atoms?(quoted_atom)
+        matches = @atoms[quoted_atom.first]
+        quoted_atom[1..-1].each do |atom_name|
+          matches = @atoms[atom_name].preceded_by(matches)
+        end
+        #results += matches.record_ids
+        interim_results = matches.weightings(@records_size)
+        if results.empty?
+          results = interim_results
+        else
+          rr = {}
+          interim_results.each do |r,w|
+            rr[r] = w + results[r] if results[r]
+          end
+          #p results.class
+          results = rr
+        end
+      end
+      results
+    end
+    def load_atoms(atoms)
+      # Remove duplicate atoms.
+      # Remove atoms already in index.
+      # Calculate prefixes.
+      # Remove duplicate prefixes.
+      atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name|
+        if File.exists?(File.join(@root + [name.to_s]))
+          File.open(File.join(@root + [name.to_s])) do |f|
+            @atoms.merge!(Marshal.load(f))
+          end
+        end
+      end
+    end
+    def prepare
+      # Makes the RAILS_ROOT/index directory
+      Dir.mkdir(File.join(@root[0,2])) if !File.exists?(File.join(@root[0,2]))
+      # Makes the RAILS_ROOT/index/ENVIRONMENT directory
+      Dir.mkdir(File.join(@root[0,3])) if !File.exists?(File.join(@root[0,3]))
+      # Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directory
+      Dir.mkdir(File.join(@root)) if !File.exists?(File.join(@root))
+    end
+    def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3)
+      atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split
+      return atoms if !limit_size
+      atoms.reject{|w| w.size < min_size}
+    end
+    def condense_record(record)
+      record_condensed = ''
+      @fields.each do |f|
+        record_condensed += ' ' + record.send(f).to_s if record.send(f)
+      end
+      cleanup_atoms(record_condensed)
+    end
+  end
+end