acts_as_ferret 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,203 @@
1
+ module FerretMixin
2
+ module Acts #:nodoc:
3
+ module ARFerret #:nodoc:
4
+
5
+ module MoreLikeThis
6
+
7
# Minimal stand-in for Ferret's Similarity implementation, providing only
# the inverse document frequency (idf) computation needed by the
# more-like-this term scoring below.
class DefaultAAFSimilarity
  # Inverse document frequency of a term.
  #
  # doc_freq - number of documents the term occurs in
  # num_docs - total number of documents in the index
  #
  # Returns 0.0 for an empty index, otherwise log(N / (df + 1)) + 1.
  def idf(doc_freq, num_docs)
    return 0.0 if num_docs.zero?
    Math.log(num_docs.to_f / (doc_freq + 1)) + 1.0
  end
end
13
+
14
# Returns other instances of this class whose contents are similar to
# this one's. Works by extracting the n most characteristic terms from
# this document and building a query from them which is run against the
# whole index. A fairly straight port of Apache Lucene's MoreLikeThis.
#
# options:
#   :field_names     => nil,   # Array of field names to use (mandatory)
#   :min_term_freq   => 2,     # ignore terms rarer than this in the source doc
#   :min_doc_freq    => 5,     # ignore words occurring in fewer docs than this
#   :min_word_length => 0,     # ignore words shorter than this (0 = no limit)
#   :max_word_length => 0,     # ignore words longer than this (0 = no limit)
#   :max_query_terms => 25,    # maximum number of terms in the query built
#   :max_num_tokens  => 5000,  # maximum tokens to analyze per field
#   :boost           => false, # boost each TermQuery by its relative score
#   :similarity      => DefaultAAFSimilarity.new, # idf implementation
#   :analyzer        => Ferret::Analysis::StandardAnalyzer.new,
#   :append_to_query => nil,   # proc given the query for further tweaking,
#                              # e.g. to constrain STI searches to one class
#   :base_class      => self.class # class whose find_by_contents is used
#
# find_options are handed through to find_by_contents.
def more_like_this(options = {}, find_options = {})
  options = {
    :field_names => nil,
    :min_term_freq => 2,
    :min_doc_freq => 5,
    :min_word_length => 0,
    :max_word_length => 0,
    :max_query_terms => 25,
    :max_num_tokens => 5000,
    :boost => false,
    :similarity => DefaultAAFSimilarity.new,
    :analyzer => Ferret::Analysis::StandardAnalyzer.new,
    :append_to_query => nil,
    :base_class => self.class
  }.update(options)
  index = self.class.ferret_index
  index.synchronize do # avoid concurrent writes closing our reader
    index.send(:ensure_reader_open)
    reader = index.send(:reader)
    doc_number = self.document_number
    # BUGFIX: pass the local we just computed instead of calling the
    # document_number method a second time (the local was unused before).
    term_freq_map = retrieve_terms(doc_number, reader, options)
    priority_queue = create_queue(term_freq_map, reader, options)
    query = create_query(priority_queue, options)
    logger.debug "morelikethis-query: #{query}"
    options[:append_to_query].call(query) if options[:append_to_query]
    options[:base_class].find_by_contents(query, find_options)
  end
end
73
+
74
+
75
# Builds the boolean Ferret query used to find similar documents from
# the score-ordered list produced by create_queue. Stops adding terms
# once :max_query_terms is reached or the query refuses more clauses,
# and always excludes this very document via its :id field.
def create_query(priority_queue, options = {})
  query = Ferret::Search::BooleanQuery.new
  term_count = 0
  top_score = nil
  while (item = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(item.field, item.word)
    if options[:boost]
      # boost each term relative to the best-scoring one
      # TODO untested
      top_score ||= item.score
      term_query.boost = item.score / top_score
    end
    begin
      query.add_query(term_query, :should)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end
    term_count += 1
    break if options[:max_query_terms] > 0 && term_count >= options[:max_query_terms]
  end
  # exclude ourselves
  query.add_query(Ferret::Search::TermQuery.new(:id, self.id.to_s), :must_not)
  query
end
100
+
101
+
102
+
103
# Creates a term => frequency map for terms from the fields given in
# options[:field_names] of the document identified by doc_number.
# Prefers a stored term vector; when none is available, re-analyzes the
# stored field contents (falling back to this instance's attribute value
# when nothing was stored either).
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  doc = nil
  field_names.each do |field|
    # BUGFIX: use the doc_number parameter instead of calling the
    # document_number method again for every field.
    term_freq_vector = reader.term_vector(doc_number, field)
    if term_freq_vector
      # a term vector is stored for this field -- use it directly
      term_freq_vector.terms.each do |term|
        term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
      end
    else
      # no term vector stored, but we may have stored the contents in the
      # index -> extract terms from there (fetch the doc only once)
      doc ||= reader[doc_number]
      content = doc[field]
      # no term vector, no stored content: try content from this instance
      content ||= content_for_field_name(field.to_s)
      token_count = 0
      ts = options[:analyzer].token_stream(field, content)
      while token = ts.next
        break if (token_count += 1) > max_num_tokens
        next if noise_word?(token.text, options)
        term_freq_map[token.text] += 1
      end
    end
  end
  term_freq_map
end
143
+
144
# Creates a list of (word, field, score) items for the given
# term => frequency map, ordered by ascending score. For each term the
# field with the largest document frequency is recorded.
#
# Fix: the original pre-allocated the array with Array.new(size), which
# filled it with nils that << appended after and compact! then removed;
# a plain empty array does the same job without the nil shuffle.
def create_queue(term_freq_map, reader, options)
  queue = []
  similarity = options[:similarity]
  num_docs = reader.num_docs
  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source doc
    next if options[:min_term_freq] && tf < options[:min_term_freq]

    # go through all the fields and find the largest document frequency
    top_field = options[:field_names].first
    doc_freq = 0
    options[:field_names].each do |field_name|
      freq = reader.doc_freq(field_name, word)
      if freq > doc_freq
        top_field = field_name
        doc_freq = freq
      end
    end
    # filter out words that don't occur in enough docs
    next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
    next if doc_freq == 0 # index update problem ?

    score = tf * similarity.idf(doc_freq, num_docs)
    queue << FrequencyQueueItem.new(word, top_field, score)
  end
  queue.sort! { |a, b| a.score <=> b.score }
  queue
end
177
+
178
# Decides whether +text+ should be ignored when collecting interesting
# terms: too short, too long (0 disables either limit), or listed in the
# optional :stop_words array.
def noise_word?(text, options)
  len = text.length
  return true if options[:min_word_length] > 0 && len < options[:min_word_length]
  return true if options[:max_word_length] > 0 && len > options[:max_word_length]
  # BUGFIX: consult the stop word list itself -- the original called
  # options.include?(text), which checked the options *hash keys* and
  # therefore never matched any stop word.
  return true if options[:stop_words] && options[:stop_words].include?(text)
  false
end
186
+
187
# Fetches the content for the named field from this model instance,
# trying the attribute hash (self[field]) first, then an instance
# variable of the same name, and finally a reader method.
def content_for_field_name(field)
  value = self[field]
  value ||= instance_variable_get("@#{field}".to_sym)
  value || send(field.to_sym)
end
190
+
191
+ end
192
+
193
# Simple value object holding a candidate term together with the field
# it was found in and its relevance score.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word  = word
    @field = field
    @score = score
  end
end
199
+
200
+ end
201
+ end
202
+ end
203
+
@@ -0,0 +1,87 @@
1
+ module FerretMixin
2
+ module Acts #:nodoc:
3
+ module ARFerret #:nodoc:
4
+ # not threadsafe
5
+ class MultiIndex
6
+
7
# todo: check for necessary index rebuilds in this place, too
# idea - each class gets a create_reader method that does this
#
# Creates a multi-index spanning the Ferret indexes of the given model
# classes. The default search fields of all models are combined into a
# single :default_field list (overridable via options).
def initialize(model_classes, options = {})
  @model_classes = model_classes
  # Array() copes with models configuring either a single default field
  # or an array of them (the bare + raised a TypeError for scalars).
  default_fields = @model_classes.inject([]) do |fields, clazz|
    fields + Array(clazz.ferret_configuration[:default_field])
  end
  @options = {
    :default_field => default_fields
  }.update(options)
end
18
+
19
# Runs +query+ (a String or a Ferret query object) against the combined
# index and returns the raw Ferret search result.
def search(query, options = {})
  parsed = process_query(query)
  searcher.search(parsed, options)
end
25
+
26
# Like #search, but yields each hit to the given block instead of
# returning a result object.
def search_each(query, options = {}, &block)
  searcher.search_each(process_query(query), options, &block)
end
30
+
31
# Checks whether all our sub-readers are still up to date with their
# underlying indexes. Always false when no reader has been opened yet.
def latest?
  return false unless @reader
  # segfaults with 0.10.4 --> TODO report as bug @reader.latest?
  @sub_readers.all? { |sub_reader| sub_reader.latest? }
end
40
+
41
# Returns the Ferret searcher over the combined index, (re)opening the
# underlying readers first if any of them has gone stale.
def searcher
  ensure_searcher
  @searcher
end
45
+
46
# Retrieves the stored document with index +i+ from the combined index.
# Also available as #[].
def doc(i)
  searcher[i]
end
alias :[] :doc
50
+
51
# Lazily builds the query parser configured with this index's options
# (notably the combined :default_field list of all model classes).
def query_parser
  @query_parser = Ferret::QueryParser.new(@options) if @query_parser.nil?
  @query_parser
end
54
+
55
# Turns a query string into a Ferret query object via the query parser;
# anything that is not a String is passed through unchanged.
def process_query(query)
  query.is_a?(String) ? query_parser.parse(query) : query
end
59
+
60
# Closes the searcher and reader if they are open.
#
# Fix: the handles are cleared after closing so a repeated #close (or a
# later #latest? / refresh in ensure_searcher) does not operate on
# already-closed Ferret objects.
def close
  @searcher.close if @searcher
  @searcher = nil
  @reader.close if @reader
  @reader = nil
end
64
+
65
+ protected
66
+
67
# (Re)opens the multi reader and searcher unless all current
# sub-readers are still up to date.
#
# Fixes: rescue StandardError instead of Exception (the latter also
# swallowed SignalException/SystemExit), and drop readers that failed
# to open instead of passing nil on to IndexReader.new.
def ensure_searcher
  return if latest?
  @sub_readers = @model_classes.map { |clazz|
    begin
      Ferret::Index::IndexReader.new(clazz.class_index_dir)
    rescue StandardError
      puts "error opening #{clazz.class_index_dir}: #{$!}"
      nil
    end
  }.compact
  close
  @reader = Ferret::Index::IndexReader.new(@sub_readers)
  @searcher = Ferret::Search::Searcher.new(@reader)
end
82
+
83
+ end # of class MultiIndex
84
+
85
+ end
86
+ end
87
+ end
data/rakefile ADDED
@@ -0,0 +1,191 @@
1
# rakefile for acts_as_ferret.
# use to create a gem or generate rdoc api documentation.
#
# heavily based on the one from the acts_as_searchable plugin.

require 'rake'
require 'rake/rdoctask'
require 'rake/packagetask'
require 'rake/gempackagetask'
require 'rake/testtask'
require 'rake/contrib/rubyforgepublisher'

# Gem/package identity used by the packaging and upload tasks below.
PKG_NAME = 'acts_as_ferret'
PKG_VERSION = '0.3.1'
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
# RubyForge coordinates used by the :pdoc and :rubyforge_upload tasks.
RUBY_FORGE_PROJECT = 'actsasferret'
RUBY_FORGE_USER = 'jkraemer'

desc 'Default: run unit tests.'
task :default => :test

# Runs all *_test.rb files under test/ with lib/ on the load path.
# NOTE(review): the description says acts_as_searchable -- apparently
# left over from the plugin this rakefile is based on.
desc 'Test the acts_as_searchable plugin.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end
28
+
29
# Generates the RDoc API documentation into html/, with README as the
# front page. An alternative rdoc template can be selected by setting
# the 'template' environment variable.
desc 'Generate documentation for the acts_as_ferret plugin.'
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'html'
  rdoc.title = "acts_as_ferret - Ferret based full text search for any ActiveRecord model"
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.options << '--main' << 'README'
  rdoc.rdoc_files.include('README', 'LICENSE')
  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  rdoc.rdoc_files.include('lib/**/*.rb')
end
39
+
40
# Gem specification used by the package tasks below.
spec = Gem::Specification.new do |s|
  s.name = PKG_NAME
  s.version = PKG_VERSION
  s.platform = Gem::Platform::RUBY
  s.summary = "acts_as_ferret - Ferret based full text search for any ActiveRecord model"
  # Package every file except build artifacts, logs, pkg/, svn metadata,
  # backups and editor droppings.
  # NOTE(review): this glob still picks up vim swap files (the shipped
  # gem metadata lists .rakefile.swp etc.) -- consider adding /\.sw.$/
  # to the reject list.
  s.files = Dir.glob('**/*', File::FNM_DOTMATCH).reject do |f|
    [ /\.$/, /sqlite$/, /\.log$/, /^pkg/, /\.svn/,
      /\~$/, /\/\._/, /\/#/ ].any? {|regex| f =~ regex }
  end
  #s.files = FileList["{lib,test}/**/*"].to_a + %w(README MIT-LICENSE CHANGELOG)
  # s.files.delete ...
  s.require_path = 'lib'
  s.autorequire = 'acts_as_ferret'
  s.has_rdoc = true
  # s.test_files = Dir['test/**/*_test.rb']
  s.author = "Jens Kraemer"
  s.email = "jk@jkraemer.net"
  s.homepage = "http://projects.jkraemer.net/acts_as_ferret"
end

# Builds pkg/<name>-<version>.gem and the matching .tgz archive.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_tar = true
end
63
+
64
# Uploads the generated rdoc (see :rdoc) to the project's RubyForge
# web space.
desc "Publish the API documentation"
task :pdoc => [:rdoc] do
  Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
end

# Convenience task: publish docs and release files in one go.
desc 'Publish the gem and API docs'
task :publish => [:pdoc, :rubyforge_upload]
71
+
72
# Uploads the packaged .gem and .tgz (built by :package) to RubyForge's
# file release system by scripting the FRS web forms over HTTP:
# scrape group id -> log in -> scrape package id -> create a release
# with the first file, then attach the remaining files to it.
desc "Publish the release files to RubyForge."
task :rubyforge_upload => :package do
  files = %w(gem tgz).map { |ext| "pkg/#{PKG_FILE_NAME}.#{ext}" }

  if RUBY_FORGE_PROJECT then
    require 'net/http'
    require 'open-uri'

    # scrape the numeric group id from the public project page
    project_uri = "http://rubyforge.org/projects/#{RUBY_FORGE_PROJECT}/"
    project_data = open(project_uri) { |data| data.read }
    group_id = project_data[/[?&]group_id=(\d+)/, 1]
    raise "Couldn't get group id" unless group_id

    # This echos password to shell which is a bit sucky
    if ENV["RUBY_FORGE_PASSWORD"]
      password = ENV["RUBY_FORGE_PASSWORD"]
    else
      print "#{RUBY_FORGE_USER}@rubyforge.org's password: "
      password = STDIN.gets.chomp
    end

    # log in through the regular login form to obtain a session cookie
    login_response = Net::HTTP.start("rubyforge.org", 80) do |http|
      data = [
        "login=Login",
        "form_loginname=#{RUBY_FORGE_USER}",
        "form_pw=#{password}"
      ].join("&")

      headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }

      http.post("/account/login.php", data, headers)
    end

    cookie = login_response["set-cookie"]
    raise "Login failed" unless cookie
    headers = { "Cookie" => cookie }

    # scrape the package id from the (now authenticated) FRS admin page
    release_uri = "http://rubyforge.org/frs/admin/?group_id=#{group_id}"
    release_data = open(release_uri, headers) { |data| data.read }
    package_id = release_data[/[?&]package_id=(\d+)/, 1]
    raise "Couldn't get package id" unless package_id

    # the first file creates the release; later files are added to the
    # release id returned by that first upload
    first_file = true
    release_id = ""

    files.each do |filename|
      basename = File.basename(filename)
      file_ext = File.extname(filename)
      file_data = File.open(filename, "rb") { |file| file.read }

      puts "Releasing #{basename}..."

      release_response = Net::HTTP.start("rubyforge.org", 80) do |http|
        release_date = Time.now.strftime("%Y-%m-%d %H:%M")
        # RubyForge file type ids; 9999 = "other"
        type_map = {
          ".zip" => "3000",
          ".tgz" => "3110",
          ".gz" => "3110",
          ".gem" => "1400"
        }; type_map.default = "9999"
        type = type_map[file_ext]
        boundary = "rubyqMY6QN9bp6e4kS21H4y0zxcvoor"

        # form fields: create-release form for the first file,
        # add-file-to-release form for the rest
        query_hash = if first_file then
          {
            "group_id" => group_id,
            "package_id" => package_id,
            "release_name" => PKG_FILE_NAME,
            "release_date" => release_date,
            "type_id" => type,
            "processor_id" => "8000", # Any
            "release_notes" => "",
            "release_changes" => "",
            "preformatted" => "1",
            "submit" => "1"
          }
        else
          {
            "group_id" => group_id,
            "release_id" => release_id,
            "package_id" => package_id,
            "step2" => "1",
            "type_id" => type,
            "processor_id" => "8000", # Any
            "submit" => "Add This File"
          }
        end

        # hand-rolled multipart/form-data body: file part first, then
        # one part per form field, CRLF-separated
        data = [
          "--" + boundary,
          "Content-Disposition: form-data; name=\"userfile\"; filename=\"#{basename}\"",
          "Content-Type: application/octet-stream",
          "Content-Transfer-Encoding: binary",
          "", file_data, "",
          query_hash.collect do |name, value|
            [ "--" + boundary,
              "Content-Disposition: form-data; name='#{name}'",
              "", value, "" ]
          end
        ].flatten.join("\x0D\x0A")

        release_headers = headers.merge(
          "Content-Type" => "multipart/form-data; boundary=#{boundary}"
        )

        target = first_file ? "/frs/admin/qrs.php" : "/frs/admin/editrelease.php"
        http.post(target, data, release_headers)
      end

      # remember the release id the first upload created so subsequent
      # files can be attached to the same release
      if first_file then
        release_id = release_response.body[/release_id=(\d+)/, 1]
        raise("Couldn't get release id") unless release_id
      end

      first_file = false
    end
  end
end
190
+
191
+
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: acts_as_ferret
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.3.1
7
+ date: 2007-01-20 00:00:00 +01:00
8
+ summary: acts_as_ferret - Ferret based full text search for any ActiveRecord model
9
+ require_paths:
10
+ - lib
11
+ email: jk@jkraemer.net
12
+ homepage: http://projects.jkraemer.net/acts_as_ferret
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: acts_as_ferret
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Jens Kraemer
30
+ files:
31
+ - LICENSE
32
+ - rakefile
33
+ - init.rb
34
+ - lib
35
+ - README
36
+ - .rakefile.swp
37
+ - .init.rb.swp
38
+ - lib/multi_index.rb
39
+ - lib/acts_as_ferret.rb
40
+ - lib/instance_methods.rb
41
+ - lib/class_methods.rb
42
+ - lib/more_like_this.rb
43
+ - lib/.class_methods.rb.swp
44
+ - lib/.acts_as_ferret.rb.swp
45
+ - lib/.class_methods.rb.swo
46
+ test_files: []
47
+
48
+ rdoc_options: []
49
+
50
+ extra_rdoc_files: []
51
+
52
+ executables: []
53
+
54
+ extensions: []
55
+
56
+ requirements: []
57
+
58
+ dependencies: []
59
+