RubyGems - mongoid_fulltext - Versions diffs - 0.5.1 → 0.5.2 - Mend

mongoid_fulltext 0.5.1 → 0.5.2

Files changed (7)

data/README.md +3 -1
data/VERSION +1 -1
data/lib/mongoid_fulltext.rb +20 -14
data/mongoid_fulltext.gemspec +2 -2
data/spec/models/stopwords_artwork.rb +1 -1
data/spec/mongoid/fulltext_spec.rb +7 -0
metadata +13 -13

data/README.md CHANGED Viewed

@@ -212,9 +212,11 @@ Indexing Options
 Additional indexing/query options can be used as parameters to `fulltext_search_in`.
 * `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
-* `word_separators`: word separators, default is ` `
+* `word_separators`: word separators, default is the space character.
 * `ngram_width`: ngram width, default is `3`
 * `index_full_words`: index full words, which improves exact matches, default is `true`
+* `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
+  is set to `true`. Defaults to a hash containing a list of common English stop words.
 * `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
 * `max_ngrams_to_search`: maximum number of ngrams to query at any given time, default is `6`
 * `max_candidate_set_size`: maximum number of candidate ngrams to examine for a given query.

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.5.1
+0.5.2

data/lib/mongoid_fulltext.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'mongoid_indexes'
 require 'unicode_utils'
+require 'cgi'
 module Mongoid::FullTextSearch
   extend ActiveSupport::Concern
@@ -180,28 +181,32 @@ module Mongoid::FullTextSearch
       end
     end
-    # returns an [ngram, score] [ngram, position] pair
     def all_ngrams(str, config, bound_number_returned = true)
-      return {} if str.nil? or str.length < config[:ngram_width]
+      return {} if str.nil?
-      filtered_str = String.new(str)
       if config[:remove_accents]
-        if str.encoding.name == "ASCII-8BIT"
-          filtered_str = CGI.unescape(filtered_str)
-        end
-        filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
+        str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
       end
-      filtered_str = filtered_str.mb_chars.downcase.to_s.split('').map{ |ch| config[:alphabet][ch] }.compact.join('')
+      # Remove any characters that aren't in the alphabet
+      filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
+      # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
+      # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
+      # ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
       if bound_number_returned
         step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
       else
         step_size = 1
       end
-      # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the input string
-      ngram_ary = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
+      # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
+      # input string using the step size that we just computed. Let score(x,y) be the score of string x
+      # compared with string y - assigning scores to ngrams with the square root-based scoring function
+      # below and multiplying scores of matching ngrams together yields a score function that has the
+      # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
+      # for any string z contained in y.
+      ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
         if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
                       config[:word_separators].has_key?(filtered_str[i-1].chr))
           score = Math.sqrt(1 + 1.0/filtered_str.length)
@@ -212,20 +217,21 @@ module Mongoid::FullTextSearch
       end
       # If an ngram appears multiple times in the query string, keep the max score
-      ngram_ary = ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
+      ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
+      # Add records to the array of ngrams for each full word in the string that isn't a stop word
       if (config[:index_full_words])
         full_words_seen = {}
         filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
-          if word.length >= config[:ngram_width] and full_words_seen[word].nil? and config[:stop_words][word].nil?
-            ngram_ary << {:ngram => word, :score => 1}
+          if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
+            ngram_array << {:ngram => word, :score => 1}
             full_words_seen[word] = true
           end
         end
       end
       # If an ngram appears as a full word and an ngram, keep the sum of the two scores
-      Hash[ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
+      Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
     end
     def remove_from_ngram_index

data/mongoid_fulltext.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{mongoid_fulltext}
-  s.version = "0.5.1"
+  s.version = "0.5.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Aaron Windsor"]
-  s.date = %q{2011-11-02}
+  s.date = %q{2011-11-05}
   s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
   s.email = %q{aaron.windsor@gmail.com}
   s.extra_rdoc_files = [

data/spec/models/stopwords_artwork.rb CHANGED Viewed

@@ -5,6 +5,6 @@ class StopwordsArtwork
   field :title
   fulltext_search_in :title,
   :index_full_words => true,
-  :stop_words => { 'and' => true }
+  :stop_words => { 'and' => true, 'by' => true}
 end

data/spec/mongoid/fulltext_spec.rb CHANGED Viewed

@@ -453,11 +453,18 @@ module Mongoid
     context "with stop words defined" do
       let!(:flowers)      { StopwordsArtwork.create(:title => "Flowers by Andy Warhol") }
       let!(:many_ands)    { StopwordsArtwork.create(:title => "Foo and bar and baz and foobar") }
+      let!(:harry)        { StopwordsArtwork.create(:title => "Harry in repose by JK Rowling") }
       it "doesn't give a full-word score boost to stopwords" do
         StopwordsArtwork.fulltext_search("andy").map{ |a| a.title }.should == [flowers.title, many_ands.title]
         StopwordsArtwork.fulltext_search("warhol and other stuff").map{ |a| a.title }.should == [flowers.title, many_ands.title]
       end
+      it "allows searching on words that are more than one letter, less than the ngram length and not stopwords" do
+        StopwordsArtwork.fulltext_search("jk").map{ |a| a.title }.should == [harry.title]
+        StopwordsArtwork.fulltext_search("by").map{ |a| a.title }.should == []
+      end
     end
     context "remove_from_ngram_index" do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mongoid_fulltext
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.2
   prerelease:
 platform: ruby
 authors:
@@ -9,12 +9,12 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-02 00:00:00.000000000 -04:00
+date: 2011-11-05 00:00:00.000000000 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode_utils
-  requirement: &87323870 !ruby/object:Gem::Requirement
+  requirement: &86209940 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -22,10 +22,10 @@ dependencies:
         version: 1.0.0
   type: :runtime
   prerelease: false
-  version_requirements: *87323870
+  version_requirements: *86209940
 - !ruby/object:Gem::Dependency
   name: mongoid
-  requirement: &87323630 !ruby/object:Gem::Requirement
+  requirement: &86209400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -33,10 +33,10 @@ dependencies:
         version: 2.0.0
   type: :development
   prerelease: false
-  version_requirements: *87323630
+  version_requirements: *86209400
 - !ruby/object:Gem::Dependency
   name: bson_ext
-  requirement: &87323390 !ruby/object:Gem::Requirement
+  requirement: &86209080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -44,10 +44,10 @@ dependencies:
         version: 1.3.0
   type: :development
   prerelease: false
-  version_requirements: *87323390
+  version_requirements: *86209080
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &87323150 !ruby/object:Gem::Requirement
+  requirement: &86208570 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -55,10 +55,10 @@ dependencies:
         version: 2.5.0
   type: :development
   prerelease: false
-  version_requirements: *87323150
+  version_requirements: *86208570
 - !ruby/object:Gem::Dependency
   name: jeweler
-  requirement: &87322910 !ruby/object:Gem::Requirement
+  requirement: &86208060 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -66,7 +66,7 @@ dependencies:
         version: 1.5.2
   type: :development
   prerelease: false
-  version_requirements: *87322910
+  version_requirements: *86208060
 description: Full-text search for the Mongoid ORM, using n-grams extracted from text
 email: aaron.windsor@gmail.com
 executables: []
@@ -119,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 136864689
+      hash: 444903309
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: