mongoid_fulltext 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -212,9 +212,11 @@ Indexing Options
212
212
  Additional indexing/query options can be used as parameters to `fulltext_search_in`.
213
213
 
214
214
  * `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
215
- * `word_separators`: word separators, default is ` `
215
+ * `word_separators`: word separators, default is the space character.
216
216
  * `ngram_width`: ngram width, default is `3`
217
217
  * `index_full_words`: index full words, which improves exact matches, default is `true`
218
+ * `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
219
+ is set to `true`. Defaults to a hash containing a list of common English stop words.
218
220
  * `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
219
221
  * `max_ngrams_to_search`: maximum number of ngrams to query at any given time, default is `6`
220
222
  * `max_candidate_set_size`: maximum number of candidate ngrams to examine for a given query.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
@@ -1,5 +1,6 @@
1
1
  require 'mongoid_indexes'
2
2
  require 'unicode_utils'
3
+ require 'cgi'
3
4
 
4
5
  module Mongoid::FullTextSearch
5
6
  extend ActiveSupport::Concern
@@ -180,28 +181,32 @@ module Mongoid::FullTextSearch
180
181
  end
181
182
  end
182
183
 
183
- # returns an [ngram, score] [ngram, position] pair
184
184
  def all_ngrams(str, config, bound_number_returned = true)
185
- return {} if str.nil? or str.length < config[:ngram_width]
185
+ return {} if str.nil?
186
186
 
187
- filtered_str = String.new(str)
188
187
  if config[:remove_accents]
189
- if str.encoding.name == "ASCII-8BIT"
190
- filtered_str = CGI.unescape(filtered_str)
191
- end
192
- filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
188
+ str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
193
189
  end
194
190
 
195
- filtered_str = filtered_str.mb_chars.downcase.to_s.split('').map{ |ch| config[:alphabet][ch] }.compact.join('')
191
+ # Remove any characters that aren't in the alphabet
192
+ filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
196
193
 
194
+ # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
195
+ # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
196
+ # ngrams from 'abcdefghijk', the stride computed below is ceil((11 - 3) / 3) = 3, which yields 'abc', 'def', and 'ghi'.
197
197
  if bound_number_returned
198
198
  step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
199
199
  else
200
200
  step_size = 1
201
201
  end
202
202
 
203
- # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the input string
204
- ngram_ary = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
203
+ # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
204
+ # input string using the step size that we just computed. Let score(x,y) be the score of string x
205
+ # compared with string y - assigning scores to ngrams with the square root-based scoring function
206
+ # below and multiplying scores of matching ngrams together yields a score function that has the
207
+ # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
208
+ # for any string z contained in y.
209
+ ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
205
210
  if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
206
211
  config[:word_separators].has_key?(filtered_str[i-1].chr))
207
212
  score = Math.sqrt(1 + 1.0/filtered_str.length)
@@ -212,20 +217,21 @@ module Mongoid::FullTextSearch
212
217
  end
213
218
 
214
219
  # If an ngram appears multiple times in the query string, keep the max score
215
- ngram_ary = ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
220
+ ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
216
221
 
222
+ # Add records to the array of ngrams for each full word in the string that isn't a stop word
217
223
  if (config[:index_full_words])
218
224
  full_words_seen = {}
219
225
  filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
220
- if word.length >= config[:ngram_width] and full_words_seen[word].nil? and config[:stop_words][word].nil?
221
- ngram_ary << {:ngram => word, :score => 1}
226
+ if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
227
+ ngram_array << {:ngram => word, :score => 1}
222
228
  full_words_seen[word] = true
223
229
  end
224
230
  end
225
231
  end
226
232
 
227
233
  # If an ngram appears as a full word and an ngram, keep the sum of the two scores
228
- Hash[ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
234
+ Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
229
235
  end
230
236
 
231
237
  def remove_from_ngram_index
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{mongoid_fulltext}
8
- s.version = "0.5.1"
8
+ s.version = "0.5.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = %q{2011-11-02}
12
+ s.date = %q{2011-11-05}
13
13
  s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
14
14
  s.email = %q{aaron.windsor@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -5,6 +5,6 @@ class StopwordsArtwork
5
5
  field :title
6
6
  fulltext_search_in :title,
7
7
  :index_full_words => true,
8
- :stop_words => { 'and' => true }
8
+ :stop_words => { 'and' => true, 'by' => true}
9
9
 
10
10
  end
@@ -453,11 +453,18 @@ module Mongoid
453
453
  context "with stop words defined" do
454
454
  let!(:flowers) { StopwordsArtwork.create(:title => "Flowers by Andy Warhol") }
455
455
  let!(:many_ands) { StopwordsArtwork.create(:title => "Foo and bar and baz and foobar") }
456
+ let!(:harry) { StopwordsArtwork.create(:title => "Harry in repose by JK Rowling") }
456
457
 
457
458
  it "doesn't give a full-word score boost to stopwords" do
458
459
  StopwordsArtwork.fulltext_search("andy").map{ |a| a.title }.should == [flowers.title, many_ands.title]
459
460
  StopwordsArtwork.fulltext_search("warhol and other stuff").map{ |a| a.title }.should == [flowers.title, many_ands.title]
460
461
  end
462
+
463
+ it "allows searching on words that are more than one letter, less than the ngram length and not stopwords" do
464
+ StopwordsArtwork.fulltext_search("jk").map{ |a| a.title }.should == [harry.title]
465
+ StopwordsArtwork.fulltext_search("by").map{ |a| a.title }.should == []
466
+ end
467
+
461
468
  end
462
469
 
463
470
  context "remove_from_ngram_index" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mongoid_fulltext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,12 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-02 00:00:00.000000000 -04:00
12
+ date: 2011-11-05 00:00:00.000000000 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: unicode_utils
17
- requirement: &87323870 !ruby/object:Gem::Requirement
17
+ requirement: &86209940 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 1.0.0
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *87323870
25
+ version_requirements: *86209940
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: mongoid
28
- requirement: &87323630 !ruby/object:Gem::Requirement
28
+ requirement: &86209400 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 2.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *87323630
36
+ version_requirements: *86209400
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: bson_ext
39
- requirement: &87323390 !ruby/object:Gem::Requirement
39
+ requirement: &86209080 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 1.3.0
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *87323390
47
+ version_requirements: *86209080
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: rspec
50
- requirement: &87323150 !ruby/object:Gem::Requirement
50
+ requirement: &86208570 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ~>
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 2.5.0
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *87323150
58
+ version_requirements: *86208570
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: jeweler
61
- requirement: &87322910 !ruby/object:Gem::Requirement
61
+ requirement: &86208060 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ~>
@@ -66,7 +66,7 @@ dependencies:
66
66
  version: 1.5.2
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *87322910
69
+ version_requirements: *86208060
70
70
  description: Full-text search for the Mongoid ORM, using n-grams extracted from text
71
71
  email: aaron.windsor@gmail.com
72
72
  executables: []
@@ -119,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
119
119
  version: '0'
120
120
  segments:
121
121
  - 0
122
- hash: 136864689
122
+ hash: 444903309
123
123
  required_rubygems_version: !ruby/object:Gem::Requirement
124
124
  none: false
125
125
  requirements: