mongoid_fulltext 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -212,9 +212,11 @@ Indexing Options
212
212
  Additional indexing/query options can be used as parameters to `fulltext_search_in`.
213
213
 
214
214
  * `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
215
- * `word_separators`: word separators, default is ` `
215
+ * `word_separators`: word separators, default is the space character.
216
216
  * `ngram_width`: ngram width, default is `3`
217
217
  * `index_full_words`: index full words, which improves exact matches, default is `true`
218
+ * `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
219
+ is set to `true`. Defaults to a hash containing a list of common English stop words.
218
220
  * `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
219
221
  * `max_ngrams_to_search`: maximum number of ngrams to query at any given time, default is `6`
220
222
  * `max_candidate_set_size`: maximum number of candidate ngrams to examine for a given query.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
@@ -1,5 +1,6 @@
1
1
  require 'mongoid_indexes'
2
2
  require 'unicode_utils'
3
+ require 'cgi'
3
4
 
4
5
  module Mongoid::FullTextSearch
5
6
  extend ActiveSupport::Concern
@@ -180,28 +181,32 @@ module Mongoid::FullTextSearch
180
181
  end
181
182
  end
182
183
 
183
- # returns an [ngram, score] [ngram, position] pair
184
184
  def all_ngrams(str, config, bound_number_returned = true)
185
- return {} if str.nil? or str.length < config[:ngram_width]
185
+ return {} if str.nil?
186
186
 
187
- filtered_str = String.new(str)
188
187
  if config[:remove_accents]
189
- if str.encoding.name == "ASCII-8BIT"
190
- filtered_str = CGI.unescape(filtered_str)
191
- end
192
- filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
188
+ str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
193
189
  end
194
190
 
195
- filtered_str = filtered_str.mb_chars.downcase.to_s.split('').map{ |ch| config[:alphabet][ch] }.compact.join('')
191
+ # Remove any characters that aren't in the alphabet
192
+ filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
196
193
 
194
+ # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
195
+ # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
196
+ # ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
197
197
  if bound_number_returned
198
198
  step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
199
199
  else
200
200
  step_size = 1
201
201
  end
202
202
 
203
- # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the input string
204
- ngram_ary = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
203
+ # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
204
+ # input string using the step size that we just computed. Let score(x,y) be the score of string x
205
+ # compared with string y - assigning scores to ngrams with the square root-based scoring function
206
+ # below and multiplying scores of matching ngrams together yields a score function that has the
207
+ # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
208
+ # for any string z contained in y.
209
+ ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
205
210
  if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
206
211
  config[:word_separators].has_key?(filtered_str[i-1].chr))
207
212
  score = Math.sqrt(1 + 1.0/filtered_str.length)
@@ -212,20 +217,21 @@ module Mongoid::FullTextSearch
212
217
  end
213
218
 
214
219
  # If an ngram appears multiple times in the query string, keep the max score
215
- ngram_ary = ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
220
+ ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
216
221
 
222
+ # Add records to the array of ngrams for each full word in the string that isn't a stop word
217
223
  if (config[:index_full_words])
218
224
  full_words_seen = {}
219
225
  filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
220
- if word.length >= config[:ngram_width] and full_words_seen[word].nil? and config[:stop_words][word].nil?
221
- ngram_ary << {:ngram => word, :score => 1}
226
+ if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
227
+ ngram_array << {:ngram => word, :score => 1}
222
228
  full_words_seen[word] = true
223
229
  end
224
230
  end
225
231
  end
226
232
 
227
233
  # If an ngram appears as a full word and an ngram, keep the sum of the two scores
228
- Hash[ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
234
+ Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
229
235
  end
230
236
 
231
237
  def remove_from_ngram_index
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{mongoid_fulltext}
8
- s.version = "0.5.1"
8
+ s.version = "0.5.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = %q{2011-11-02}
12
+ s.date = %q{2011-11-05}
13
13
  s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
14
14
  s.email = %q{aaron.windsor@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -5,6 +5,6 @@ class StopwordsArtwork
5
5
  field :title
6
6
  fulltext_search_in :title,
7
7
  :index_full_words => true,
8
- :stop_words => { 'and' => true }
8
+ :stop_words => { 'and' => true, 'by' => true}
9
9
 
10
10
  end
@@ -453,11 +453,18 @@ module Mongoid
453
453
  context "with stop words defined" do
454
454
  let!(:flowers) { StopwordsArtwork.create(:title => "Flowers by Andy Warhol") }
455
455
  let!(:many_ands) { StopwordsArtwork.create(:title => "Foo and bar and baz and foobar") }
456
+ let!(:harry) { StopwordsArtwork.create(:title => "Harry in repose by JK Rowling") }
456
457
 
457
458
  it "doesn't give a full-word score boost to stopwords" do
458
459
  StopwordsArtwork.fulltext_search("andy").map{ |a| a.title }.should == [flowers.title, many_ands.title]
459
460
  StopwordsArtwork.fulltext_search("warhol and other stuff").map{ |a| a.title }.should == [flowers.title, many_ands.title]
460
461
  end
462
+
463
+ it "allows searching on words that are more than one letter, less than the ngram length and not stopwords" do
464
+ StopwordsArtwork.fulltext_search("jk").map{ |a| a.title }.should == [harry.title]
465
+ StopwordsArtwork.fulltext_search("by").map{ |a| a.title }.should == []
466
+ end
467
+
461
468
  end
462
469
 
463
470
  context "remove_from_ngram_index" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mongoid_fulltext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,12 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-02 00:00:00.000000000 -04:00
12
+ date: 2011-11-05 00:00:00.000000000 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: unicode_utils
17
- requirement: &87323870 !ruby/object:Gem::Requirement
17
+ requirement: &86209940 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 1.0.0
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *87323870
25
+ version_requirements: *86209940
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: mongoid
28
- requirement: &87323630 !ruby/object:Gem::Requirement
28
+ requirement: &86209400 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 2.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *87323630
36
+ version_requirements: *86209400
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: bson_ext
39
- requirement: &87323390 !ruby/object:Gem::Requirement
39
+ requirement: &86209080 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 1.3.0
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *87323390
47
+ version_requirements: *86209080
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: rspec
50
- requirement: &87323150 !ruby/object:Gem::Requirement
50
+ requirement: &86208570 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ~>
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 2.5.0
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *87323150
58
+ version_requirements: *86208570
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: jeweler
61
- requirement: &87322910 !ruby/object:Gem::Requirement
61
+ requirement: &86208060 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ~>
@@ -66,7 +66,7 @@ dependencies:
66
66
  version: 1.5.2
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *87322910
69
+ version_requirements: *86208060
70
70
  description: Full-text search for the Mongoid ORM, using n-grams extracted from text
71
71
  email: aaron.windsor@gmail.com
72
72
  executables: []
@@ -119,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
119
119
  version: '0'
120
120
  segments:
121
121
  - 0
122
- hash: 136864689
122
+ hash: 444903309
123
123
  required_rubygems_version: !ruby/object:Gem::Requirement
124
124
  none: false
125
125
  requirements: