mongoid_fulltext 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -1
- data/VERSION +1 -1
- data/lib/mongoid_fulltext.rb +20 -14
- data/mongoid_fulltext.gemspec +2 -2
- data/spec/models/stopwords_artwork.rb +1 -1
- data/spec/mongoid/fulltext_spec.rb +7 -0
- metadata +13 -13
data/README.md
CHANGED
@@ -212,9 +212,11 @@ Indexing Options
|
|
212
212
|
Additional indexing/query options can be used as parameters to `fulltext_search_in`.
|
213
213
|
|
214
214
|
* `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
|
215
|
-
* `word_separators`: word separators, default is
|
215
|
+
* `word_separators`: word separators, default is the space character.
|
216
216
|
* `ngram_width`: ngram width, default is `3`
|
217
217
|
* `index_full_words`: index full words, which improves exact matches, default is `true`
|
218
|
+
* `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
|
219
|
+
is set to `true`. Defaults to a hash containing a list of common English stop words.
|
218
220
|
* `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
|
219
221
|
* `max_ngrams_to_search`: maximum number of ngrams to query at any given time, default is `6`
|
220
222
|
* `max_candidate_set_size`: maximum number of candidate ngrams to examine for a given query.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.1
|
1
|
+
0.5.2
|
data/lib/mongoid_fulltext.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'mongoid_indexes'
|
2
2
|
require 'unicode_utils'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
module Mongoid::FullTextSearch
|
5
6
|
extend ActiveSupport::Concern
|
@@ -180,28 +181,32 @@ module Mongoid::FullTextSearch
|
|
180
181
|
end
|
181
182
|
end
|
182
183
|
|
183
|
-
# returns an [ngram, score] [ngram, position] pair
|
184
184
|
def all_ngrams(str, config, bound_number_returned = true)
|
185
|
-
return {} if str.nil?
|
185
|
+
return {} if str.nil?
|
186
186
|
|
187
|
-
filtered_str = String.new(str)
|
188
187
|
if config[:remove_accents]
|
189
|
-
|
190
|
-
filtered_str = CGI.unescape(filtered_str)
|
191
|
-
end
|
192
|
-
filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
|
188
|
+
str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
|
193
189
|
end
|
194
190
|
|
195
|
-
|
191
|
+
# Remove any characters that aren't in the alphabet
|
192
|
+
filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
|
196
193
|
|
194
|
+
# Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
|
195
|
+
# step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
|
196
|
+
# ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
|
197
197
|
if bound_number_returned
|
198
198
|
step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
|
199
199
|
else
|
200
200
|
step_size = 1
|
201
201
|
end
|
202
202
|
|
203
|
-
# Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
|
204
|
-
|
203
|
+
# Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
|
204
|
+
# input string using the step size that we just computed. Let score(x,y) be the score of string x
|
205
|
+
# compared with string y - assigning scores to ngrams with the square root-based scoring function
|
206
|
+
# below and multiplying scores of matching ngrams together yields a score function that has the
|
207
|
+
# property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
|
208
|
+
# for any string z contained in y.
|
209
|
+
ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
|
205
210
|
if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
|
206
211
|
config[:word_separators].has_key?(filtered_str[i-1].chr))
|
207
212
|
score = Math.sqrt(1 + 1.0/filtered_str.length)
|
@@ -212,20 +217,21 @@ module Mongoid::FullTextSearch
|
|
212
217
|
end
|
213
218
|
|
214
219
|
# If an ngram appears multiple times in the query string, keep the max score
|
215
|
-
|
220
|
+
ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
|
216
221
|
|
222
|
+
# Add records to the array of ngrams for each full word in the string that isn't a stop word
|
217
223
|
if (config[:index_full_words])
|
218
224
|
full_words_seen = {}
|
219
225
|
filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
|
220
|
-
if word.length
|
221
|
-
|
226
|
+
if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
|
227
|
+
ngram_array << {:ngram => word, :score => 1}
|
222
228
|
full_words_seen[word] = true
|
223
229
|
end
|
224
230
|
end
|
225
231
|
end
|
226
232
|
|
227
233
|
# If an ngram appears as a full word and an ngram, keep the sum of the two scores
|
228
|
-
Hash[
|
234
|
+
Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
|
229
235
|
end
|
230
236
|
|
231
237
|
def remove_from_ngram_index
|
data/mongoid_fulltext.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{mongoid_fulltext}
|
8
|
-
s.version = "0.5.1"
|
8
|
+
s.version = "0.5.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = %q{2011-11-
|
12
|
+
s.date = %q{2011-11-05}
|
13
13
|
s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
|
14
14
|
s.email = %q{aaron.windsor@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -453,11 +453,18 @@ module Mongoid
|
|
453
453
|
context "with stop words defined" do
|
454
454
|
let!(:flowers) { StopwordsArtwork.create(:title => "Flowers by Andy Warhol") }
|
455
455
|
let!(:many_ands) { StopwordsArtwork.create(:title => "Foo and bar and baz and foobar") }
|
456
|
+
let!(:harry) { StopwordsArtwork.create(:title => "Harry in repose by JK Rowling") }
|
456
457
|
|
457
458
|
it "doesn't give a full-word score boost to stopwords" do
|
458
459
|
StopwordsArtwork.fulltext_search("andy").map{ |a| a.title }.should == [flowers.title, many_ands.title]
|
459
460
|
StopwordsArtwork.fulltext_search("warhol and other stuff").map{ |a| a.title }.should == [flowers.title, many_ands.title]
|
460
461
|
end
|
462
|
+
|
463
|
+
it "allows searching on words that are more than one letter, less than the ngram length and not stopwords" do
|
464
|
+
StopwordsArtwork.fulltext_search("jk").map{ |a| a.title }.should == [harry.title]
|
465
|
+
StopwordsArtwork.fulltext_search("by").map{ |a| a.title }.should == []
|
466
|
+
end
|
467
|
+
|
461
468
|
end
|
462
469
|
|
463
470
|
context "remove_from_ngram_index" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mongoid_fulltext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-05 00:00:00.000000000 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: unicode_utils
|
17
|
-
requirement: &
|
17
|
+
requirement: &86209940 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: 1.0.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *86209940
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: mongoid
|
28
|
-
requirement: &
|
28
|
+
requirement: &86209400 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: 2.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *86209400
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: bson_ext
|
39
|
-
requirement: &
|
39
|
+
requirement: &86209080 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: 1.3.0
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *86209080
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: rspec
|
50
|
-
requirement: &
|
50
|
+
requirement: &86208570 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ~>
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: 2.5.0
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *86208570
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: jeweler
|
61
|
-
requirement: &
|
61
|
+
requirement: &86208060 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ~>
|
@@ -66,7 +66,7 @@ dependencies:
|
|
66
66
|
version: 1.5.2
|
67
67
|
type: :development
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *86208060
|
70
70
|
description: Full-text search for the Mongoid ORM, using n-grams extracted from text
|
71
71
|
email: aaron.windsor@gmail.com
|
72
72
|
executables: []
|
@@ -119,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
119
|
version: '0'
|
120
120
|
segments:
|
121
121
|
- 0
|
122
|
-
hash:
|
122
|
+
hash: 444903309
|
123
123
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
124
|
none: false
|
125
125
|
requirements:
|