mongoid_fulltext 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -1
- data/VERSION +1 -1
- data/lib/mongoid_fulltext.rb +20 -14
- data/mongoid_fulltext.gemspec +2 -2
- data/spec/models/stopwords_artwork.rb +1 -1
- data/spec/mongoid/fulltext_spec.rb +7 -0
- metadata +13 -13
data/README.md
CHANGED
@@ -212,9 +212,11 @@ Indexing Options
|
|
212
212
|
Additional indexing/query options can be used as parameters to `fulltext_search_in`.
|
213
213
|
|
214
214
|
* `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
|
215
|
-
* `word_separators`: word separators, default is
|
215
|
+
* `word_separators`: word separators, default is the space character.
|
216
216
|
* `ngram_width`: ngram width, default is `3`
|
217
217
|
* `index_full_words`: index full words, which improves exact matches, default is `true`
|
218
|
+
* `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
|
219
|
+
is set to `true`. Defaults to a hash containing a list of common English stop words.
|
218
220
|
* `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
|
219
221
|
* `max_ngrams_to_search`: maximum number of ngrams to query at any given time, default is `6`
|
220
222
|
* `max_candidate_set_size`: maximum number of candidate ngrams to examine for a given query.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.1
|
1
|
+
0.5.2
|
data/lib/mongoid_fulltext.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'mongoid_indexes'
|
2
2
|
require 'unicode_utils'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
module Mongoid::FullTextSearch
|
5
6
|
extend ActiveSupport::Concern
|
@@ -180,28 +181,32 @@ module Mongoid::FullTextSearch
|
|
180
181
|
end
|
181
182
|
end
|
182
183
|
|
183
|
-
# returns an [ngram, score] [ngram, position] pair
|
184
184
|
def all_ngrams(str, config, bound_number_returned = true)
|
185
|
-
return {} if str.nil?
|
185
|
+
return {} if str.nil?
|
186
186
|
|
187
|
-
filtered_str = String.new(str)
|
188
187
|
if config[:remove_accents]
|
189
|
-
|
190
|
-
filtered_str = CGI.unescape(filtered_str)
|
191
|
-
end
|
192
|
-
filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
|
188
|
+
str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
|
193
189
|
end
|
194
190
|
|
195
|
-
|
191
|
+
# Remove any characters that aren't in the alphabet
|
192
|
+
filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
|
196
193
|
|
194
|
+
# Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
|
195
|
+
# step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
|
196
|
+
# ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
|
197
197
|
if bound_number_returned
|
198
198
|
step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
|
199
199
|
else
|
200
200
|
step_size = 1
|
201
201
|
end
|
202
202
|
|
203
|
-
# Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
|
204
|
-
|
203
|
+
# Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
|
204
|
+
# input string using the step size that we just computed. Let score(x,y) be the score of string x
|
205
|
+
# compared with string y - assigning scores to ngrams with the square root-based scoring function
|
206
|
+
# below and multiplying scores of matching ngrams together yields a score function that has the
|
207
|
+
# property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
|
208
|
+
# for any string z contained in y.
|
209
|
+
ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
|
205
210
|
if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
|
206
211
|
config[:word_separators].has_key?(filtered_str[i-1].chr))
|
207
212
|
score = Math.sqrt(1 + 1.0/filtered_str.length)
|
@@ -212,20 +217,21 @@ module Mongoid::FullTextSearch
|
|
212
217
|
end
|
213
218
|
|
214
219
|
# If an ngram appears multiple times in the query string, keep the max score
|
215
|
-
|
220
|
+
ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
|
216
221
|
|
222
|
+
# Add records to the array of ngrams for each full word in the string that isn't a stop word
|
217
223
|
if (config[:index_full_words])
|
218
224
|
full_words_seen = {}
|
219
225
|
filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
|
220
|
-
if word.length
|
221
|
-
|
226
|
+
if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
|
227
|
+
ngram_array << {:ngram => word, :score => 1}
|
222
228
|
full_words_seen[word] = true
|
223
229
|
end
|
224
230
|
end
|
225
231
|
end
|
226
232
|
|
227
233
|
# If an ngram appears as a full word and an ngram, keep the sum of the two scores
|
228
|
-
Hash[
|
234
|
+
Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
|
229
235
|
end
|
230
236
|
|
231
237
|
def remove_from_ngram_index
|
data/mongoid_fulltext.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{mongoid_fulltext}
|
8
|
-
s.version = "0.5.1"
|
8
|
+
s.version = "0.5.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = %q{2011-11-
|
12
|
+
s.date = %q{2011-11-05}
|
13
13
|
s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
|
14
14
|
s.email = %q{aaron.windsor@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/spec/mongoid/fulltext_spec.rb
CHANGED
@@ -453,11 +453,18 @@ module Mongoid
|
|
453
453
|
context "with stop words defined" do
|
454
454
|
let!(:flowers) { StopwordsArtwork.create(:title => "Flowers by Andy Warhol") }
|
455
455
|
let!(:many_ands) { StopwordsArtwork.create(:title => "Foo and bar and baz and foobar") }
|
456
|
+
let!(:harry) { StopwordsArtwork.create(:title => "Harry in repose by JK Rowling") }
|
456
457
|
|
457
458
|
it "doesn't give a full-word score boost to stopwords" do
|
458
459
|
StopwordsArtwork.fulltext_search("andy").map{ |a| a.title }.should == [flowers.title, many_ands.title]
|
459
460
|
StopwordsArtwork.fulltext_search("warhol and other stuff").map{ |a| a.title }.should == [flowers.title, many_ands.title]
|
460
461
|
end
|
462
|
+
|
463
|
+
it "allows searching on words that are more than one letter, less than the ngram length and not stopwords" do
|
464
|
+
StopwordsArtwork.fulltext_search("jk").map{ |a| a.title }.should == [harry.title]
|
465
|
+
StopwordsArtwork.fulltext_search("by").map{ |a| a.title }.should == []
|
466
|
+
end
|
467
|
+
|
461
468
|
end
|
462
469
|
|
463
470
|
context "remove_from_ngram_index" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mongoid_fulltext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-05 00:00:00.000000000 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: unicode_utils
|
17
|
-
requirement: &
|
17
|
+
requirement: &86209940 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: 1.0.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *86209940
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: mongoid
|
28
|
-
requirement: &
|
28
|
+
requirement: &86209400 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: 2.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *86209400
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: bson_ext
|
39
|
-
requirement: &
|
39
|
+
requirement: &86209080 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: 1.3.0
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *86209080
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: rspec
|
50
|
-
requirement: &
|
50
|
+
requirement: &86208570 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ~>
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: 2.5.0
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *86208570
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: jeweler
|
61
|
-
requirement: &
|
61
|
+
requirement: &86208060 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ~>
|
@@ -66,7 +66,7 @@ dependencies:
|
|
66
66
|
version: 1.5.2
|
67
67
|
type: :development
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *86208060
|
70
70
|
description: Full-text search for the Mongoid ORM, using n-grams extracted from text
|
71
71
|
email: aaron.windsor@gmail.com
|
72
72
|
executables: []
|
@@ -119,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
119
|
version: '0'
|
120
120
|
segments:
|
121
121
|
- 0
|
122
|
-
hash:
|
122
|
+
hash: 444903309
|
123
123
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
124
|
none: false
|
125
125
|
requirements:
|