mongoid_fulltext 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -1
- data/VERSION +1 -1
- data/lib/mongoid_fulltext.rb +18 -4
- data/mongoid_fulltext.gemspec +4 -2
- data/spec/models/short_prefixes_artwork.rb +11 -0
- data/spec/mongoid/fulltext_spec.rb +27 -0
- metadata +15 -13
data/README.md
CHANGED
@@ -214,7 +214,9 @@ Additional indexing/query options can be used as parameters to `fulltext_search_
|
|
214
214
|
* `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
|
215
215
|
* `word_separators`: word separators, default is the space character.
|
216
216
|
* `ngram_width`: ngram width, default is `3`
|
217
|
-
* `index_full_words`: index full words, which improves exact matches, default is `true`
|
217
|
+
* `index_full_words`: index full words, which improves exact matches, default is `true`.
|
218
|
+
* `index_short_prefixes`: index a prefix of each full word of length `(ngram_width-1)`. Useful if
|
219
|
+
you use a larger ngram_width than the default of 3. Default for this option is `false`.
|
218
220
|
* `stop_words`: a hash of words to avoid indexing as full words. Used only if `index_full_words`
|
219
221
|
is set to `true`. Defaults to a hash containing a list of common English stop words.
|
220
222
|
* `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.2
|
1
|
+
0.5.3
|
data/lib/mongoid_fulltext.rb
CHANGED
@@ -29,6 +29,7 @@ module Mongoid::FullTextSearch
|
|
29
29
|
:max_ngrams_to_search => 6,
|
30
30
|
:apply_prefix_scoring_to_all_words => true,
|
31
31
|
:index_full_words => true,
|
32
|
+
:index_short_prefixes => false,
|
32
33
|
:max_candidate_set_size => 1000,
|
33
34
|
:remove_accents => true,
|
34
35
|
:stop_words => Hash[['i', 'a', 's', 't', 'me', 'my', 'we', 'he', 'it', 'am', 'is', 'be', 'do', 'an', 'if',
|
@@ -218,19 +219,32 @@ module Mongoid::FullTextSearch
|
|
218
219
|
|
219
220
|
# If an ngram appears multiple times in the query string, keep the max score
|
220
221
|
ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
|
221
|
-
|
222
|
+
|
223
|
+
# Add 'short prefix' records to the array: prefixes of the string that are length (ngram_width - 1)
|
224
|
+
if config[:index_short_prefixes]
|
225
|
+
prefixes_seen = {}
|
226
|
+
filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
|
227
|
+
next if word.length < config[:ngram_width]-1
|
228
|
+
prefix = word[0...config[:ngram_width]-1]
|
229
|
+
if prefixes_seen[prefix].nil? and (config[:stop_words][word].nil? or word == filtered_str)
|
230
|
+
ngram_array << {:ngram => prefix, :score => 1}
|
231
|
+
prefixes_seen[prefix] = true
|
232
|
+
end
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
222
236
|
# Add records to the array of ngrams for each full word in the string that isn't a stop word
|
223
|
-
if
|
237
|
+
if config[:index_full_words]
|
224
238
|
full_words_seen = {}
|
225
239
|
filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
|
226
|
-
if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
|
240
|
+
if word.length > 1 and full_words_seen[word].nil? and (config[:stop_words][word].nil? or word == filtered_str)
|
227
241
|
ngram_array << {:ngram => word, :score => 1}
|
228
242
|
full_words_seen[word] = true
|
229
243
|
end
|
230
244
|
end
|
231
245
|
end
|
232
246
|
|
233
|
-
# If an ngram appears as
|
247
|
+
# If an ngram appears as any combination of full word, short prefix, and ngram, keep the sum of the two scores
|
234
248
|
Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
|
235
249
|
end
|
236
250
|
|
data/mongoid_fulltext.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{mongoid_fulltext}
|
8
|
-
s.version = "0.5.2"
|
8
|
+
s.version = "0.5.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Aaron Windsor"]
|
12
|
-
s.date = %q{2011-11-
|
12
|
+
s.date = %q{2011-11-08}
|
13
13
|
s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
|
14
14
|
s.email = %q{aaron.windsor@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
|
|
42
42
|
"spec/models/multi_field_artist.rb",
|
43
43
|
"spec/models/multi_field_artwork.rb",
|
44
44
|
"spec/models/partitioned_artist.rb",
|
45
|
+
"spec/models/short_prefixes_artwork.rb",
|
45
46
|
"spec/models/stopwords_artwork.rb",
|
46
47
|
"spec/mongoid/fulltext_spec.rb",
|
47
48
|
"spec/spec_helper.rb"
|
@@ -67,6 +68,7 @@ Gem::Specification.new do |s|
|
|
67
68
|
"spec/models/multi_field_artist.rb",
|
68
69
|
"spec/models/multi_field_artwork.rb",
|
69
70
|
"spec/models/partitioned_artist.rb",
|
71
|
+
"spec/models/short_prefixes_artwork.rb",
|
70
72
|
"spec/models/stopwords_artwork.rb",
|
71
73
|
"spec/mongoid/fulltext_spec.rb",
|
72
74
|
"spec/spec_helper.rb"
|
@@ -467,6 +467,33 @@ module Mongoid
|
|
467
467
|
|
468
468
|
end
|
469
469
|
|
470
|
+
context "indexing short prefixes" do
|
471
|
+
let!(:dimethyl_mercury) { ShortPrefixesArtwork.create(:title => "Dimethyl Mercury by Damien Hirst") }
|
472
|
+
let!(:volume) { ShortPrefixesArtwork.create(:title => "Volume by Dadamaino") }
|
473
|
+
let!(:damaged) { ShortPrefixesArtwork.create(:title => "Damaged: Photographs from the Chicago Daily News 1902-1933 (Governor) by Lisa Oppenheim") }
|
474
|
+
let!(:frozen) { ShortPrefixesArtwork.create(:title => "Frozen Fountain XXX by Evelyn Rosenberg") }
|
475
|
+
let!(:skull) { ShortPrefixesArtwork.create(:title => "Skull by Andy Warhol") }
|
476
|
+
|
477
|
+
it "finds the most relevant items with prefix indexing" do
|
478
|
+
ShortPrefixesArtwork.fulltext_search("damien").first.should == dimethyl_mercury
|
479
|
+
ShortPrefixesArtwork.fulltext_search("dami").first.should == dimethyl_mercury
|
480
|
+
ShortPrefixesArtwork.fulltext_search("dama").first.should == damaged
|
481
|
+
ShortPrefixesArtwork.fulltext_search("dam").first.should_not == volume
|
482
|
+
ShortPrefixesArtwork.fulltext_search("dadamaino").first.should == volume
|
483
|
+
ShortPrefixesArtwork.fulltext_search("kull").first.should == skull
|
484
|
+
end
|
485
|
+
|
486
|
+
it "doesn't index prefixes of stopwords" do
|
487
|
+
# damaged has the word "from" in it, which shouldn't get indexed.
|
488
|
+
ShortPrefixesArtwork.fulltext_search("fro").should == [frozen]
|
489
|
+
end
|
490
|
+
|
491
|
+
it "does index prefixes that would be stopwords taken alone" do
|
492
|
+
# skull has the word "andy" in it, which should get indexed as "and" even though "and" is a stopword
|
493
|
+
ShortPrefixesArtwork.fulltext_search("and").should == [skull]
|
494
|
+
end
|
495
|
+
end
|
496
|
+
|
470
497
|
context "remove_from_ngram_index" do
|
471
498
|
let!(:flowers1) { BasicArtwork.create(:title => 'Flowers 1') }
|
472
499
|
let!(:flowers2) { BasicArtwork.create(:title => 'Flowers 1') }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mongoid_fulltext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-08 00:00:00.000000000 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: unicode_utils
|
17
|
-
requirement: &
|
17
|
+
requirement: &87410860 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ~>
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: 1.0.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *87410860
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: mongoid
|
28
|
-
requirement: &
|
28
|
+
requirement: &87410620 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: 2.0.0
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *87410620
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: bson_ext
|
39
|
-
requirement: &
|
39
|
+
requirement: &87410380 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: 1.3.0
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *87410380
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: rspec
|
50
|
-
requirement: &
|
50
|
+
requirement: &87410140 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ~>
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: 2.5.0
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *87410140
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: jeweler
|
61
|
-
requirement: &
|
61
|
+
requirement: &87409900 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ~>
|
@@ -66,7 +66,7 @@ dependencies:
|
|
66
66
|
version: 1.5.2
|
67
67
|
type: :development
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *87409900
|
70
70
|
description: Full-text search for the Mongoid ORM, using n-grams extracted from text
|
71
71
|
email: aaron.windsor@gmail.com
|
72
72
|
executables: []
|
@@ -100,6 +100,7 @@ files:
|
|
100
100
|
- spec/models/multi_field_artist.rb
|
101
101
|
- spec/models/multi_field_artwork.rb
|
102
102
|
- spec/models/partitioned_artist.rb
|
103
|
+
- spec/models/short_prefixes_artwork.rb
|
103
104
|
- spec/models/stopwords_artwork.rb
|
104
105
|
- spec/mongoid/fulltext_spec.rb
|
105
106
|
- spec/spec_helper.rb
|
@@ -119,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
120
|
version: '0'
|
120
121
|
segments:
|
121
122
|
- 0
|
122
|
-
hash:
|
123
|
+
hash: 591721469
|
123
124
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
125
|
none: false
|
125
126
|
requirements:
|
@@ -148,6 +149,7 @@ test_files:
|
|
148
149
|
- spec/models/multi_field_artist.rb
|
149
150
|
- spec/models/multi_field_artwork.rb
|
150
151
|
- spec/models/partitioned_artist.rb
|
152
|
+
- spec/models/short_prefixes_artwork.rb
|
151
153
|
- spec/models/stopwords_artwork.rb
|
152
154
|
- spec/mongoid/fulltext_spec.rb
|
153
155
|
- spec/spec_helper.rb
|