mongoid_fulltext 0.5.2 → 0.5.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -214,7 +214,9 @@ Additional indexing/query options can be used as parameters to `fulltext_search_
214
214
  * `alphabet`: letters to index, default is `abcdefghijklmnopqrstuvwxyz0123456789 `
215
215
  * `word_separators`: word separators, default is the space character.
216
216
  * `ngram_width`: ngram width, default is `3`
217
- * `index_full_words`: index full words, which improves exact matches, default is `true`
217
+ * `index_full_words`: index full words, which improves exact matches, default is `true`.
218
+ * `index_short_prefixes`: for each full word, index its prefix of length `(ngram_width-1)`. Useful if
219
+ you use a larger ngram_width than the default of 3. Default for this option is `false`.
218
220
  * `stop_words`: a hash of words to avoid indexing as full words or as short prefixes. Used only if
219
221
  `index_full_words` or `index_short_prefixes` is set to `true`. Defaults to a hash containing a list of common English stop words.
220
222
  * `apply_prefix_scoring_to_all_words`: score n-grams at beginning of words higher, default is `true`
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.2
1
+ 0.5.3
@@ -29,6 +29,7 @@ module Mongoid::FullTextSearch
29
29
  :max_ngrams_to_search => 6,
30
30
  :apply_prefix_scoring_to_all_words => true,
31
31
  :index_full_words => true,
32
+ :index_short_prefixes => false,
32
33
  :max_candidate_set_size => 1000,
33
34
  :remove_accents => true,
34
35
  :stop_words => Hash[['i', 'a', 's', 't', 'me', 'my', 'we', 'he', 'it', 'am', 'is', 'be', 'do', 'an', 'if',
@@ -218,19 +219,32 @@ module Mongoid::FullTextSearch
218
219
 
219
220
  # If an ngram appears multiple times in the query string, keep the max score
220
221
  ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
221
-
222
+
223
+ # Add 'short prefix' records to the array: for each word in the string, its prefix of length (ngram_width - 1)
224
+ if config[:index_short_prefixes]
225
+ prefixes_seen = {}
226
+ filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
227
+ next if word.length < config[:ngram_width]-1
228
+ prefix = word[0...config[:ngram_width]-1]
229
+ if prefixes_seen[prefix].nil? and (config[:stop_words][word].nil? or word == filtered_str)
230
+ ngram_array << {:ngram => prefix, :score => 1}
231
+ prefixes_seen[prefix] = true
232
+ end
233
+ end
234
+ end
235
+
222
236
  # Add records to the array of ngrams for each full word in the string that isn't a stop word
223
- if (config[:index_full_words])
237
+ if config[:index_full_words]
224
238
  full_words_seen = {}
225
239
  filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
226
- if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
240
+ if word.length > 1 and full_words_seen[word].nil? and (config[:stop_words][word].nil? or word == filtered_str)
227
241
  ngram_array << {:ngram => word, :score => 1}
228
242
  full_words_seen[word] = true
229
243
  end
230
244
  end
231
245
  end
232
246
 
233
- # If an ngram appears as a full word and an ngram, keep the sum of the two scores
247
+ # If an ngram appears as any combination of full word, short prefix, and ngram, keep the sum of all of its scores
234
248
  Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
235
249
  end
236
250
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{mongoid_fulltext}
8
- s.version = "0.5.2"
8
+ s.version = "0.5.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Aaron Windsor"]
12
- s.date = %q{2011-11-05}
12
+ s.date = %q{2011-11-08}
13
13
  s.description = %q{Full-text search for the Mongoid ORM, using n-grams extracted from text}
14
14
  s.email = %q{aaron.windsor@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
42
42
  "spec/models/multi_field_artist.rb",
43
43
  "spec/models/multi_field_artwork.rb",
44
44
  "spec/models/partitioned_artist.rb",
45
+ "spec/models/short_prefixes_artwork.rb",
45
46
  "spec/models/stopwords_artwork.rb",
46
47
  "spec/mongoid/fulltext_spec.rb",
47
48
  "spec/spec_helper.rb"
@@ -67,6 +68,7 @@ Gem::Specification.new do |s|
67
68
  "spec/models/multi_field_artist.rb",
68
69
  "spec/models/multi_field_artwork.rb",
69
70
  "spec/models/partitioned_artist.rb",
71
+ "spec/models/short_prefixes_artwork.rb",
70
72
  "spec/models/stopwords_artwork.rb",
71
73
  "spec/mongoid/fulltext_spec.rb",
72
74
  "spec/spec_helper.rb"
@@ -0,0 +1,11 @@
1
+ class ShortPrefixesArtwork
2
+ include Mongoid::Document
3
+ include Mongoid::FullTextSearch
4
+
5
+ field :title
6
+ fulltext_search_in :title,
7
+ :ngram_width => 4,
8
+ :index_short_prefixes => true,
9
+ :index_full_words => false
10
+
11
+ end
@@ -467,6 +467,33 @@ module Mongoid
467
467
 
468
468
  end
469
469
 
470
+ context "indexing short prefixes" do
471
+ let!(:dimethyl_mercury) { ShortPrefixesArtwork.create(:title => "Dimethyl Mercury by Damien Hirst") }
472
+ let!(:volume) { ShortPrefixesArtwork.create(:title => "Volume by Dadamaino") }
473
+ let!(:damaged) { ShortPrefixesArtwork.create(:title => "Damaged: Photographs from the Chicago Daily News 1902-1933 (Governor) by Lisa Oppenheim") }
474
+ let!(:frozen) { ShortPrefixesArtwork.create(:title => "Frozen Fountain XXX by Evelyn Rosenberg") }
475
+ let!(:skull) { ShortPrefixesArtwork.create(:title => "Skull by Andy Warhol") }
476
+
477
+ it "finds the most relevant items with prefix indexing" do
478
+ ShortPrefixesArtwork.fulltext_search("damien").first.should == dimethyl_mercury
479
+ ShortPrefixesArtwork.fulltext_search("dami").first.should == dimethyl_mercury
480
+ ShortPrefixesArtwork.fulltext_search("dama").first.should == damaged
481
+ ShortPrefixesArtwork.fulltext_search("dam").first.should_not == volume
482
+ ShortPrefixesArtwork.fulltext_search("dadamaino").first.should == volume
483
+ ShortPrefixesArtwork.fulltext_search("kull").first.should == skull
484
+ end
485
+
486
+ it "doesn't index prefixes of stopwords" do
487
+ # damaged has the word "from" in it, which shouldn't get indexed.
488
+ ShortPrefixesArtwork.fulltext_search("fro").should == [frozen]
489
+ end
490
+
491
+ it "does index prefixes that would be stopwords taken alone" do
492
+ # skull has the word "andy" in it, which should get indexed as "and" even though "and" is a stopword
493
+ ShortPrefixesArtwork.fulltext_search("and").should == [skull]
494
+ end
495
+ end
496
+
470
497
  context "remove_from_ngram_index" do
471
498
  let!(:flowers1) { BasicArtwork.create(:title => 'Flowers 1') }
472
499
  let!(:flowers2) { BasicArtwork.create(:title => 'Flowers 1') }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mongoid_fulltext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,12 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-05 00:00:00.000000000 -04:00
12
+ date: 2011-11-08 00:00:00.000000000 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: unicode_utils
17
- requirement: &86209940 !ruby/object:Gem::Requirement
17
+ requirement: &87410860 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ~>
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: 1.0.0
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *86209940
25
+ version_requirements: *87410860
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: mongoid
28
- requirement: &86209400 !ruby/object:Gem::Requirement
28
+ requirement: &87410620 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ~>
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: 2.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *86209400
36
+ version_requirements: *87410620
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: bson_ext
39
- requirement: &86209080 !ruby/object:Gem::Requirement
39
+ requirement: &87410380 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ~>
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: 1.3.0
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *86209080
47
+ version_requirements: *87410380
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: rspec
50
- requirement: &86208570 !ruby/object:Gem::Requirement
50
+ requirement: &87410140 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ~>
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: 2.5.0
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *86208570
58
+ version_requirements: *87410140
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: jeweler
61
- requirement: &86208060 !ruby/object:Gem::Requirement
61
+ requirement: &87409900 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ~>
@@ -66,7 +66,7 @@ dependencies:
66
66
  version: 1.5.2
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *86208060
69
+ version_requirements: *87409900
70
70
  description: Full-text search for the Mongoid ORM, using n-grams extracted from text
71
71
  email: aaron.windsor@gmail.com
72
72
  executables: []
@@ -100,6 +100,7 @@ files:
100
100
  - spec/models/multi_field_artist.rb
101
101
  - spec/models/multi_field_artwork.rb
102
102
  - spec/models/partitioned_artist.rb
103
+ - spec/models/short_prefixes_artwork.rb
103
104
  - spec/models/stopwords_artwork.rb
104
105
  - spec/mongoid/fulltext_spec.rb
105
106
  - spec/spec_helper.rb
@@ -119,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
119
120
  version: '0'
120
121
  segments:
121
122
  - 0
122
- hash: 444903309
123
+ hash: 591721469
123
124
  required_rubygems_version: !ruby/object:Gem::Requirement
124
125
  none: false
125
126
  requirements:
@@ -148,6 +149,7 @@ test_files:
148
149
  - spec/models/multi_field_artist.rb
149
150
  - spec/models/multi_field_artwork.rb
150
151
  - spec/models/partitioned_artist.rb
152
+ - spec/models/short_prefixes_artwork.rb
151
153
  - spec/models/stopwords_artwork.rb
152
154
  - spec/mongoid/fulltext_spec.rb
153
155
  - spec/spec_helper.rb