autosuggest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/Gemfile +4 -0
- data/README.md +126 -0
- data/Rakefile +8 -0
- data/autosuggest.gemspec +26 -0
- data/lib/autosuggest.rb +202 -0
- data/lib/autosuggest/version.rb +3 -0
- metadata +121 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 305c68241ebeabdd7eca78f2bb1b4d3e7e1e430d
|
4
|
+
data.tar.gz: 797e7579646943ec7aa324f2f1c748d550914af0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d137ecf95fcdb4ab5271ad6be3f1cc4776411721a094093e84798a85938b91f92429cbb1090fdcb3bfc9e6e717b39a1197809d16dd79b81a56a53df88bb00bad
|
7
|
+
data.tar.gz: 52a264b4ebb28ef1786a3b9adfa6dff750f063d1eb92307113465a353becd64fd90e0a2b93f63cd81258a84dd998e8cf18e5204c0655c231c3bcf5ea92e1f342
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
# Autosuggest
|
2
|
+
|
3
|
+
Generate autocomplete suggestions based on what your users search
|
4
|
+
|
5
|
+
:tangerine: Battle-tested at [Instacart](https://www.instacart.com/opensource)
|
6
|
+
|
7
|
+
## How It Works
|
8
|
+
|
9
|
+
#### Start with the most popular queries
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
top_queries = Search.group("LOWER(query)")
|
13
|
+
.having("COUNT(DISTINCT user_id) >= 5")
|
14
|
+
.count("DISTINCT user_id")
|
15
|
+
# {"bananas" => 353, "apples" => 213, ...}
|
16
|
+
|
17
|
+
autosuggest = Autosuggest.new(top_queries)
|
18
|
+
```
|
19
|
+
|
20
|
+
#### Filter duplicates
|
21
|
+
|
22
|
+
[Stemming](https://en.wikipedia.org/wiki/Stemming) is used to detect duplicates like `apple` and `apples`.
|
23
|
+
|
24
|
+
The most popular query is preferred by default. To override this, use:
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
autosuggest.prefer ["apples"]
|
28
|
+
```
|
29
|
+
|
30
|
+
To fix false positives, use:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
autosuggest.not_duplicates [["straws", "straus"]]
|
34
|
+
```
|
35
|
+
|
36
|
+
#### Filter misspellings
|
37
|
+
|
38
|
+
We tried open-source libraries like [Aspell](http://aspell.net) and [Hunspell](http://hunspell.sourceforge.net/) but quickly realized we needed to build a corpus specific to our application.
|
39
|
+
|
40
|
+
There are two ways to build the corpus, which can be used together.
|
41
|
+
|
42
|
+
1. Add words
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
autosuggest.parse_words Product.pluck(:name)
|
46
|
+
```
|
47
|
+
|
48
|
+
Use the `min` option to only add words that appear at least that many times, e.g. `autosuggest.parse_words Product.pluck(:name), min: 3`.
|
49
|
+
|
50
|
+
2. Add concepts
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
autosuggest.add_concept "brand", Brand.pluck(:name)
|
54
|
+
```
|
55
|
+
|
56
|
+
#### Blacklist words
|
57
|
+
|
58
|
+
[Profanity](https://github.com/tjackiw/obscenity/blob/master/config/blacklist.yml) is blacklisted by default.
|
59
|
+
|
60
|
+
Add custom words with:
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
autosuggest.blacklist_words ["boom"]
|
64
|
+
```
|
65
|
+
|
66
|
+
#### Profit
|
67
|
+
|
68
|
+
Get suggestions with:
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
autosuggest.suggestions
|
72
|
+
```
|
73
|
+
|
74
|
+
Filter queries without results and you’re set.
|
75
|
+
|
76
|
+
We also prefer to have someone approve them by hand.
|
77
|
+
|
78
|
+
## Full Example
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
top_queries = Search.group("LOWER(query)")
|
82
|
+
.having("COUNT(DISTINCT user_id) >= 5")
|
83
|
+
.count("DISTINCT user_id")
|
84
|
+
|
85
|
+
autosuggest = Autosuggest.new(top_queries)
|
86
|
+
|
87
|
+
# create corpus with product names and brand names
|
88
|
+
autosuggest.parse_words Product.pluck(:name)
|
89
|
+
brand_names = Brand.pluck(:name)
|
90
|
+
autosuggest.add_concept "brand", brand_names
|
91
|
+
|
92
|
+
# prefer brand names
|
93
|
+
autosuggest.prefer brand_names
|
94
|
+
|
95
|
+
# prevent false positives for duplicates
|
96
|
+
autosuggest.not_duplicates [["straws", "straus"]]
|
97
|
+
|
98
|
+
# blacklist words
|
99
|
+
autosuggest.blacklist_words ["boom"]
|
100
|
+
|
101
|
+
# print suggestions
|
102
|
+
puts autosuggest.pretty_suggestions
|
103
|
+
# or
|
104
|
+
p autosuggest.suggestions
|
105
|
+
```
|
106
|
+
|
107
|
+
## Installation
|
108
|
+
|
109
|
+
Add this line to your application’s Gemfile:
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
gem 'autosuggest'
|
113
|
+
```
|
114
|
+
|
115
|
+
## TODO
|
116
|
+
|
117
|
+
- try Jaro-Winkler for duplicates
|
118
|
+
|
119
|
+
## Contributing
|
120
|
+
|
121
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
122
|
+
|
123
|
+
- [Report bugs](https://github.com/ankane/autosuggest/issues)
|
124
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/autosuggest/pulls)
|
125
|
+
- Write, clarify, or fix documentation
|
126
|
+
- Suggest or add new features
|
data/Rakefile
ADDED
data/autosuggest.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
# Make the gem's own lib/ directory loadable so the version constant can be read.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "autosuggest/version"

Gem::Specification.new do |gem|
  gem.name     = "autosuggest"
  gem.version  = Autosuggest::VERSION
  gem.authors  = ["Andrew Kane"]
  gem.email    = ["andrew@chartkick.com"]

  gem.summary  = "Generate autocomplete suggestions based on what your users search"
  gem.homepage = "https://github.com/ankane/autosuggest"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are the basenames of the packaged files that live under exe/.
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency "ruby-stemmer"
  gem.add_dependency "obscenity"

  # Development dependencies
  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "minitest"
end
|
data/lib/autosuggest.rb
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
require "autosuggest/version"
require "set"
require "lingua/stemmer"
require "yaml" # for obscenity
require "obscenity"

# Turns a list of popular search queries into annotated autocomplete
# suggestions. Each query is checked for duplicates (by stem and by single
# character edits), misspellings (against a corpus of known words and
# concepts), profanity and custom blacklisted words.
class Autosuggest
  # top_queries - enumerable of [query, count] pairs, e.g. a Hash of
  #               query => number of users who searched for it.
  def initialize(top_queries)
    @top_queries = top_queries
    @concepts = {}
    @words = Set.new
    @non_duplicates = Set.new
    @blacklisted_words = Set.new
    @preferred_queries = {}
    @profane_words = Set.new(Obscenity::Base.blacklist)
  end

  # Registers a named concept (e.g. "brand") with its values. Values are
  # stored downcased with nils removed; registering the same name again
  # replaces the earlier values.
  def add_concept(name, values)
    @concepts[name] = Set.new(values.compact.uniq.map(&:downcase))
  end

  # Adds the words found in +phrases+ to the corpus used for misspelling
  # detection.
  #
  # options[:min] - minimum number of occurrences (across all phrases) a word
  #                 needs before it is added; defaults to 1.
  #
  # Returns the Hash of word => occurrence count for every word seen,
  # including the ones that did not reach +min+.
  def parse_words(phrases, options = {})
    min = options[:min] || 1

    word_counts = Hash.new(0)
    phrases.each do |phrase|
      tokenize(phrase).each { |word| word_counts[word] += 1 }
    end

    word_counts.each do |word, count|
      @words << word if count >= min
    end

    word_counts
  end

  # Marks pairs of queries that must never be reported as duplicates of each
  # other, e.g. [["straws", "straus"]]. Pairs are stored downcased and sorted
  # so the order inside a pair does not matter.
  def not_duplicates(pairs)
    pairs.each do |pair|
      @non_duplicates << pair.map(&:downcase).sort
    end
  end

  # Adds custom words (stored downcased) that flag a query as blacklisted.
  def blacklist_words(words)
    words.each do |word|
      @blacklisted_words << word.downcase
    end
  end

  # Declares the preferred spelling for queries that share a stem. The first
  # query registered for a given stem wins.
  def prefer(queries)
    queries.each do |query|
      @preferred_queries[normalize_query(query)] ||= query
    end
  end

  # Builds one result Hash per query, ordered by descending count.
  #
  # Each Hash has the keys :query, :original_query, :score, :duplicate,
  # :concepts, :misspelling, :profane, :blacklisted and :notes.
  #
  # Queries shorter than two characters are left out of the result.
  def suggestions
    stemmed_queries = {}
    added_queries = Set.new

    results = @top_queries.sort_by { |_query, count| -count }.map do |query, count|
      query = query.to_s

      # TODO do not ignore silently
      next if query.length < 2

      stemmed_query = normalize_query(query)

      # swap in the preferred spelling, remembering what was actually searched
      original_query = nil
      preferred_query = @preferred_queries[stemmed_query]
      if preferred_query && preferred_query != query
        original_query, query = query, preferred_query
      end

      # exclude duplicates: the first (most popular) query seen for a stem wins
      duplicate = stemmed_queries[stemmed_query]
      stemmed_queries[stemmed_query] ||= query

      # also detect possibly misspelled duplicates: a query one edit away from
      # an already accepted query
      # TODO use top query as duplicate
      if !duplicate && query.length > 4
        duplicate = edits(query).find { |edited_query| added_queries.include?(edited_query) }
      end

      # non-duplicate pairs are stored downcased, so compare downcased too
      if duplicate && @non_duplicates.include?([duplicate, query].map(&:downcase).sort)
        duplicate = nil
      end
      added_queries << query unless duplicate

      # find concepts; concept values are stored downcased
      lookup = query.downcase
      concepts = @concepts.select { |_, values| values.include?(lookup) }.keys

      # misspellings are only checked once a corpus has been provided
      misspelling = @words.any? && misspellings?(query)

      profane = blacklisted?(query, @profane_words)

      blacklisted = blacklisted?(query, @blacklisted_words)

      notes = []
      notes << "duplicate of #{duplicate}" if duplicate
      notes.concat(concepts)
      notes << "misspelling" if misspelling
      notes << "profane" if profane
      notes << "blacklisted" if blacklisted
      notes << "originally #{original_query}" if original_query

      {
        query: query,
        original_query: original_query,
        score: count,
        duplicate: duplicate,
        concepts: concepts,
        misspelling: misspelling,
        profane: profane,
        blacklisted: blacklisted,
        notes: notes
      }
    end

    # `next` above leaves nil in the mapped array for skipped queries; drop
    # those so callers (including pretty_suggestions) only see result hashes
    results.compact
  end

  # Renders the suggestions as a plain-text table with Query, Score and Notes
  # columns and returns it as a String.
  def pretty_suggestions
    str = "%-30s %5s %s\n" % %w(Query Score Notes)
    suggestions.each do |suggestion|
      str << "%-30s %5d %s\n" % [suggestion[:query], suggestion[:score], suggestion[:notes].join(", ")]
    end
    str
  end

  protected

  # True when no way of splitting the query into terms yields terms that are
  # all known words or concept values.
  def misspellings?(query)
    recurse(tokenize(query)).none? do |terms|
      terms.all? { |t| @concepts.any? { |_, values| values.include?(t) } || @words.include?(t) }
    end
  end

  # True when any term of any split of the query is in +blacklisted_words+.
  def blacklisted?(query, blacklisted_words)
    recurse(tokenize(query)).any? do |terms|
      terms.any? { |t| blacklisted_words.include?(t) }
    end
  end

  # Returns every way of grouping consecutive words into space-joined terms,
  # e.g. ["a", "b"] => [["a b"], ["a", "b"]].
  def recurse(words)
    return [words] if words.size == 1

    result = [[words.join(" ")]]
    (0...words.size - 1).each do |i|
      recurse(words[0..i]).each do |head|
        recurse(words[i + 1..-1]).each do |tail|
          result << head + tail
        end
      end
    end
    result.uniq
  end

  # Downcases and splits on whitespace.
  def tokenize(str)
    str.to_s.downcase.split(" ")
  end

  # from http://blog.lojic.com/2008/09/04/how-to-write-a-spelling-corrector-in-ruby/
  LETTERS = (("a".."z").to_a.join + "'").freeze

  # Returns all strings one edit away from +word+, in the order deletions,
  # transpositions, alterations, insertions. suggestions reports the first
  # match, so the order is significant.
  def edits(word)
    n = word.length
    deletion = (0...n).map { |i| word[0...i] + word[i + 1..-1] }
    transposition = (0...n - 1).map { |i| word[0...i] + word[i + 1, 1] + word[i, 1] + word[i + 2..-1] }
    alteration = []
    n.times { |i| LETTERS.each_char { |l| alteration << word[0...i] + l + word[i + 1..-1] } }
    insertion = []
    (n + 1).times { |i| LETTERS.each_char { |l| insertion << word[0...i] + l + word[i..-1] } }
    deletion + transposition + alteration + insertion
  end

  # Key used to group queries that differ only by word form or word order:
  # "&" becomes "and", each word is stemmed, and the stems are sorted and joined.
  def normalize_query(query)
    tokenize(query.to_s.gsub("&", "and")).map { |q| Lingua.stemmer(q) }.sort.join
  end

  # TODO remove ActiveSupport dependency
  # NOTE(review): String#singularize / #pluralize are not core Ruby; these two
  # helpers presumably rely on ActiveSupport being loaded by the host
  # application. Nothing in this class calls them.

  def singularize(str)
    str.singularize
  end

  def pluralize(str)
    str.pluralize
  end
end
|
metadata
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: autosuggest
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Kane
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ruby-stemmer
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: obscenity
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.7'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.7'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: minitest
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description:
|
84
|
+
email:
|
85
|
+
- andrew@chartkick.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- Gemfile
|
92
|
+
- README.md
|
93
|
+
- Rakefile
|
94
|
+
- autosuggest.gemspec
|
95
|
+
- lib/autosuggest.rb
|
96
|
+
- lib/autosuggest/version.rb
|
97
|
+
homepage: https://github.com/ankane/autosuggest
|
98
|
+
licenses: []
|
99
|
+
metadata: {}
|
100
|
+
post_install_message:
|
101
|
+
rdoc_options: []
|
102
|
+
require_paths:
|
103
|
+
- lib
|
104
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
110
|
+
requirements:
|
111
|
+
- - ">="
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 2.4.5
|
117
|
+
signing_key:
|
118
|
+
specification_version: 4
|
119
|
+
summary: Generate autocomplete suggestions based on what your users search
|
120
|
+
test_files: []
|
121
|
+
has_rdoc:
|