fuzzzy 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/README.md +97 -0
- data/Rakefile +11 -0
- data/benchmark/data/cities.csv +8875 -0
- data/benchmark/fuzzzy_benchmark.rb +128 -0
- data/benchmark/test_ngram.rb +36 -0
- data/benchmark/test_soundex.rb +36 -0
- data/config.ru +11 -0
- data/dictionary/en_stopwords.yml +175 -0
- data/fuzzzy.gemspec +44 -0
- data/lib/fuzzzy.rb +129 -0
- data/lib/fuzzzy/index.rb +17 -0
- data/lib/fuzzzy/methods/indexer.rb +13 -0
- data/lib/fuzzzy/methods/method_base.rb +32 -0
- data/lib/fuzzzy/methods/ngram/base.rb +15 -0
- data/lib/fuzzzy/methods/ngram/indexer.rb +37 -0
- data/lib/fuzzzy/methods/ngram/searcher.rb +95 -0
- data/lib/fuzzzy/methods/soundex/base.rb +13 -0
- data/lib/fuzzzy/methods/soundex/indexer.rb +30 -0
- data/lib/fuzzzy/methods/soundex/searcher.rb +36 -0
- data/lib/fuzzzy/orm/mongoid/index.rb +86 -0
- data/lib/fuzzzy/redis.rb +61 -0
- data/lib/fuzzzy/server/http.rb +99 -0
- data/lib/fuzzzy/version.rb +3 -0
- data/spec/config/mongoid.yml +7 -0
- data/spec/models/city.rb +8 -0
- data/spec/models/indexed_city.rb +9 -0
- data/spec/ngram/indexer_spec.rb +142 -0
- data/spec/ngram/searcher_spec.rb +194 -0
- data/spec/orm/mongoid/index_spec.rb +165 -0
- data/spec/redis_spec.rb +54 -0
- data/spec/soundex/indexer_spec.rb +40 -0
- data/spec/soundex/searcher_spec.rb +63 -0
- data/spec/spec_helper.rb +22 -0
- metadata +325 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'ruby-prof'
|
4
|
+
|
5
|
+
# Benchmark and profiling harness for Fuzzzy search methods.
#
# Builds an index from the city fixtures in benchmark/data/cities.csv, runs
# searches against it under Ruby's Benchmark or ruby-prof, and flushes the
# Redis database afterwards.
module FuzzzyBenchmark
  # NOTE(review): `_indexer` and `_searcher` used below are presumably provided
  # by Fuzzzy::Index - confirm against lib/fuzzzy/index.rb.
  extend Fuzzzy::Index

  module_function

  # Index the fixtures, then time `times` searches for each context.
  #
  # meth       - search method name, stored and passed to the indexer/searcher.
  # contexts   - Array of Hashes; one benchmark report is produced per Hash.
  # index_cntx - Hash merged into the indexing context and into every search
  #              context.
  # times      - number of searches executed per report.
  #
  # If a block is given it is called with the Benchmark report object and this
  # module once all contexts have been reported.
  # The Redis database is flushed and the search method reset even on error.
  def benchmark(meth, contexts, index_cntx = {}, times = 100, &block)
    @search_method = meth
    @times = times
    prepare_indexes(default_context.merge(index_cntx))

    Benchmark.bm do |bench|
      contexts.each do |context|
        report(bench, context.merge(index_cntx))
      end

      yield(bench, self) if block_given?
    end
  ensure
    Fuzzzy.redis.flushdb
    @search_method = nil
  end

  # Index the fixtures, run `times` searches under ruby-prof and write graph,
  # call-stack and flat reports to benchmark/reports/<method>_*.
  #
  # meth       - search method name, stored and passed to the indexer/searcher.
  # context    - Hash passed unchanged to every search.
  # index_cntx - Hash merged into the indexing context.
  # times      - number of searches to profile.
  #
  # The Redis database is flushed and the search method reset even on error.
  def profile(meth, context, index_cntx = {}, times = 100, &block)
    @search_method = meth
    @times = times
    prepare_indexes(default_context.merge(index_cntx))

    RubyProf.start
    begin
      @times.times { searcher.search(context) }
    ensure
      # Always stop the profiler so a failing search cannot leave it running.
      result = RubyProf.stop
    end

    # The reports directory is not part of the shipped files; create it on
    # demand so File.open does not fail with Errno::ENOENT.
    reports_dir = Fuzzzy.root.join('benchmark', 'reports')
    reports_dir.mkpath

    # Written in the same order as before: graph, call stack, flat.
    printers = [
      ['graph.html', RubyProf::GraphHtmlPrinter],
      ['callstack.html', RubyProf::CallStackPrinter],
      ['flat.txt', RubyProf::FlatPrinter]
    ]
    printers.each do |suffix, printer_class|
      File.open(reports_dir.join("#{search_method}_#{suffix}"), 'w') do |file|
        printer_class.new(result).print(file)
      end
    end
  ensure
    Fuzzzy.redis.flushdb
    @search_method = nil
  end

  # Print one sample result for the context, then add a timed report that runs
  # the search @times times.
  #
  # bench   - the report object yielded by Benchmark.bm.
  # context - Hash of search options; merged over default_context.
  def report(bench, context)
    context = default_context.merge(context)
    result, strings = get_result(context)

    puts "Execute #{@times} times"
    puts "query: '#{context[:query]}', result: '#{result}' => #{strings}"
    bench.report(context[:title] || '') do
      @times.times { searcher.search(context) }
    end
    puts ''
  end

  # Run a single search and look up the stored string for every returned id.
  #
  # Returns a two-element Array: [search result, Array of strings from Redis].
  def get_result(cntx)
    result = searcher.search(cntx)
    # NOTE(review): the key prefix is hard-coded for the 'city:name' index used
    # by default_context; a different :index_name would need a different key.
    strings = result.map { |id| Fuzzzy.redis.get("fuzzzy:city:name:dictionary:#{id}") }
    [result, strings]
  end

  # Index every fixture row (lower-cased name plus id) with the given context
  # and print the elapsed time and Redis memory usage.
  def prepare_indexes(cntx)
    puts "Create index for #{search_method}:"
    puts "#{fixtures.size} names"

    start = Time.now
    fixtures.each do |source|
      indexer.create_index(cntx.merge(
        :dictionary_string => source[:name].downcase,
        :id => source[:id]
      ))
    end

    puts "#{Time.now - start} sec."
    puts "size - #{Fuzzzy.redis.info['used_memory_human']}"
  end

  # Context shared by indexing and searching: the index name and the search
  # method currently being exercised.
  def default_context
    {
      :index_name => 'city:name',
      :method => search_method
    }
  end

  # Indexer for the current search method.
  def indexer
    _indexer(search_method)
  end

  # Searcher for the current search method.
  def searcher
    _searcher(search_method)
  end

  # Rows of benchmark/data/cities.csv as Hashes with symbol keys; read once and
  # memoized.
  def fixtures
    @fixtures ||= begin
      path = Fuzzzy.root.join('benchmark', 'data', 'cities.csv').to_s
      rows = []
      CSV.foreach(path, :headers => true, :encoding => 'utf-8') do |row|
        rows << row.to_hash.symbolize_keys
      end
      rows
    end
  end

  # Search method set by the current benchmark/profile run (nil outside a run).
  def search_method
    @search_method
  end

  # Set the search method used by indexer, searcher and default_context.
  def search_method=(meth)
    @search_method = meth
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Benchmarks and profiles the :ngram search method against the city fixtures.
$LOAD_PATH.unshift(File.expand_path('../../benchmark', __FILE__))
require File.expand_path('../../lib/fuzzzy', __FILE__)
require 'fuzzzy_benchmark'

# Search contexts shared by the runs below.
queries = [
  {:query => 'eastleigh naersouthempton', :distance => 4},
  {:query => 'alixandropolis', :distance => 2},
  {:query => 'jenan', :distance => 1}
]

# Option passed both as the index context and inside every search context of
# the stopword-stripping runs.
stopword_opts = {:strip_stopwords => true}
stripped_queries = queries.map { |query| query.merge(stopword_opts) }

puts 'Without stripping stopwords'
FuzzzyBenchmark.benchmark(:ngram, queries)

puts 'With stripping stopwords'
FuzzzyBenchmark.benchmark(:ngram, stripped_queries, stopword_opts)

# Profile 1000 searches of the first (longest) query with stopwords stripped.
FuzzzyBenchmark.profile(:ngram, stripped_queries.first, stopword_opts, 1000)
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Benchmarks and profiles the :soundex search method against the city fixtures.
$LOAD_PATH.unshift(File.expand_path('../../benchmark', __FILE__))
require File.expand_path('../../lib/fuzzzy', __FILE__)
require 'fuzzzy_benchmark'

# Search contexts shared by the runs below.
queries = [
  {:query => 'eastleigh naersouthempton', :distance => 4},
  {:query => 'alixandropolis', :distance => 2},
  {:query => 'jenan', :distance => 1}
]

# Option passed both as the index context and inside every search context of
# the stopword-stripping runs.
stopword_opts = {:strip_stopwords => true}
stripped_queries = queries.map { |query| query.merge(stopword_opts) }

puts 'Without stripping stopwords'
FuzzzyBenchmark.benchmark(:soundex, queries)

puts 'With stripping stopwords'
FuzzzyBenchmark.benchmark(:soundex, stripped_queries, stopword_opts)

# Profile 1000 searches of the first (longest) query with stopwords stripped.
FuzzzyBenchmark.profile(:soundex, stripped_queries.first, stopword_opts, 1000)
|
data/config.ru
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Rack entry point for the Fuzzzy HTTP server.
#
# The Redis connection is configured from REDIS_HOST, REDIS_PORT and REDIS_DB,
# falling back to localhost, 6379 and database 0.

# Put the project's lib directory on the load path. The previous "../.."
# resolved to the parent of the directory holding this file, not to lib.
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
# Resolve relative to this file so the server can be started from any directory.
require File.expand_path('../lib/fuzzzy', __FILE__)
require 'grape'

Fuzzzy.redis = Redis.new(
  :host => (ENV['REDIS_HOST'] || 'localhost'),
  # Environment values are strings; fail fast on a non-numeric port or db.
  :port => Integer(ENV['REDIS_PORT'] || 6379),
  # redis-rb selects the database with :db; :database is not an option it
  # recognises, so REDIS_DB previously had no effect.
  :db => Integer(ENV['REDIS_DB'] || 0)
)

run Fuzzzy::Server::HTTP
|
@@ -0,0 +1,175 @@
|
|
1
|
+
---
|
2
|
+
- a
|
3
|
+
- about
|
4
|
+
- above
|
5
|
+
- after
|
6
|
+
- again
|
7
|
+
- against
|
8
|
+
- all
|
9
|
+
- am
|
10
|
+
- an
|
11
|
+
- and
|
12
|
+
- any
|
13
|
+
- are
|
14
|
+
- aren't
|
15
|
+
- as
|
16
|
+
- at
|
17
|
+
- be
|
18
|
+
- because
|
19
|
+
- been
|
20
|
+
- before
|
21
|
+
- being
|
22
|
+
- below
|
23
|
+
- between
|
24
|
+
- both
|
25
|
+
- but
|
26
|
+
- by
|
27
|
+
- can't
|
28
|
+
- cannot
|
29
|
+
- could
|
30
|
+
- couldn't
|
31
|
+
- did
|
32
|
+
- didn't
|
33
|
+
- do
|
34
|
+
- does
|
35
|
+
- doesn't
|
36
|
+
- doing
|
37
|
+
- don't
|
38
|
+
- down
|
39
|
+
- during
|
40
|
+
- each
|
41
|
+
- few
|
42
|
+
- for
|
43
|
+
- from
|
44
|
+
- further
|
45
|
+
- had
|
46
|
+
- hadn't
|
47
|
+
- has
|
48
|
+
- hasn't
|
49
|
+
- have
|
50
|
+
- haven't
|
51
|
+
- having
|
52
|
+
- he
|
53
|
+
- he'd
|
54
|
+
- he'll
|
55
|
+
- he's
|
56
|
+
- her
|
57
|
+
- here
|
58
|
+
- here's
|
59
|
+
- hers
|
60
|
+
- herself
|
61
|
+
- him
|
62
|
+
- himself
|
63
|
+
- his
|
64
|
+
- how
|
65
|
+
- how's
|
66
|
+
- i
|
67
|
+
- i'd
|
68
|
+
- i'll
|
69
|
+
- i'm
|
70
|
+
- i've
|
71
|
+
- if
|
72
|
+
- in
|
73
|
+
- into
|
74
|
+
- is
|
75
|
+
- isn't
|
76
|
+
- it
|
77
|
+
- it's
|
78
|
+
- its
|
79
|
+
- itself
|
80
|
+
- let's
|
81
|
+
- me
|
82
|
+
- more
|
83
|
+
- most
|
84
|
+
- mustn't
|
85
|
+
- my
|
86
|
+
- myself
|
87
|
+
- "no"
|
88
|
+
- nor
|
89
|
+
- not
|
90
|
+
- of
|
91
|
+
- "off"
|
92
|
+
- "on"
|
93
|
+
- once
|
94
|
+
- only
|
95
|
+
- or
|
96
|
+
- other
|
97
|
+
- ought
|
98
|
+
- our
|
99
|
+
- ours
|
100
|
+
- ourselves
|
101
|
+
- out
|
102
|
+
- over
|
103
|
+
- own
|
104
|
+
- same
|
105
|
+
- shan't
|
106
|
+
- she
|
107
|
+
- she'd
|
108
|
+
- she'll
|
109
|
+
- she's
|
110
|
+
- should
|
111
|
+
- shouldn't
|
112
|
+
- so
|
113
|
+
- some
|
114
|
+
- such
|
115
|
+
- than
|
116
|
+
- that
|
117
|
+
- that's
|
118
|
+
- the
|
119
|
+
- their
|
120
|
+
- theirs
|
121
|
+
- them
|
122
|
+
- themselves
|
123
|
+
- then
|
124
|
+
- there
|
125
|
+
- there's
|
126
|
+
- these
|
127
|
+
- they
|
128
|
+
- they'd
|
129
|
+
- they'll
|
130
|
+
- they're
|
131
|
+
- they've
|
132
|
+
- this
|
133
|
+
- those
|
134
|
+
- through
|
135
|
+
- to
|
136
|
+
- too
|
137
|
+
- under
|
138
|
+
- until
|
139
|
+
- up
|
140
|
+
- very
|
141
|
+
- was
|
142
|
+
- wasn't
|
143
|
+
- we
|
144
|
+
- we'd
|
145
|
+
- we'll
|
146
|
+
- we're
|
147
|
+
- we've
|
148
|
+
- were
|
149
|
+
- weren't
|
150
|
+
- what
|
151
|
+
- what's
|
152
|
+
- when
|
153
|
+
- when's
|
154
|
+
- where
|
155
|
+
- where's
|
156
|
+
- which
|
157
|
+
- while
|
158
|
+
- who
|
159
|
+
- who's
|
160
|
+
- whom
|
161
|
+
- why
|
162
|
+
- why's
|
163
|
+
- with
|
164
|
+
- won't
|
165
|
+
- would
|
166
|
+
- wouldn't
|
167
|
+
- you
|
168
|
+
- you'd
|
169
|
+
- you'll
|
170
|
+
- you're
|
171
|
+
- you've
|
172
|
+
- your
|
173
|
+
- yours
|
174
|
+
- yourself
|
175
|
+
- yourselves
|
data/fuzzzy.gemspec
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "fuzzzy/version"
|
4
|
+
|
5
|
+
# Gem specification for fuzzzy.
Gem::Specification.new do |s|
  s.name        = "fuzzzy"
  s.version     = Fuzzzy::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Undr"]
  s.email       = ["lilipoper@gmail.com"]
  s.homepage    = "http://github.com/undr/fuzzzy"
  s.summary     = %q{Fuzzy Search client and server}
  s.description = %q{Fuzzy Search client and server}

  # `rubyforge_project` is intentionally not set: RubyForge no longer exists
  # and RubyGems has deprecated the attribute.

  # Development-only tooling.
  s.add_development_dependency "rspec", ">= 2"
  s.add_development_dependency "yard", "~> 0.6.0"
  s.add_development_dependency "ruby-debug19"
  s.add_development_dependency "bson_ext"
  s.add_development_dependency "mongoid"
  s.add_development_dependency "pry"
  s.add_development_dependency "ruby-prof"

  # Runtime dependencies.
  # NOTE(review): bundler, rake, ZenTest and RubyInline look like build/test
  # tooling rather than runtime requirements - confirm before moving them.
  s.add_dependency "bundler"
  s.add_dependency "rake"
  s.add_dependency "activesupport"
  s.add_dependency "eventmachine"
  s.add_dependency "yajl-ruby"
  s.add_dependency "levenshtein-ffi"
  s.add_dependency "text"
  s.add_dependency "grape"
  s.add_dependency "hiredis"
  s.add_dependency "redis"
  s.add_dependency "daemons"
  s.add_dependency "ZenTest", "4.5.0"
  s.add_dependency "RubyInline"

  # File lists come from git, so the gem must be built from a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
data/lib/fuzzzy.rb
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
Bundler.require :default, (ENV['RACK_ENV'] || 'development')
|
4
|
+
require 'yaml'
|
5
|
+
require 'logger'
|
6
|
+
require 'active_support'
|
7
|
+
require 'yajl'
|
8
|
+
require 'levenshtein-ffi'
|
9
|
+
require 'text'
|
10
|
+
require 'redis/connection/hiredis'
|
11
|
+
require 'redis'
|
12
|
+
|
13
|
+
# Top-level namespace and configuration entry point for the Fuzzzy library.
#
# The module extends itself, so every method below is called as Fuzzzy.<name>.
module Fuzzzy
  extend self
  extend ActiveSupport::Autoload

  autoload :Redis
  autoload :Index

  autoload :Indexer, 'fuzzzy/methods/indexer'
  autoload :MethodBase, 'fuzzzy/methods/method_base'

  module Soundex
    extend ActiveSupport::Autoload

    autoload :Base, 'fuzzzy/methods/soundex/base'
    autoload :Indexer, 'fuzzzy/methods/soundex/indexer'
    autoload :Searcher, 'fuzzzy/methods/soundex/searcher'
  end

  module Ngram
    extend ActiveSupport::Autoload

    autoload :Base, 'fuzzzy/methods/ngram/base'
    autoload :Indexer, 'fuzzzy/methods/ngram/indexer'
    autoload :Searcher, 'fuzzzy/methods/ngram/searcher'
  end

  module Server
    extend ActiveSupport::Autoload

    autoload :HTTP, 'fuzzzy/server/http'
  end

  # Register the Mongoid integration only when the top-level Mongoid constant
  # is already defined at load time.
  if defined?(::Mongoid)
    module Mongoid
      extend ActiveSupport::Autoload

      autoload :Index, 'fuzzzy/orm/mongoid/index'
    end
  end

  # Yields the Fuzzzy module itself so settings can be assigned in one place:
  #
  #   Fuzzzy.configure do |config|
  #     config.logger = Logger.new($stdout)
  #     config.redis = ::Redis.new(
  #       :host => 'localhost',
  #       :port => 6379,
  #       :db => 0
  #     )
  #     config.stopwords = %w{the stopwords list}
  #   end
  def configure
    yield self
  end

  # Current logger. Falls back to default_logger until a logger has been
  # assigned; an explicit nil/false assignment is kept (logging disabled).
  def logger
    @logger = default_logger unless defined?(@logger)
    @logger
  end

  # Assign the logger. A Logger instance is stored; false or nil disables
  # logging. Any other object is ignored and the previous logger is kept.
  def logger=(logger)
    case logger
    when Logger then @logger = logger
    when false, nil then @logger = nil
    end
  end

  # Redis connection; lazily created for localhost:6379, database 0.
  def redis
    # redis-rb selects the database with :db (not :database).
    @redis ||= ::Redis.new(
      :host => 'localhost',
      :port => 6379,
      :db => 0
    )
  end

  # Replace the Redis connection.
  def redis=(connection)
    @redis = connection
  end

  # Active stopword list; defaults to the bundled English dictionary.
  def stopwords
    @stopwords ||= default_stopwords
  end

  # Set the stopword list from any value load_stopwords accepts; duplicates
  # are removed.
  def stopwords=(value)
    @stopwords = load_stopwords(value).uniq
  end

  # Normalise a stopword source into an Array.
  #
  # options - one of:
  #           Hash             - :stopwords is loaded recursively; when
  #                              :default is truthy the default list is
  #                              appended.
  #           Array            - returned as is.
  #           String, Pathname - path of a YAML file to load.
  #           anything else    - yields an empty list.
  def load_stopwords(options)
    case options
    when Hash
      stops = load_stopwords(options[:stopwords])
      options[:default] ? (stops + default_stopwords) : stops
    when Array
      options
    when String, Pathname
      # An empty YAML document loads as nil/false; treat it as "no stopwords"
      # so callers can rely on getting an Array-like value back.
      YAML.load_file(options) || []
    else
      []
    end
  end

  # Stopwords from dictionary/en_stopwords.yml under Fuzzzy.root (memoized).
  def default_stopwords
    @default_stopwords ||= load_stopwords(Fuzzzy.root.join('dictionary', 'en_stopwords.yml').to_s)
  end

  # Environment name: Rails.env, then Sinatra's environment, then RACK_ENV,
  # then 'development'.
  def env
    return Rails.env if defined?(Rails)
    return Sinatra::Base.environment.to_s if defined?(Sinatra)
    ENV["RACK_ENV"] || 'development'
  end

  # Root directory of the library as a Pathname.
  #
  # Resolved relative to this file (lib/fuzzzy.rb) rather than the process
  # working directory, so bundled files such as dictionary/en_stopwords.yml
  # are found no matter where the host application is started from.
  def root
    @root ||= Pathname.new(File.expand_path('../..', __FILE__))
  end

  protected

  # Rails.logger when Rails exposes one, otherwise a Logger writing to $stdout.
  def default_logger
    defined?(Rails) && Rails.respond_to?(:logger) ? Rails.logger : ::Logger.new($stdout)
  end
end
|