simhilarity 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.travis.yml +6 -0
- data/README.md +13 -12
- data/bin/simhilarity +5 -2
- data/lib/simhilarity.rb +3 -3
- data/lib/simhilarity/candidate.rb +4 -28
- data/lib/simhilarity/candidates.rb +91 -0
- data/lib/simhilarity/matcher.rb +84 -52
- data/lib/simhilarity/score.rb +56 -0
- data/lib/simhilarity/version.rb +1 -1
- data/test/tests.rb +51 -25
- metadata +6 -5
- data/lib/simhilarity/bulk.rb +0 -163
- data/lib/simhilarity/single.rb +0 -18
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -66,15 +66,9 @@ score,needle,haystack
|
|
66
66
|
|
67
67
|
It will print out the best matches between needle and haystack in CSV format. Use `simhilarity --verbose` to look at pretty progress bars while it's running. Use --candidates to customize the candidates selection method, which will dramatically affect performance for large data sets.
|
68
68
|
|
69
|
-
### Simhilarity::
|
69
|
+
### Simhilarity::Matcher
|
70
70
|
|
71
|
-
To use simhilarity from code, create a `
|
72
|
-
|
73
|
-
### Simhilarity::Single
|
74
|
-
|
75
|
-
Sometimes it's useful to just calculate the score between two strings. For example, if you just want a title similarity measurement as part of some larger analysis between two books. Create a `Single` and call `score(a, b)` to measure similarity between those two items. By default, simhilarity assumes that needle and haystack are strings. To use something else, set `reader` to a proc that converts your opaque objects into strings. See [options](#options).
|
76
|
-
|
77
|
-
Important note: For best results with `Single`, set the corpus so that simhilarity can calculate ngram frequencies. This can dramatically improve accuracy. `Bulk` will do this automatically because it has access to the corpus, but `Single` doesn't. Call `corpus=` manually when using `Single`.
|
71
|
+
To use simhilarity from code, create a `Matcher`, assign the haystack with `haystack=`, and call `matches(needles)`. It'll return an array of tuples, `[needle, haystack, score]`. By default, simhilarity assumes that needles and haystack are arrays of strings. To use something else, set `reader` to a proc that converts your opaque objects into strings. See [options](#options).
|
78
72
|
|
79
73
|
<a name="benchmarks"/>
|
80
74
|
|
@@ -135,10 +129,10 @@ There are a few ways to configure simhilarity:
|
|
135
129
|
Simhash works great, but there's no reason not to use `:ngrams` or even `:all` for small data sets. In fact, that's what simhilarity does by default - if you use a small dataset (needle * haystack < 200,000) it defaults to `:all`, otherwise it uses `:simhash`. Some examples:
|
136
130
|
|
137
131
|
```ruby
|
138
|
-
Simhilarity::
|
139
|
-
Simhilarity::
|
140
|
-
Simhilarity::
|
141
|
-
Simhilarity::
|
132
|
+
Simhilarity::Matcher.new # defaults to :all or :simhash based on size
|
133
|
+
Simhilarity::Matcher.new(candidates: :simhash)
|
134
|
+
Simhilarity::Matcher.new(candidates: :simhash, simhash_max_hamming: 8)
|
135
|
+
Simhilarity::Matcher.new(candidates: :ngrams, ngram_overlaps: 4)
|
142
136
|
```
|
143
137
|
|
144
138
|
or:
|
@@ -162,3 +156,10 @@ There are a few ways to configure simhilarity:
|
|
162
156
|
* **ngrammer** - proc for converting normalized strings into ngrams. The default ngrammer pulls out bigrams and runs of digits, which is perfect for matching names and addresses.
|
163
157
|
|
164
158
|
* **verbose** - if true, show progress while simhilarity is working. Great for the impatient. Use --verbose from the command line.
|
159
|
+
|
160
|
+
## Changelog
|
161
|
+
|
162
|
+
#### Master (unreleased)
|
163
|
+
|
164
|
+
* Works with Ruby 2.0 - thanks @abscondment!
|
165
|
+
* Travis
|
data/bin/simhilarity
CHANGED
@@ -15,8 +15,11 @@ class Main
|
|
15
15
|
|
16
16
|
# match
|
17
17
|
tm = Time.now
|
18
|
-
matcher = Simhilarity::
|
19
|
-
|
18
|
+
matcher = Simhilarity::Matcher.new
|
19
|
+
matcher.verbose = options[:verbose]
|
20
|
+
matcher.candidates = options[:candidates]
|
21
|
+
matcher.haystack = haystack
|
22
|
+
matches = matcher.matches(needle)
|
20
23
|
|
21
24
|
if options[:verbose]
|
22
25
|
tm = Time.now - tm
|
data/lib/simhilarity.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require "simhilarity/bits"
|
2
2
|
require "simhilarity/candidate"
|
3
|
+
require "simhilarity/candidates"
|
3
4
|
require "simhilarity/element"
|
4
|
-
require "simhilarity/
|
5
|
+
require "simhilarity/score"
|
5
6
|
require "simhilarity/version"
|
6
7
|
|
7
|
-
require "simhilarity/
|
8
|
-
require "simhilarity/single"
|
8
|
+
require "simhilarity/matcher"
|
@@ -1,44 +1,20 @@
|
|
1
1
|
module Simhilarity
|
2
2
|
# A potential match between two +Elements+. It can calculate its own score.
|
3
3
|
class Candidate
|
4
|
-
# matcher that owns this guy
|
5
|
-
attr_reader :matcher
|
6
|
-
|
7
4
|
# first half of the candidate pair - the needle.
|
8
5
|
attr_reader :a
|
9
6
|
|
10
7
|
# second half of the candidate pair - the haystack.
|
11
8
|
attr_reader :b
|
12
9
|
|
13
|
-
|
14
|
-
|
10
|
+
# the score between these two candidates
|
11
|
+
attr_accessor :score
|
12
|
+
|
13
|
+
def initialize(a, b) #:nodoc:
|
15
14
|
@a = a
|
16
15
|
@b = b
|
17
16
|
end
|
18
17
|
|
19
|
-
# Calculate the score for this +Candidate+. The score is the {dice
|
20
|
-
# coefficient}[http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient],
|
21
|
-
# <tt>(2*c)/(a+b)</tt>.
|
22
|
-
#
|
23
|
-
# * +a+: the frequency weighted sum of the ngrams in a
|
24
|
-
# * +b+: the frequency weighted sum of the ngrams in b
|
25
|
-
# * +c+: the frequency weighted sum of the ngrams in (a & b)
|
26
|
-
#
|
27
|
-
# Lazily calculated and memoized.
|
28
|
-
def score
|
29
|
-
@score ||= begin
|
30
|
-
c = (self.a.ngrams & self.b.ngrams)
|
31
|
-
if c.length > 0
|
32
|
-
a = self.a.ngrams_sum
|
33
|
-
b = self.b.ngrams_sum
|
34
|
-
c = matcher.ngrams_sum(c)
|
35
|
-
(2.0 * c) / (a + b)
|
36
|
-
else
|
37
|
-
0
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
18
|
def to_s #:nodoc:
|
43
19
|
"Candidate #{score}: #{a.inspect}..#{b.inspect}"
|
44
20
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Simhilarity
|
2
|
+
module Candidates
|
3
|
+
# default minimum number of ngram overlaps with :ngrams
|
4
|
+
DEFAULT_NGRAM_OVERLAPS = 3
|
5
|
+
|
6
|
+
# default maximum hamming distance with :simhash
|
7
|
+
DEFAULT_SIMHASH_MAX_HAMMING = 7
|
8
|
+
|
9
|
+
# Find candidates from +needles+ & +haystack+. The method used
|
10
|
+
# depends on the value of +candidates+
|
11
|
+
def candidates_for(needles)
|
12
|
+
# generate candidates
|
13
|
+
candidates_method = candidates_method(needles)
|
14
|
+
candidates = self.send(candidates_method, needles)
|
15
|
+
|
16
|
+
# if these are the same, no self-dups
|
17
|
+
if needles == haystack
|
18
|
+
candidates = candidates.reject { |n, h| n == h }
|
19
|
+
end
|
20
|
+
|
21
|
+
# map and return
|
22
|
+
candidates.map { |n, h| Candidate.new(n, h) }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Select the method for finding candidates based on +candidates+.
|
26
|
+
def candidates_method(needles)
|
27
|
+
# pick the method
|
28
|
+
method = self.candidates
|
29
|
+
method ||= (needles.length * haystack.length < 200000) ? :all : :simhash
|
30
|
+
case method
|
31
|
+
when /^ngrams=(\d+)$/
|
32
|
+
method = :ngrams
|
33
|
+
self.ngram_overlaps = $1.to_i
|
34
|
+
when /^simhash=(\d+)$/
|
35
|
+
method = :simhash
|
36
|
+
self.simhash_max_hamming = $1.to_i
|
37
|
+
end
|
38
|
+
|
39
|
+
method = "candidates_#{method}".to_sym
|
40
|
+
if !respond_to?(method, true)
|
41
|
+
raise "unsupported candidates #{candidates.inspect}"
|
42
|
+
end
|
43
|
+
|
44
|
+
vputs "Using #{method} with needles=#{needles.length} haystack=#{haystack.length}..."
|
45
|
+
method
|
46
|
+
end
|
47
|
+
|
48
|
+
# Return ALL candidates. This only works for small datasets.
|
49
|
+
def candidates_all(needles)
|
50
|
+
needles.product(haystack)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Return candidates that overlap with +ngram_overlaps+ (default 3) or more matching
|
54
|
+
# ngrams. Only works for small datasets.
|
55
|
+
def candidates_ngrams(needles)
|
56
|
+
ngram_overlaps = self.ngram_overlaps || DEFAULT_NGRAM_OVERLAPS
|
57
|
+
|
58
|
+
candidates = []
|
59
|
+
veach(" ngrams #{ngram_overlaps}", needles) do |n|
|
60
|
+
ngrams_set = Set.new(n.ngrams)
|
61
|
+
haystack.each do |h|
|
62
|
+
count = 0
|
63
|
+
h.ngrams.each do |ngram|
|
64
|
+
if ngrams_set.include?(ngram)
|
65
|
+
if (count += 1) == ngram_overlaps
|
66
|
+
candidates << [n, h]
|
67
|
+
break
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
candidates
|
74
|
+
end
|
75
|
+
|
76
|
+
# Find candidates that are close based on hamming distance between
|
77
|
+
# the simhashes.
|
78
|
+
def candidates_simhash(needles)
|
79
|
+
max_hamming = self.simhash_max_hamming || DEFAULT_SIMHASH_MAX_HAMMING
|
80
|
+
|
81
|
+
# search for candidates with low hamming distance
|
82
|
+
candidates = []
|
83
|
+
veach(" hamming #{max_hamming}", needles) do |n|
|
84
|
+
bk_tree.query(n, max_hamming).each do |h, distance|
|
85
|
+
candidates << [n, h]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
candidates
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
data/lib/simhilarity/matcher.rb
CHANGED
@@ -1,70 +1,95 @@
|
|
1
|
+
require "bk"
|
2
|
+
require "set"
|
1
3
|
require "progressbar"
|
2
4
|
|
3
5
|
module Simhilarity
|
4
|
-
# Abstract superclass for matching. Mainly a container for options, corpus, etc.
|
5
6
|
class Matcher
|
6
|
-
|
7
|
-
|
7
|
+
include Simhilarity::Candidates
|
8
|
+
include Simhilarity::Score
|
8
9
|
|
9
|
-
#
|
10
|
-
|
11
|
-
|
10
|
+
# If true, show progress bars and timing
|
11
|
+
attr_accessor :verbose
|
12
|
+
|
13
|
+
# Proc for turning opaque items into strings.
|
12
14
|
attr_accessor :reader
|
13
15
|
|
14
|
-
# Proc for normalizing
|
15
|
-
# for the default implementation.
|
16
|
+
# Proc for normalizing strings.
|
16
17
|
attr_accessor :normalizer
|
17
18
|
|
18
|
-
# Proc for generating ngrams
|
19
|
-
# Matcher#ngrams for the default implementation.
|
19
|
+
# Proc for generating ngrams.
|
20
20
|
attr_accessor :ngrammer
|
21
21
|
|
22
|
-
#
|
23
|
-
|
24
|
-
|
22
|
+
# Proc for scoring ngrams.
|
23
|
+
attr_accessor :scorer
|
24
|
+
|
25
|
+
# Specifies which method to use for finding candidates. See the
|
26
|
+
# README for more details.
|
27
|
+
attr_accessor :candidates
|
25
28
|
|
26
|
-
#
|
27
|
-
#
|
28
|
-
|
29
|
-
# * +normalizer+: Proc for normalizing strings.
|
30
|
-
# * +ngrammer+: Proc for generating ngrams.
|
31
|
-
# * +verbose+: If true, show progress bars and timing.
|
32
|
-
def initialize(options = {})
|
33
|
-
@options = options
|
29
|
+
# Minimum number of ngram overlaps, defaults to 3 (for candidates
|
30
|
+
# = :ngrams)
|
31
|
+
attr_accessor :ngram_overlaps
|
34
32
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
self.ngrammer = options[:ngrammer]
|
33
|
+
# Maximum simhash hamming distance, defaults to 7. (for candidates
|
34
|
+
# = :simhash)
|
35
|
+
attr_accessor :simhash_max_hamming
|
39
36
|
|
40
|
-
|
37
|
+
# Set the haystack.
|
38
|
+
def haystack=(haystack)
|
39
|
+
@haystack = import_list(haystack)
|
40
|
+
|
41
|
+
# this stuff is lazily calculated from the haystack, and needs
|
42
|
+
# to be reset whenever the haystack changes.
|
43
|
+
@bitsums = { }
|
44
|
+
@bk_tree = nil
|
45
|
+
@freq = nil
|
41
46
|
end
|
42
47
|
|
43
|
-
#
|
44
|
-
|
45
|
-
|
46
|
-
|
48
|
+
# The current haystack.
|
49
|
+
def haystack
|
50
|
+
@haystack
|
51
|
+
end
|
47
52
|
|
48
|
-
|
53
|
+
# Ngram frequency weights from the haystack, or 1 if the ngram
|
54
|
+
# isn't in the haystack. Lazily calculated.
|
55
|
+
def freq
|
56
|
+
@freq ||= begin
|
57
|
+
# calculate ngram counts for the haystack
|
58
|
+
counts = Hash.new(0)
|
59
|
+
veach("Haystack", @haystack) do |element|
|
60
|
+
element.ngrams.each do |ngram|
|
61
|
+
counts[ngram] += 1
|
62
|
+
end
|
63
|
+
end
|
49
64
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
65
|
+
# turn counts into inverse frequencies
|
66
|
+
map = Hash.new(1)
|
67
|
+
total = counts.values.inject(&:+).to_f
|
68
|
+
counts.each do |ngram, count|
|
69
|
+
map[ngram] = ((total / count) * 10).round
|
55
70
|
end
|
71
|
+
map
|
56
72
|
end
|
73
|
+
end
|
57
74
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
75
|
+
# Match each item in +needles+ to an item in #haystack. Returns an
|
76
|
+
# array of tuples, <tt>[needle, haystack, score]</tt>. Scores
|
77
|
+
# range from 0 to 1, with 1 being a perfect match and 0 being a
|
78
|
+
# terrible match.
|
79
|
+
def matches(needles)
|
80
|
+
if haystack.nil?
|
81
|
+
raise RuntimeError.new('can\'t match before setting a haystack')
|
62
82
|
end
|
63
|
-
end
|
64
83
|
|
65
|
-
|
66
|
-
|
67
|
-
|
84
|
+
# create Elements
|
85
|
+
needles = import_list(needles)
|
86
|
+
|
87
|
+
# get candidate matches
|
88
|
+
candidates = candidates_for(needles)
|
89
|
+
vputs " got #{candidates.length} candidates."
|
90
|
+
|
91
|
+
# pick winners
|
92
|
+
winners(needles, candidates)
|
68
93
|
end
|
69
94
|
|
70
95
|
# Turn an opaque item from the user into a string.
|
@@ -108,7 +133,7 @@ module Simhilarity
|
|
108
133
|
|
109
134
|
# Sum up the frequency weights of the +ngrams+.
|
110
135
|
def ngrams_sum(ngrams)
|
111
|
-
ngrams.map { |i|
|
136
|
+
ngrams.map { |i| freq[i] }.inject(&:+) || 0
|
112
137
|
end
|
113
138
|
|
114
139
|
# Calculate the frequency weighted
|
@@ -147,9 +172,16 @@ module Simhilarity
|
|
147
172
|
Element.new(self, opaque)
|
148
173
|
end
|
149
174
|
|
150
|
-
def
|
151
|
-
@
|
152
|
-
|
175
|
+
def bk_tree
|
176
|
+
@bk_tree ||= begin
|
177
|
+
# calculate this first so we get a nice progress bar
|
178
|
+
veach(" simhash", haystack) { |i| i.simhash }
|
179
|
+
|
180
|
+
# build the bk tree
|
181
|
+
tree = BK::Tree.new(lambda { |a, b| Bits.hamming32(a.simhash, b.simhash) })
|
182
|
+
veach(" bktree", haystack) { |i| tree.add(i) }
|
183
|
+
tree
|
184
|
+
end
|
153
185
|
end
|
154
186
|
|
155
187
|
# calculate the simhash bitsums for this +ngram+, as part of
|
@@ -171,14 +203,14 @@ module Simhilarity
|
|
171
203
|
end
|
172
204
|
end
|
173
205
|
|
174
|
-
# Puts if
|
206
|
+
# Puts if +verbose+ is true
|
175
207
|
def vputs(s)
|
176
|
-
$stderr.puts s if
|
208
|
+
$stderr.puts s if verbose
|
177
209
|
end
|
178
210
|
|
179
|
-
# Like each, but with a progress bar if
|
211
|
+
# Like each, but with a progress bar if +verbose+ is true
|
180
212
|
def veach(title, array, &block)
|
181
|
-
if !
|
213
|
+
if !verbose
|
182
214
|
array.each do |i|
|
183
215
|
yield(i)
|
184
216
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Simhilarity
|
2
|
+
module Score
|
3
|
+
# walk candidates by score, pick winners
|
4
|
+
def winners(needles, candidates)
|
5
|
+
# calculate this first so we get a nice progress bar
|
6
|
+
veach("Scoring", candidates) do |i|
|
7
|
+
i.score = score(i)
|
8
|
+
end
|
9
|
+
|
10
|
+
# sort by score
|
11
|
+
candidates = candidates.sort_by { |i| -i.score }
|
12
|
+
|
13
|
+
# walk them, eliminate dups
|
14
|
+
seen = Set.new
|
15
|
+
winners = candidates.map do |i|
|
16
|
+
next if seen.include?(i.a)
|
17
|
+
seen << i.a
|
18
|
+
i
|
19
|
+
end.compact
|
20
|
+
|
21
|
+
# build map from needle => candidate...
|
22
|
+
needle_to_winner = { }
|
23
|
+
winners.each { |i| needle_to_winner[i.a] = i }
|
24
|
+
|
25
|
+
# so we can return in the original order
|
26
|
+
needles.map do |i|
|
27
|
+
if candidate = needle_to_winner[i]
|
28
|
+
[ i.opaque, candidate.b.opaque, candidate.score ]
|
29
|
+
else
|
30
|
+
[ i.opaque, nil, nil ]
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Score a +Candidate+. The default implementation is the {dice
|
36
|
+
# coefficient}[http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient],
|
37
|
+
# <tt>(2*c)/(a+b)</tt>.
|
38
|
+
#
|
39
|
+
# * +a+: the frequency weighted sum of the ngrams in a
|
40
|
+
# * +b+: the frequency weighted sum of the ngrams in b
|
41
|
+
# * +c+: the frequency weighted sum of the ngrams in (a & b)
|
42
|
+
def score(candidate)
|
43
|
+
if scorer
|
44
|
+
return scorer.call(candidate)
|
45
|
+
end
|
46
|
+
|
47
|
+
c = (candidate.a.ngrams & candidate.b.ngrams)
|
48
|
+
return 0 if c.length == 0
|
49
|
+
|
50
|
+
a = candidate.a.ngrams_sum
|
51
|
+
b = candidate.b.ngrams_sum
|
52
|
+
c = ngrams_sum(c)
|
53
|
+
(2.0 * c) / (a + b)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/lib/simhilarity/version.rb
CHANGED
data/test/tests.rb
CHANGED
@@ -29,15 +29,17 @@ class Tests < Test::Unit::TestCase
|
|
29
29
|
sample
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
32
|
+
def assert_candidates(candidates, percent)
|
33
33
|
sample = self.sample
|
34
34
|
|
35
35
|
# match, with benchmark
|
36
36
|
output = nil
|
37
37
|
Benchmark.bm(10) do |bm|
|
38
38
|
bm.report(candidates.to_s) do
|
39
|
-
matcher = Simhilarity::
|
40
|
-
|
39
|
+
matcher = Simhilarity::Matcher.new
|
40
|
+
matcher.candidates = candidates
|
41
|
+
matcher.haystack = sample.haystack
|
42
|
+
output = matcher.matches(sample.needle)
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
@@ -55,9 +57,11 @@ class Tests < Test::Unit::TestCase
|
|
55
57
|
assert((correct - percent).abs < 0.001, "percent #{correct} != #{percent}")
|
56
58
|
end
|
57
59
|
|
60
|
+
TMP = "/tmp/_simhilarity_tests.txt"
|
61
|
+
|
58
62
|
def assert_system(cmd)
|
59
|
-
system("#{cmd} >
|
60
|
-
assert($? == 0,
|
63
|
+
system("#{cmd} > #{TMP} 2>&1")
|
64
|
+
assert($? == 0, File.read(TMP))
|
61
65
|
end
|
62
66
|
|
63
67
|
#
|
@@ -70,46 +74,62 @@ class Tests < Test::Unit::TestCase
|
|
70
74
|
|
71
75
|
# not a string
|
72
76
|
assert_raise(RuntimeError) { @matcher.read(123) }
|
73
|
-
|
74
|
-
# custom
|
75
|
-
@matcher.reader = lambda(&:key)
|
76
|
-
assert_equal @matcher.read(OpenStruct.new(key: "gub")), "gub"
|
77
77
|
end
|
78
78
|
|
79
79
|
def test_normalizer
|
80
80
|
# default
|
81
81
|
assert_equal @matcher.normalize(" HELLO,\tWORLD! "), "hello world"
|
82
|
-
|
83
|
-
# custom
|
84
|
-
@matcher.normalizer = lambda(&:upcase)
|
85
|
-
assert_equal @matcher.normalize("gub"), "GUB"
|
86
82
|
end
|
87
83
|
|
88
84
|
def test_ngrams
|
89
85
|
# default
|
90
86
|
assert_equal @matcher.ngrams("hi 42"), ["hi", "i ", " 4", "42"]
|
91
|
-
|
92
|
-
# custom
|
93
|
-
@matcher.ngrammer = lambda(&:split)
|
94
|
-
assert_equal @matcher.ngrams("hi 42"), ["hi", "42"]
|
95
87
|
end
|
96
88
|
|
97
89
|
def test_proc_options
|
98
|
-
matcher = Simhilarity::Matcher.new
|
90
|
+
matcher = Simhilarity::Matcher.new
|
91
|
+
matcher.reader = lambda(&:key)
|
92
|
+
matcher.normalizer = lambda(&:upcase)
|
93
|
+
matcher.ngrammer = lambda(&:split)
|
99
94
|
assert_equal matcher.read(OpenStruct.new(key: "gub")), "gub"
|
100
95
|
assert_equal matcher.normalize("gub"), "GUB"
|
101
96
|
assert_equal matcher.ngrams("hi 42"), ["hi", "42"]
|
102
97
|
end
|
103
98
|
|
104
|
-
def
|
105
|
-
|
106
|
-
|
99
|
+
def test_no_selfdups
|
100
|
+
# if you pass in the same list twice, it should ignore self-dups
|
101
|
+
list = ["hello, world", "hello there"]
|
102
|
+
@matcher.haystack = list
|
103
|
+
matches = @matcher.matches(@matcher.haystack)
|
104
|
+
assert_not_equal matches[0][1], "hello, world"
|
105
|
+
end
|
106
|
+
|
107
|
+
def test_haystack_required
|
108
|
+
# if you do not set a haystack, the matcher should yell
|
109
|
+
matcher = Simhilarity::Matcher.new
|
110
|
+
assert_raise RuntimeError do
|
111
|
+
matches = matcher.matches(['FOOM'])
|
112
|
+
end
|
107
113
|
end
|
108
114
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
|
115
|
+
def test_one_result_can_win_multiple_times
|
116
|
+
# We should be able to find the same piece of hay multiple times for
|
117
|
+
# different needles.
|
118
|
+
haystack = ['Black Sabbath', 'Led Zeppelin', 'The Doors',
|
119
|
+
'The Beatles', 'Neil Young']
|
120
|
+
needles = ['blak sabbath', 'black sabath', 'block soborch']
|
121
|
+
@matcher.haystack = haystack
|
122
|
+
|
123
|
+
# Whether matched individually or as a group, all of these needles
|
124
|
+
# should produce the same result.
|
125
|
+
matches = @matcher.matches(needles)
|
126
|
+
needles.each do |n|
|
127
|
+
matches.concat @matcher.matches([n])
|
128
|
+
end
|
129
|
+
|
130
|
+
matches.each do |n, h, s|
|
131
|
+
assert_equal 'Black Sabbath', h
|
132
|
+
end
|
113
133
|
end
|
114
134
|
|
115
135
|
def test_bin
|
@@ -122,4 +142,10 @@ class Tests < Test::Unit::TestCase
|
|
122
142
|
assert_system("#{bin} --candidates ngrams=3 identity.txt identity.txt")
|
123
143
|
assert_system("#{bin} --candidates all identity.txt identity.txt")
|
124
144
|
end
|
145
|
+
|
146
|
+
def test_candidates
|
147
|
+
assert_candidates(:all, 0.949)
|
148
|
+
assert_candidates(:ngrams, 0.949)
|
149
|
+
assert_candidates(:simhash, 0.949)
|
150
|
+
end
|
125
151
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhilarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bk
|
@@ -100,6 +100,7 @@ extensions: []
|
|
100
100
|
extra_rdoc_files: []
|
101
101
|
files:
|
102
102
|
- .gitignore
|
103
|
+
- .travis.yml
|
103
104
|
- Gemfile
|
104
105
|
- LICENSE
|
105
106
|
- README.md
|
@@ -107,11 +108,11 @@ files:
|
|
107
108
|
- bin/simhilarity
|
108
109
|
- lib/simhilarity.rb
|
109
110
|
- lib/simhilarity/bits.rb
|
110
|
-
- lib/simhilarity/bulk.rb
|
111
111
|
- lib/simhilarity/candidate.rb
|
112
|
+
- lib/simhilarity/candidates.rb
|
112
113
|
- lib/simhilarity/element.rb
|
113
114
|
- lib/simhilarity/matcher.rb
|
114
|
-
- lib/simhilarity/
|
115
|
+
- lib/simhilarity/score.rb
|
115
116
|
- lib/simhilarity/version.rb
|
116
117
|
- simhilarity.gemspec
|
117
118
|
- test/harness
|
@@ -140,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
141
|
version: '0'
|
141
142
|
segments:
|
142
143
|
- 0
|
143
|
-
hash:
|
144
|
+
hash: 3122268769366489382
|
144
145
|
requirements: []
|
145
146
|
rubyforge_project: simhilarity
|
146
147
|
rubygems_version: 1.8.24
|
data/lib/simhilarity/bulk.rb
DELETED
@@ -1,163 +0,0 @@
|
|
1
|
-
require "bk"
|
2
|
-
require "set"
|
3
|
-
|
4
|
-
module Simhilarity
|
5
|
-
# Match a set of needles against a haystack, in bulk. For example,
|
6
|
-
# this is used if you want to match 50 new addresses against your
|
7
|
-
# database of 1,000 known addresses.
|
8
|
-
class Bulk < Matcher
|
9
|
-
# default minimum number # of ngram overlaps with :ngrams
|
10
|
-
DEFAULT_NGRAM_OVERLAPS = 3
|
11
|
-
# default maximum hamming distance with :simhash
|
12
|
-
DEFAULT_SIMHASH_MAX_HAMMING = 7
|
13
|
-
|
14
|
-
# Initialize a new Bulk matcher. See Matcher#initialize. Bulk adds
|
15
|
-
# these options:
|
16
|
-
#
|
17
|
-
# * +candidates+: specifies which method to use for finding
|
18
|
-
# candidates. See the README for more details.
|
19
|
-
# * +ngrams_overlaps+: Minimum number of ngram overlaps, defaults
|
20
|
-
# to 3.
|
21
|
-
# * +simhash_max_hamming+: Maximum simhash hamming distance,
|
22
|
-
# defaults to 7.
|
23
|
-
def initialize(options = {})
|
24
|
-
super(options)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Match each item in +needles+ to an item in +haystack+. Returns
|
28
|
-
# an array of tuples, <tt>[needle, haystack, score]</tt>. Scores
|
29
|
-
# range from 0 to 1, with 1 being a perfect match and 0 being a
|
30
|
-
# terrible match.
|
31
|
-
def matches(needles, haystack)
|
32
|
-
# create Elements
|
33
|
-
if needles == haystack
|
34
|
-
needles = haystack = import_list(needles)
|
35
|
-
|
36
|
-
# set the corpus, to generate frequency weights
|
37
|
-
self.corpus = needles
|
38
|
-
else
|
39
|
-
needles = import_list(needles)
|
40
|
-
haystack = import_list(haystack)
|
41
|
-
|
42
|
-
# set the corpus, to generate frequency weights
|
43
|
-
self.corpus = (needles + haystack)
|
44
|
-
end
|
45
|
-
|
46
|
-
# get candidate matches
|
47
|
-
candidates = candidates(needles, haystack)
|
48
|
-
vputs " got #{candidates.length} candidates."
|
49
|
-
|
50
|
-
# pick winners
|
51
|
-
winners(needles, candidates)
|
52
|
-
end
|
53
|
-
|
54
|
-
protected
|
55
|
-
|
56
|
-
# Find candidates from +needles+ & +haystack+. The method used
|
57
|
-
# depends on the value of options[:candidates]
|
58
|
-
def candidates(needles, haystack)
|
59
|
-
method = options[:candidates]
|
60
|
-
method ||= (needles.length * haystack.length < 200000) ? :all : :simhash
|
61
|
-
|
62
|
-
case method
|
63
|
-
when /^ngrams=(\d+)$/
|
64
|
-
method = :ngrams
|
65
|
-
options[:ngram_overlaps] = $1.to_i
|
66
|
-
when /^simhash=(\d+)$/
|
67
|
-
method = :simhash
|
68
|
-
options[:simhash_max_hamming] = $1.to_i
|
69
|
-
end
|
70
|
-
|
71
|
-
method = "candidates_#{method}".to_sym
|
72
|
-
if !respond_to?(method)
|
73
|
-
raise "unsupported options[:candidates] #{options[:candidates].inspect}"
|
74
|
-
end
|
75
|
-
|
76
|
-
vputs "Using #{method} with needles=#{needles.length} haystack=#{haystack.length}..."
|
77
|
-
self.send(method, needles, haystack).map do |n, h|
|
78
|
-
Candidate.new(self, n, h)
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# Return ALL candidates. This only works for small datasets.
|
83
|
-
def candidates_all(needles, haystack)
|
84
|
-
needles.product(haystack)
|
85
|
-
end
|
86
|
-
|
87
|
-
# Return candidates that overlap with three or more matching
|
88
|
-
# ngrams. Only works for small datasets.
|
89
|
-
def candidates_ngrams(needles, haystack)
|
90
|
-
ngram_overlaps = options[:ngram_overlaps] || DEFAULT_NGRAM_OVERLAPS
|
91
|
-
|
92
|
-
candidates = []
|
93
|
-
veach(" ngrams #{ngram_overlaps}", needles) do |n|
|
94
|
-
ngrams_set = Set.new(n.ngrams)
|
95
|
-
haystack.each do |h|
|
96
|
-
count = 0
|
97
|
-
h.ngrams.each do |ngram|
|
98
|
-
if ngrams_set.include?(ngram)
|
99
|
-
if (count += 1) == ngram_overlaps
|
100
|
-
candidates << [n, h]
|
101
|
-
break
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
candidates
|
108
|
-
end
|
109
|
-
|
110
|
-
# Find candidates that are close based on hamming distance between
|
111
|
-
# the simhashes.
|
112
|
-
def candidates_simhash(needles, haystack)
|
113
|
-
max_hamming = options[:simhash_max_hamming] || DEFAULT_SIMHASH_MAX_HAMMING
|
114
|
-
|
115
|
-
# calculate this first so we get a nice progress bar
|
116
|
-
veach(" simhash", corpus) { |i| i.simhash }
|
117
|
-
|
118
|
-
# build the bk tree
|
119
|
-
bk = BK::Tree.new(lambda { |a, b| Bits.hamming32(a.simhash, b.simhash) })
|
120
|
-
veach(" bktree", haystack) { |i| bk.add(i) }
|
121
|
-
|
122
|
-
# search for candidates with low hamming distance
|
123
|
-
candidates = []
|
124
|
-
veach(" hamming #{max_hamming}", needles) do |n|
|
125
|
-
bk.query(n, max_hamming).each do |h, distance|
|
126
|
-
candidates << [n, h]
|
127
|
-
end
|
128
|
-
end
|
129
|
-
candidates
|
130
|
-
end
|
131
|
-
|
132
|
-
# walk candidates by score, pick winners
|
133
|
-
def winners(needles, candidates)
|
134
|
-
# calculate this first so we get a nice progress bar
|
135
|
-
veach("Scoring", candidates) { |i| i.score }
|
136
|
-
|
137
|
-
# score the candidates
|
138
|
-
candidates = candidates.sort_by { |i| -i.score }
|
139
|
-
|
140
|
-
# walk them, eliminate dups
|
141
|
-
seen = Set.new
|
142
|
-
winners = candidates.map do |i|
|
143
|
-
next if seen.include?(i.a) || seen.include?(i.b)
|
144
|
-
seen << i.a
|
145
|
-
seen << i.b
|
146
|
-
i
|
147
|
-
end.compact
|
148
|
-
|
149
|
-
# build map from needle => candidate...
|
150
|
-
needle_to_winner = { }
|
151
|
-
winners.each { |i| needle_to_winner[i.a] = i }
|
152
|
-
|
153
|
-
# so we can return in the original order
|
154
|
-
needles.map do |i|
|
155
|
-
if candidate = needle_to_winner[i]
|
156
|
-
[ i.opaque, candidate.b.opaque, candidate.score ]
|
157
|
-
else
|
158
|
-
[ i.opaque, nil, nil ]
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
data/lib/simhilarity/single.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require "set"
|
2
|
-
|
3
|
-
module Simhilarity
|
4
|
-
# Calculate the similarity score for pairs of items, one at a time.
|
5
|
-
class Single < Matcher
|
6
|
-
# See Matcher#initialize.
|
7
|
-
def initialize(options = {})
|
8
|
-
super(options)
|
9
|
-
end
|
10
|
-
|
11
|
-
# Calculate the similarity score for these two items. Scores range
|
12
|
-
# from 0 to 1, with 1 being a perfect match and 0 being a terrible
|
13
|
-
# match. For best results, call #corpus= first.
|
14
|
-
def score(a, b)
|
15
|
-
Candidate.new(self, element_for(a), element_for(b)).score
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|