simhilarity 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +165 -0
- data/Rakefile +18 -0
- data/bin/simhilarity +84 -0
- data/lib/simhilarity/bits.rb +62 -0
- data/lib/simhilarity/bulk.rb +163 -0
- data/lib/simhilarity/candidate.rb +46 -0
- data/lib/simhilarity/element.rb +50 -0
- data/lib/simhilarity/matcher.rb +164 -0
- data/lib/simhilarity/single.rb +18 -0
- data/lib/simhilarity/version.rb +4 -0
- data/lib/simhilarity.rb +8 -0
- data/simhilarity.gemspec +27 -0
- data/test/harness +138 -0
- data/test/identity.txt +1 -0
- data/test/large_haystack.txt +10000 -0
- data/test/large_needles.txt +500 -0
- data/test/sample.csv +2669 -0
- data/test/tests.rb +125 -0
- metadata +156 -0
@@ -0,0 +1,164 @@
|
|
1
|
+
require "progressbar"
|
2
|
+
|
3
|
+
module Simhilarity
  # Abstract superclass for matching. Mainly a container for options,
  # the corpus, and the three overridable text-processing steps
  # (read -> normalize -> ngrams).
  class Matcher
    # Options used to create this Matcher.
    attr_accessor :options

    # Proc for turning needle/haystack elements into strings. You can
    # leave this nil if the elements are already strings. See
    # Matcher#read for the default implementation.
    attr_accessor :reader

    # Proc for normalizing input strings. See Matcher#normalize for the
    # default implementation.
    attr_accessor :normalizer

    # Proc for generating ngrams from a normalized string. See
    # Matcher#ngrams for the default implementation.
    attr_accessor :ngrammer

    # Ngram frequency weights from the corpus, or 1 if the ngram isn't
    # in the corpus.
    attr_accessor :freq

    # Create a new Matcher. Options include:
    #
    # * +reader+: Proc for turning opaque items into strings.
    # * +normalizer+: Proc for normalizing strings.
    # * +ngrammer+: Proc for generating ngrams.
    # * +verbose+: If true, show progress bars and timing.
    def initialize(options = {})
      @options = options

      # procs
      self.reader = options[:reader]
      self.normalizer = options[:normalizer]
      self.ngrammer = options[:ngrammer]

      # until a corpus is set, every ngram weighs 1
      self.freq = Hash.new(1)
    end

    # Set the corpus. Calculates ngram frequencies (#freq) for future
    # scoring.
    def corpus=(corpus)
      @corpus = corpus

      # calculate ngram counts for the corpus
      counts = Hash.new(0)
      veach("Corpus", import_list(corpus)) do |element|
        element.ngrams.each { |ngram| counts[ngram] += 1 }
      end

      # turn counts into inverse frequencies: rarer ngrams weigh more
      total = counts.values.inject(0, :+).to_f
      weights = Hash.new(1)
      counts.each { |ngram, count| weights[ngram] = total / count }
      self.freq = weights
    end

    # The current corpus.
    def corpus
      @corpus
    end

    # Turn an opaque item from the user into a string. Uses #reader if
    # one was supplied, otherwise accepts only Strings and raises a
    # RuntimeError for anything else.
    def read(opaque)
      return reader.call(opaque) if reader
      return opaque if opaque.is_a?(String)

      raise "can't turn #{opaque.inspect} into string"
    end

    # Normalize an incoming string from the user. Uses #normalizer if
    # one was supplied. The default lowercases, replaces everything
    # outside a-z/0-9 with a space, then squishes whitespace.
    def normalize(incoming_str)
      return normalizer.call(incoming_str) if normalizer

      str = incoming_str.downcase
      str = str.gsub(/[^a-z0-9]/, " ")
      # squish whitespace
      str.gsub(/\s+/, " ").strip
    end

    # Generate ngrams from a normalized str. Uses #ngrammer if one was
    # supplied. The default returns the unique set of two-character
    # ngrams plus every run of digits.
    def ngrams(str)
      return ngrammer.call(str) if ngrammer

      # two letter ngrams (bigrams)
      bigrams = str.each_char.each_cons(2).map(&:join)
      # runs of digits
      digit_runs = str.scan(/\d+/)
      (bigrams + digit_runs).uniq
    end

    # Sum up the frequency weights of the +ngrams+. Returns 0 for an
    # empty list.
    def ngrams_sum(ngrams)
      # go through the accessor, like the rest of the class
      ngrams.inject(0) { |sum, ngram| sum + freq[ngram] }
    end

    # Calculate the frequency weighted
    # simhash[http://matpalm.com/resemblance/simhash/] of the
    # +ngrams+.
    def simhash(ngrams)
      Bits.simhash32(freq, ngrams)
    end

    def inspect #:nodoc:
      "Matcher"
    end

    protected

    # Turn a list of user supplied opaque items into a list of
    # Elements (if necessary). Only the first item is inspected to
    # decide whether the list has already been converted.
    def import_list(list)
      return list if list.first.is_a?(Element)

      list.map { |opaque| element_for(opaque) }
    end

    # Turn a user's opaque item into an Element.
    def element_for(opaque)
      Element.new(self, opaque)
    end

    # Puts (to stderr) if options[:verbose]
    def vputs(s)
      $stderr.puts s if options[:verbose]
    end

    # Like each, but with a progress bar if options[:verbose]. Returns
    # whatever +array.each+ returns.
    def veach(title, array, &block)
      unless options[:verbose]
        return array.each { |i| yield(i) }
      end

      pb = nil
      begin
        pb = ProgressBar.new(title, array.length)
        array.each do |i|
          yield(i)
          pb.inc
        end
      ensure
        # pb is still nil if ProgressBar.new itself raised; calling
        # finish on nil would mask the original exception.
        pb.finish if pb
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
module Simhilarity
  # Matcher that scores one pair of items per call.
  class Single < Matcher
    # Takes the same options as Matcher#initialize.
    def initialize(options = {})
      super
    end

    # Score how similar +a+ and +b+ are. Scores range from 0 to 1,
    # with 1 being a perfect match and 0 being a terrible match. For
    # best results, call #corpus= first.
    def score(a, b)
      left = element_for(a)
      right = element_for(b)
      Candidate.new(self, left, right).score
    end
  end
end
|
data/lib/simhilarity.rb
ADDED
data/simhilarity.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "simhilarity/version"
|
2
|
+
|
3
|
+
# Gem metadata for simhilarity. File lists are taken from git, so the
# gemspec must be evaluated inside a git checkout.
Gem::Specification.new do |s|
  s.name = "simhilarity"
  s.version = Simhilarity::VERSION
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = ">= 1.9.0"
  s.authors = ["Adam Doppelt"]
  s.email = ["amd@gurge.com"]
  s.homepage = "http://github.com/gurgeous/simhilarity"
  s.summary = "Simhilarity - measure text similarity using frequency weighted ngrams."
  s.description = "Measure text similarity using frequency weighted ngrams."

  # RubyGems has deprecated this attribute and announced its removal;
  # only set it where the writer still exists so the gemspec keeps
  # loading on newer RubyGems.
  s.rubyforge_project = "simhilarity" if s.respond_to?(:rubyforge_project=)

  s.add_runtime_dependency "bk"
  s.add_runtime_dependency "progressbar"

  s.add_development_dependency "awesome_print"
  s.add_development_dependency "rake"
  s.add_development_dependency "rdoc"

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- test/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map { |i| File.basename(i) }
  s.require_paths = ["lib"]
end
|
data/test/harness
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
|
5
|
+
require "awesome_print"
|
6
|
+
require "csv"
|
7
|
+
require "ostruct"
|
8
|
+
require "simhilarity"
|
9
|
+
|
10
|
+
#
|
11
|
+
# This is a test harness for measuring the speed and accuracy of the
|
12
|
+
# different searchers. Feel free to ignore.
|
13
|
+
#
|
14
|
+
|
15
|
+
#
|
16
|
+
# Results on an i5 3ghz, with 500 needles and 10,000 haystacks:
|
17
|
+
#
|
18
|
+
#
|
19
|
+
|
20
|
+
# Benchmark driver: loads a dataset, builds the corpus, then prints one
# tab-separated row per candidate-selection configuration.
class Harness
  # Loads the large dataset, converts it to Elements, primes the
  # matcher and immediately runs the whole benchmark.
  def initialize
    @matcher = Simhilarity::Bulk.new

    # load data
    $stderr.puts "Loading..."
    @data = dataset_large
    @data.needles = @data.needles.map { |i| @matcher.send(:element_for, i) }
    @data.haystack = @data.haystack.map { |i| @matcher.send(:element_for, i) }
    @matcher.corpus = @data.needles + @data.haystack

    # exclude this initialization from our benchmarks
    @matcher.corpus.each(&:simhash)

    $stderr.puts "Harness using needles/haystack = #{@data.needles.length}/#{@data.haystack.length}..."
    run
  end

  # Needles and haystack read line by line from the two large text
  # files (paths are relative to the current directory). This dataset
  # has no known-correct matches.
  def dataset_large
    needles = File.readlines("large_needles.txt").map(&:chomp)
    haystack = File.readlines("large_haystack.txt").map(&:chomp)
    OpenStruct.new(needles: needles, haystack: haystack)
  end

  # Needles (first column) and haystack (second column) from
  # sample.csv. A row with both columns present also records the
  # expected needle => haystack match.
  def dataset_small
    needles, haystack, matches = [], [], {}
    CSV.read("sample.csv").each do |cols|
      n, h = *cols
      needles << n if n
      haystack << h if h
      matches[n] = h if n && h
    end
    OpenStruct.new(needles: needles, haystack: haystack, matches: matches)
  end

  # all
  # ngrams: 5,4,3
  # simhash: 5,6,7,8,9

  #
  # compare the various candidate methods
  #

  def run
    # header
    cols = [:title, :candidates, :s30, :s40, :s50, :c_tm, :s_tm, :correct]
    puts cols.join("\t")

    # simhash: 5..9
    @matcher.options[:candidates] = :simhash
    5.upto(9) do |i|
      @matcher.options[:simhash_max_hamming] = i
      report("simhash #{i}")
    end

    # ngrams: 5..3
    @matcher.options[:candidates] = :ngrams
    5.downto(3) do |i|
      @matcher.options[:ngram_overlaps] = i
      report("ngrams #{i}")
    end

    # all
    @matcher.options[:candidates] = :all
    report("all")
  end

  # Time candidate generation and winner selection for the matcher's
  # current options and print one tab-separated row: title, number of
  # candidates, winners scoring above 0.3/0.4/0.5, the two timings in
  # seconds, and (only when the dataset has known matches) the
  # fraction of needles matched correctly.
  def report(title)
    # candidates
    tm1 = Time.now
    candidates = @matcher.send(:candidates, @data.needles, @data.haystack)
    tm1 = Time.now - tm1

    # winners
    tm2 = Time.now
    winners = @matcher.send(:winners, @data.needles, candidates)
    tm2 = Time.now - tm2
    winners = winners.sort_by { |_n, _h, score| -(score || 0) }

    s30 = winners.count { |_n, _h, score| score && score > 0.3 }
    s40 = winners.count { |_n, _h, score| score && score > 0.4 }
    s50 = winners.count { |_n, _h, score| score && score > 0.5 }

    tm1 = sprintf("%.3f", tm1)
    tm2 = sprintf("%.3f", tm2)

    cols = [title, candidates.length, s30, s40, s50, tm1, tm2]
    if @data.matches
      correct = winners.count { |n, h, _score| @data.matches[n] == h }
      correct = correct.to_f / @data.needles.length
      cols << sprintf("%.3f", correct)
    end
    puts cols.join("\t")
    $stdout.flush
  end

  # Print every winner scoring above 0.5. From the second call on,
  # also print (after a blank line) the high quality winners that
  # were not in the previous call's set.
  def dump_results(winners)
    high_quality = winners.select { |_n, _h, score| score && score > 0.5 }

    # full report
    high_quality.each do |n, h, score|
      printf("%4.2f %-35s %-35s\n", score || 0, n, h)
    end

    # which high quality matches were added?
    if @last
      puts
      added = high_quality - @last
      added.each do |n, h, score|
        printf("%4.2f %-35s %-35s\n", score || 0, n, h)
      end
    end
    @last = high_quality
  end
end
|
136
|
+
|
137
|
+
# Run from this script's own directory so the relative dataset paths
# resolve, then kick off the benchmark (Harness#initialize runs it).
harness_dir = File.dirname(File.expand_path(__FILE__))
Dir.chdir(harness_dir)
Harness.new
|
data/test/identity.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
hello, world!
|