simhilarity 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +165 -0
- data/Rakefile +18 -0
- data/bin/simhilarity +84 -0
- data/lib/simhilarity/bits.rb +62 -0
- data/lib/simhilarity/bulk.rb +163 -0
- data/lib/simhilarity/candidate.rb +46 -0
- data/lib/simhilarity/element.rb +50 -0
- data/lib/simhilarity/matcher.rb +164 -0
- data/lib/simhilarity/single.rb +18 -0
- data/lib/simhilarity/version.rb +4 -0
- data/lib/simhilarity.rb +8 -0
- data/simhilarity.gemspec +27 -0
- data/test/harness +138 -0
- data/test/identity.txt +1 -0
- data/test/large_haystack.txt +10000 -0
- data/test/large_needles.txt +500 -0
- data/test/sample.csv +2669 -0
- data/test/tests.rb +125 -0
- metadata +156 -0
@@ -0,0 +1,164 @@
|
|
1
|
+
require "progressbar"
|
2
|
+
|
3
|
+
module Simhilarity
  # Abstract superclass for matching. Mainly a container for the options,
  # the optional reader/normalizer/ngrammer procs, the corpus and the
  # ngram frequency weights calculated from that corpus.
  class Matcher
    # Options used to create this Matcher.
    attr_accessor :options

    # Proc for turning needle/haystack elements into strings. You can
    # leave this nil if the elements are already strings. See
    # Matcher#read for the default implementation.
    attr_accessor :reader

    # Proc for normalizing input strings. See Matcher#normalize
    # for the default implementation.
    attr_accessor :normalizer

    # Proc for generating ngrams from a normalized string. See
    # Matcher#ngrams for the default implementation.
    attr_accessor :ngrammer

    # Ngram frequency weights from the corpus, or 1 if the ngram isn't
    # in the corpus.
    attr_accessor :freq

    # Create a new Matcher. Options include:
    #
    # * +reader+: Proc for turning opaque items into strings.
    # * +normalizer+: Proc for normalizing strings.
    # * +ngrammer+: Proc for generating ngrams.
    # * +verbose+: If true, show progress bars and timing.
    def initialize(options = {})
      @options = options

      # procs (each may be nil, in which case the built-in default is used)
      self.reader = options[:reader]
      self.normalizer = options[:normalizer]
      self.ngrammer = options[:ngrammer]

      # until a corpus is set, every ngram weighs 1
      self.freq = Hash.new(1)
    end

    # Set the corpus. Calculates ngram frequencies (#freq) for future
    # scoring. +corpus+ is a list of opaque items or of Elements.
    def corpus=(corpus)
      @corpus = corpus

      # tally ngram occurrences across the corpus
      counts = Hash.new(0)
      veach("Corpus", import_list(corpus)) do |element|
        element.ngrams.each { |ngram| counts[ngram] += 1 }
      end

      # turn counts into inverse frequencies, so rarer ngrams weigh more;
      # ngrams missing from the corpus keep the default weight of 1
      total = counts.values.inject(0, :+).to_f
      weights = Hash.new(1)
      counts.each { |ngram, count| weights[ngram] = total / count }
      self.freq = weights
    end

    # The current corpus, exactly as it was passed to #corpus=.
    def corpus
      @corpus
    end

    # Turn an opaque item from the user into a string. Uses the +reader+
    # proc if there is one, otherwise passes Strings through untouched.
    # Raises a RuntimeError for anything else.
    def read(opaque)
      return reader.call(opaque) if reader
      return opaque if opaque.is_a?(String)

      raise "can't turn #{opaque.inspect} into string"
    end

    # Normalize an incoming string from the user. Uses the +normalizer+
    # proc if there is one. The default downcases, replaces everything
    # except a-z and 0-9 with spaces, then squishes and strips whitespace.
    def normalize(incoming_str)
      return normalizer.call(incoming_str) if normalizer

      incoming_str.downcase.gsub(/[^a-z0-9]/, " ").gsub(/\s+/, " ").strip
    end

    # Generate ngrams from a normalized str. Uses the +ngrammer+ proc if
    # there is one. The default returns the unique two letter ngrams
    # (bigrams) plus each run of digits.
    def ngrams(str)
      return ngrammer.call(str) if ngrammer

      bigrams = str.each_char.each_cons(2).map(&:join)
      digit_runs = str.scan(/\d+/)
      (bigrams + digit_runs).uniq
    end

    # Sum up the frequency weights of the +ngrams+. Returns 0 for an
    # empty list.
    def ngrams_sum(ngrams)
      ngrams.map { |ngram| freq[ngram] }.inject(&:+) || 0
    end

    # Calculate the frequency weighted
    # simhash[http://matpalm.com/resemblance/simhash/] of the
    # +ngrams+. Delegates to Bits.simhash32.
    def simhash(ngrams)
      Bits.simhash32(freq, ngrams)
    end

    def inspect #:nodoc:
      "Matcher"
    end

    protected

    # Turn a list of user supplied opaque items into a list of
    # Elements (if necessary). Only the first item is inspected to decide
    # whether the list has already been converted.
    def import_list(list)
      return list if list.first.is_a?(Element)

      list.map { |opaque| element_for(opaque) }
    end

    # Turn a user's opaque item into an Element.
    def element_for(opaque)
      Element.new(self, opaque)
    end

    # Puts to $stderr if options[:verbose]
    def vputs(s)
      $stderr.puts s if options[:verbose]
    end

    # Like each, but with a progress bar if options[:verbose]
    def veach(title, array, &block)
      unless options[:verbose]
        return array.each { |item| yield(item) }
      end

      progress = nil
      begin
        progress = ProgressBar.new(title, array.length)
        array.each do |item|
          yield(item)
          progress.inc
        end
      ensure
        # progress is still nil if the bar could not be created; guard so
        # the cleanup does not mask the original exception
        progress.finish if progress
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
module Simhilarity
  # Matcher that scores one pair of items per call.
  class Single < Matcher
    # Takes the same options as Matcher#initialize.
    def initialize(options = {})
      super
    end

    # Return the similarity score for the items +a+ and +b+. Scores range
    # from 0 to 1, with 1 being a perfect match and 0 being a terrible
    # match. For best results, call #corpus= first.
    def score(a, b)
      left = element_for(a)
      right = element_for(b)
      Candidate.new(self, left, right).score
    end
  end
end
|
data/lib/simhilarity.rb
ADDED
data/simhilarity.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "simhilarity/version"
|
2
|
+
|
3
|
+
# Gem specification for simhilarity. The packaged files, test files and
# executables are all taken from what git tracks, so the gem must be built
# from inside a git checkout.
Gem::Specification.new do |s|
  s.name = "simhilarity"
  s.version = Simhilarity::VERSION
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = ">= 1.9.0"
  s.authors = ["Adam Doppelt"]
  s.email = ["amd@gurge.com"]
  s.homepage = "http://github.com/gurgeous/simhilarity"
  s.summary = "Simhilarity - measure text similarity using frequency weighted ngrams."
  s.description = "Measure text similarity using frequency weighted ngrams."

  # NOTE: rubyforge_project is intentionally not set. RubyForge no longer
  # exists and the attribute is deprecated in RubyGems.

  s.add_runtime_dependency "bk"
  s.add_runtime_dependency "progressbar"

  s.add_development_dependency "awesome_print"
  s.add_development_dependency "rake"
  s.add_development_dependency "rdoc"

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- test/*`.split("\n")
  # executables are listed by basename only (bin/simhilarity -> simhilarity)
  s.executables = `git ls-files -- bin/*`.split("\n").map { |i| File.basename(i) }
  s.require_paths = ["lib"]
end
|
data/test/harness
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
|
5
|
+
require "awesome_print"
|
6
|
+
require "csv"
|
7
|
+
require "ostruct"
|
8
|
+
require "simhilarity"
|
9
|
+
|
10
|
+
#
|
11
|
+
# This is a test harness for measuring the speed and accuracy of the
|
12
|
+
# different searchers. Feel free to ignore.
|
13
|
+
#
|
14
|
+
|
15
|
+
#
|
16
|
+
# Results on a 3 GHz i5, with 500 needles and a 10,000 entry haystack:
|
17
|
+
#
|
18
|
+
#
|
19
|
+
|
20
|
+
# Benchmark driver: builds a Simhilarity::Bulk matcher, loads a dataset and
# prints one tab separated row per candidate-selection configuration.
# Constructing a Harness runs the whole benchmark.
class Harness
  # Load the large dataset, convert needles and haystack with the matcher's
  # element_for, set the corpus, then call #run.
  def initialize
    @matcher = Simhilarity::Bulk.new

    # load data
    $stderr.puts "Loading..."
    @data = dataset_large
    @data.needles = @data.needles.map { |i| @matcher.send(:element_for, i) }
    @data.haystack = @data.haystack.map { |i| @matcher.send(:element_for, i) }
    @matcher.corpus = @data.needles + @data.haystack

    # exclude this initialization from our benchmarks
    # (presumably simhash is cached per element - confirm in Element)
    @matcher.corpus.each(&:simhash)

    $stderr.puts "Harness using needles/haystack = #{@data.needles.length}/#{@data.haystack.length}..."
    run
  end

  # Needles and haystack read line by line from the two large text files
  # in the current directory. This dataset has no known matches.
  def dataset_large
    needles = File.readlines("large_needles.txt").map(&:chomp)
    haystack = File.readlines("large_haystack.txt").map(&:chomp)
    OpenStruct.new(needles: needles, haystack: haystack)
  end

  # Needles, haystack and the known needle => haystack matches read from
  # sample.csv. Column 1 is the needle, column 2 the haystack entry; a row
  # with both columns records a match.
  def dataset_small
    needles, haystack, matches = [], [], {}
    CSV.read("sample.csv").each do |cols|
      n, h = *cols
      needles << n if n
      haystack << h if h
      matches[n] = h if n && h
    end
    OpenStruct.new(needles: needles, haystack: haystack, matches: matches)
  end

  # all
  # ngrams: 5,4,3
  # simhash: 5,6,7,8,9

  #
  # compare the various candidate methods
  #

  # Print the header row, then one #report row for each configuration.
  def run
    # header
    cols = [:title, :candidates, :s30, :s40, :s50, :c_tm, :s_tm, :correct]
    puts cols.join("\t")

    # simhash: 5..9
    @matcher.options[:candidates] = :simhash
    5.upto(9).each do |i|
      @matcher.options[:simhash_max_hamming] = i
      report("simhash #{i}")
    end

    # ngrams: 5..3
    @matcher.options[:candidates] = :ngrams
    5.downto(3).each do |i|
      @matcher.options[:ngram_overlaps] = i
      report("ngrams #{i}")
    end

    # all
    @matcher.options[:candidates] = :all
    report("all")
  end

  # Time candidate generation and winner selection with the matcher's
  # current options, then print a tab separated row: title, number of
  # candidates, winners scoring above 0.3/0.4/0.5, the two timings in
  # seconds and, when the dataset has known matches, the fraction of
  # needles matched correctly.
  def report(title)
    # candidates
    tm1 = Time.now
    candidates = @matcher.send(:candidates, @data.needles, @data.haystack)
    tm1 = Time.now - tm1

    # winners
    tm2 = Time.now
    winners = @matcher.send(:winners, @data.needles, candidates)
    tm2 = Time.now - tm2
    # best score first; a missing score sorts as 0
    winners = winners.sort_by { |_n, _h, score| -(score || 0) }

    s30 = winners.count { |_n, _h, score| score && score > 0.3 }
    s40 = winners.count { |_n, _h, score| score && score > 0.4 }
    s50 = winners.count { |_n, _h, score| score && score > 0.5 }

    tm1 = sprintf("%.3f", tm1)
    tm2 = sprintf("%.3f", tm2)

    cols = [title, candidates.length, s30, s40, s50, tm1, tm2]
    if @data.matches
      correct = winners.select { |n, h, _score| @data.matches[n] == h }
      correct = correct.length.to_f / @data.needles.length
      correct = sprintf("%.3f", correct)
      cols << correct
    end
    puts cols.join("\t")
    $stdout.flush
  end

  # Print every winner scoring above 0.5. From the second call on, also
  # print a blank line followed by the high quality winners that were not
  # in the previous call's list.
  def dump_results(winners)
    high_quality = winners.select { |_n, _h, score| score && score > 0.5 }

    # full report
    high_quality.each do |n, h, score|
      printf("%4.2f %-35s %-35s\n", score || 0, n, h)
    end

    # which high quality matches were added?
    if @last
      puts
      added = high_quality - @last
      added.each do |n, h, score|
        printf("%4.2f %-35s %-35s\n", score || 0, n, h)
      end
    end
    @last = high_quality
  end
end
|
136
|
+
|
137
|
+
# Switch to the directory holding this script, so the data files it reads by
# relative name are found, then run the benchmark (Harness#initialize does
# all the work).
harness_dir = File.expand_path("../", __FILE__)
Dir.chdir(harness_dir)
Harness.new
|
data/test/identity.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
hello, world!
|