simhilarity 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ require "progressbar"
2
+
3
module Simhilarity
  # Abstract superclass for matching. Mainly a container for the
  # options, the reader/normalizer/ngrammer procs, the corpus and the
  # ngram frequency weights calculated from the corpus.
  class Matcher
    # Options used to create this Matcher.
    attr_accessor :options

    # Proc for turning needle/haystack elements into strings. You can
    # leave this nil if the elements are already strings. See
    # Matcher#read for the default implementation.
    attr_accessor :reader

    # Proc for normalizing input strings. See Matcher#normalize
    # for the default implementation.
    attr_accessor :normalizer

    # Proc for generating ngrams from a normalized string. See
    # Matcher#ngrams for the default implementation.
    attr_accessor :ngrammer

    # Ngram frequency weights from the corpus, or 1 if the ngram isn't
    # in the corpus.
    attr_accessor :freq

    # The current corpus, exactly as it was passed to #corpus=.
    attr_reader :corpus

    # Create a new Matcher. Options include:
    #
    # * +reader+: Proc for turning opaque items into strings.
    # * +normalizer+: Proc for normalizing strings.
    # * +ngrammer+: Proc for generating ngrams.
    # * +verbose+: If true, show progress bars and timing.
    def initialize(options = {})
      @options = options

      # procs (each one may be nil, which selects the default behavior)
      self.reader = options[:reader]
      self.normalizer = options[:normalizer]
      self.ngrammer = options[:ngrammer]

      # until a corpus is set every ngram weighs 1
      self.freq = Hash.new(1)
    end

    # Set the corpus. Calculates ngram frequencies (#freq) for future
    # scoring.
    def corpus=(corpus)
      @corpus = corpus

      # count how many corpus elements contain each ngram
      counts = Hash.new(0)
      veach("Corpus", import_list(corpus)) do |element|
        element.ngrams.each { |ngram| counts[ngram] += 1 }
      end

      # turn counts into inverse frequencies; ngrams that never
      # occurred in the corpus keep the default weight of 1
      total = counts.values.inject(0, :+).to_f
      weights = Hash.new(1)
      counts.each { |ngram, count| weights[ngram] = total / count }
      self.freq = weights
    end

    # Turn an opaque item from the user into a string. Uses #reader if
    # one was supplied, otherwise the item must already be a String.
    # Raises a RuntimeError for anything else.
    def read(opaque)
      return reader.call(opaque) if reader
      return opaque if opaque.is_a?(String)

      raise "can't turn #{opaque.inspect} into string"
    end

    # Normalize an incoming string from the user. Uses #normalizer if
    # one was supplied. The default downcases, replaces every character
    # outside a-z and 0-9 with a space, squishes runs of whitespace and
    # strips the ends.
    def normalize(incoming_str)
      return normalizer.call(incoming_str) if normalizer

      incoming_str.downcase.gsub(/[^a-z0-9]/, " ").gsub(/\s+/, " ").strip
    end

    # Generate ngrams from a normalized str. Uses #ngrammer if one was
    # supplied. The default returns the unique two character bigrams
    # plus each run of digits.
    def ngrams(str)
      return ngrammer.call(str) if ngrammer

      # two letter ngrams (bigrams)
      bigrams = str.each_char.each_cons(2).map(&:join)
      # runs of digits
      digit_runs = str.scan(/\d+/)
      (bigrams + digit_runs).uniq
    end

    # Sum up the frequency weights of the +ngrams+. Returns 0 for an
    # empty list.
    def ngrams_sum(ngrams)
      ngrams.map { |ngram| freq[ngram] }.inject(0, :+)
    end

    # Calculate the frequency weighted
    # simhash[http://matpalm.com/resemblance/simhash/] of the
    # +ngrams+.
    def simhash(ngrams)
      Bits.simhash32(freq, ngrams)
    end

    def inspect #:nodoc:
      "Matcher"
    end

    protected

    # Turn a list of user supplied opaque items into a list of
    # Elements (if necessary). Only the first item is inspected to
    # decide whether the list has already been converted.
    def import_list(list)
      return list if list.first.is_a?(Element)

      list.map { |opaque| element_for(opaque) }
    end

    # Turn a user's opaque item into an Element.
    def element_for(opaque)
      Element.new(self, opaque)
    end

    # Puts to stderr if options[:verbose]
    def vputs(s)
      $stderr.puts s if options[:verbose]
    end

    # Like each, but with a progress bar if options[:verbose]
    def veach(title, array, &block)
      unless options[:verbose]
        return array.each { |item| yield(item) }
      end

      # Create the bar before entering begin/ensure. If construction
      # fails there is nothing to finish, and the original error must
      # propagate instead of a NoMethodError on nil.
      pb = ProgressBar.new(title, array.length)
      begin
        array.each do |item|
          yield(item)
          pb.inc
        end
      ensure
        pb.finish
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require "set"
2
+
3
module Simhilarity
  # Matcher that scores one pair of items per call.
  class Single < Matcher
    # Takes the same options as Matcher#initialize.
    def initialize(options = {})
      super
    end

    # Similarity score for the pair +a+ and +b+. Scores range from 0
    # (a terrible match) to 1 (a perfect match). Set #corpus= before
    # scoring for best results.
    def score(a, b)
      left = element_for(a)
      right = element_for(b)
      Candidate.new(self, left, right).score
    end
  end
end
@@ -0,0 +1,4 @@
1
module Simhilarity
  # Gem version. Frozen so the shared constant can't be mutated in
  # place by accident.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,8 @@
1
+ require "simhilarity/bits"
2
+ require "simhilarity/candidate"
3
+ require "simhilarity/element"
4
+ require "simhilarity/matcher"
5
+ require "simhilarity/version"
6
+
7
+ require "simhilarity/bulk"
8
+ require "simhilarity/single"
@@ -0,0 +1,27 @@
1
+ require "simhilarity/version"
2
+
3
# Gem specification for simhilarity.
Gem::Specification.new do |spec|
  # identity
  spec.name = "simhilarity"
  spec.version = Simhilarity::VERSION
  spec.platform = Gem::Platform::RUBY
  spec.required_ruby_version = ">= 1.9.0"

  # author and description
  spec.authors = ["Adam Doppelt"]
  spec.email = ["amd@gurge.com"]
  spec.homepage = "http://github.com/gurgeous/simhilarity"
  spec.summary = "Simhilarity - measure text similarity using frequency weighted ngrams."
  spec.description = "Measure text similarity using frequency weighted ngrams."

  spec.rubyforge_project = "simhilarity"

  # dependencies, declared in the same order as before
  %w[bk progressbar].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
  %w[awesome_print rake rdoc].each do |gem_name|
    spec.add_development_dependency gem_name
  end

  # file lists come from whatever git is tracking
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- test/*`.split("\n")
  tracked_bins = `git ls-files -- bin/*`.split("\n")

  spec.files = tracked_files
  spec.test_files = tracked_tests
  spec.executables = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
data/test/harness ADDED
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+
5
+ require "awesome_print"
6
+ require "csv"
7
+ require "ostruct"
8
+ require "simhilarity"
9
+
10
+ #
11
+ # This is a test harness for measuring the speed and accuracy of the
12
+ # different searchers. Feel free to ignore.
13
+ #
14
+
15
+ #
16
+ # Results on an i5 3ghz, with 500 needles and 10,000 haystacks:
17
+ #
18
+ #
19
+
20
# Benchmark driver: loads a dataset, builds the corpus on a
# Simhilarity::Bulk matcher, then prints one tab separated row per
# candidate selection strategy.
class Harness
  # Loads the large dataset, converts needles and haystack into
  # elements, sets the corpus and immediately runs the benchmark.
  def initialize
    @matcher = Simhilarity::Bulk.new

    # load data
    $stderr.puts "Loading..."
    @data = dataset_large
    @data.needles = @data.needles.map { |i| @matcher.send(:element_for, i) }
    @data.haystack = @data.haystack.map { |i| @matcher.send(:element_for, i) }
    @matcher.corpus = @data.needles + @data.haystack

    # exclude this initialization from our benchmarks
    @matcher.corpus.each(&:simhash)

    $stderr.puts "Harness using needles/haystack = #{@data.needles.length}/#{@data.haystack.length}..."
    run
  end

  # Needles and haystack read line by line from large_needles.txt and
  # large_haystack.txt in the current directory. Has no known matches.
  def dataset_large
    needles = File.readlines("large_needles.txt").map(&:chomp)
    haystack = File.readlines("large_haystack.txt").map(&:chomp)
    OpenStruct.new(needles: needles, haystack: haystack)
  end

  # Needles (first column) and haystack (second column) read from
  # sample.csv. Rows with both columns are recorded as known matches.
  def dataset_small
    needles, haystack, matches = [], [], {}
    CSV.read("sample.csv").each do |cols|
      n, h = *cols
      needles << n if n
      haystack << h if h
      matches[n] = h if n && h
    end
    OpenStruct.new(needles: needles, haystack: haystack, matches: matches)
  end

  # all
  # ngrams: 5,4,3
  # simhash: 5,6,7,8,9

  #
  # compare the various candidate methods
  #

  # Prints the header row, then one report row per configuration.
  def run
    # header
    cols = [:title, :candidates, :s30, :s40, :s50, :c_tm, :s_tm, :correct]
    puts cols.join("\t")

    # simhash: 5..9
    @matcher.options[:candidates] = :simhash
    5.upto(9).each do |i|
      @matcher.options[:simhash_max_hamming] = i
      report("simhash #{i}")
    end

    # ngrams: 5..3
    @matcher.options[:candidates] = :ngrams
    5.downto(3).each do |i|
      @matcher.options[:ngram_overlaps] = i
      report("ngrams #{i}")
    end

    # all
    @matcher.options[:candidates] = :all
    report("all")
  end

  # Times candidate generation and winner selection for the matcher's
  # current options and prints one tab separated row labelled +title+.
  def report(title)
    # candidates
    tm1 = Time.now
    candidates = @matcher.send(:candidates, @data.needles, @data.haystack)
    tm1 = Time.now - tm1

    # winners
    tm2 = Time.now
    winners = @matcher.send(:winners, @data.needles, candidates)
    tm2 = Time.now - tm2
    winners = winners.sort_by { |_n, _h, score| -(score || 0) }

    # how many winners clear each score threshold
    s30 = winners.count { |_n, _h, score| score && score > 0.3 }
    s40 = winners.count { |_n, _h, score| score && score > 0.4 }
    s50 = winners.count { |_n, _h, score| score && score > 0.5 }

    tm1 = sprintf("%.3f", tm1)
    tm2 = sprintf("%.3f", tm2)

    cols = [title, candidates.length, s30, s40, s50, tm1, tm2]
    if @data.matches
      # fraction of needles whose winner is the known match
      correct = winners.count { |n, h, _score| @data.matches[n] == h }
      correct = correct.to_f / @data.needles.length
      cols << sprintf("%.3f", correct)
    end
    puts cols.join("\t")
    $stdout.flush
  end

  # Prints every winner scoring above 0.5. From the second call on it
  # also prints the high quality matches that were not in the previous
  # call's list.
  def dump_results(winners)
    high_quality = winners.select { |_n, _h, score| score && score > 0.5 }
    # full report
    high_quality.each do |n, h, score|
      printf("%4.2f %-35s %-35s\n", score || 0, n, h)
    end

    # which high quality matches were added?
    if @last
      puts
      added = high_quality - @last
      added.each do |n, h, score|
        printf("%4.2f %-35s %-35s\n", score || 0, n, h)
      end
    end
    @last = high_quality
  end
end
136
+
137
# Switch to the directory that holds this script, so the relative data
# file names used by Harness resolve, then start the harness.
harness_dir = File.expand_path("../", __FILE__)
Dir.chdir(harness_dir)
Harness.new
data/test/identity.txt ADDED
@@ -0,0 +1 @@
1
+ hello, world!