simhilarity 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,164 @@
1
+ require "progressbar"
2
+
3
module Simhilarity
  # Abstract superclass for matching. Mainly a container for the options,
  # the corpus and the ngram frequency weights derived from the corpus.
  class Matcher
    # Options hash used to create this Matcher.
    attr_accessor :options

    # Proc for turning needle/haystack elements into strings. You can
    # leave this nil if the elements are already strings. See
    # Matcher#read for the default implementation.
    attr_accessor :reader

    # Proc for normalizing input strings. See Matcher#normalize
    # for the default implementation.
    attr_accessor :normalizer

    # Proc for generating ngrams from a normalized string. See
    # Matcher#ngrams for the default implementation.
    attr_accessor :ngrammer

    # Ngram frequency weights from the corpus, or 1 if the ngram isn't
    # in the corpus.
    attr_accessor :freq

    # The current corpus, exactly as it was passed to #corpus= (nil
    # until a corpus has been set).
    attr_reader :corpus

    # Create a new Matcher. Options include:
    #
    # * +reader+: Proc for turning opaque items into strings.
    # * +normalizer+: Proc for normalizing strings.
    # * +ngrammer+: Proc for generating ngrams.
    # * +verbose+: If true, show progress bars and timing.
    #
    # The options hash is stored by reference, so later changes made
    # through #options are visible to the matcher.
    def initialize(options = {})
      @options = options

      # procs
      self.reader = options[:reader]
      self.normalizer = options[:normalizer]
      self.ngrammer = options[:ngrammer]

      # until a corpus is set, every ngram weighs 1
      self.freq = Hash.new(1)
    end

    # Set the corpus. Calculates ngram frequencies (#freq) for future
    # scoring. +corpus+ is a list of opaque items or of Elements.
    def corpus=(corpus)
      @corpus = corpus

      # count the corpus elements that contain each ngram
      counts = Hash.new(0)
      veach("Corpus", import_list(corpus)) do |element|
        element.ngrams.each { |ngram| counts[ngram] += 1 }
      end

      # turn counts into inverse frequencies; ngrams that are missing
      # from the corpus keep the default weight of 1
      total = counts.values.inject(0, :+).to_f
      weights = Hash.new(1)
      counts.each { |ngram, count| weights[ngram] = total / count }
      self.freq = weights
    end

    # Turn an opaque item from the user into a string. Uses #reader if
    # one was supplied, otherwise the item must already be a String.
    # Raises a RuntimeError for anything else.
    def read(opaque)
      return reader.call(opaque) if reader
      return opaque if opaque.is_a?(String)

      raise "can't turn #{opaque.inspect} into string"
    end

    # Normalize an incoming string from the user. Uses #normalizer if
    # one was supplied. The default downcases, replaces every character
    # outside a-z/0-9 with a space and squishes whitespace.
    def normalize(incoming_str)
      return normalizer.call(incoming_str) if normalizer

      incoming_str.downcase.gsub(/[^a-z0-9]/, " ").gsub(/\s+/, " ").strip
    end

    # Generate ngrams from a normalized str. Uses #ngrammer if one was
    # supplied. The default returns the unique two letter ngrams
    # (bigrams) followed by the unique runs of digits.
    def ngrams(str)
      return ngrammer.call(str) if ngrammer

      bigrams = str.each_char.each_cons(2).map(&:join)
      (bigrams + str.scan(/\d+/)).uniq
    end

    # Sum up the frequency weights of the +ngrams+. Returns 0 for an
    # empty list.
    def ngrams_sum(ngrams)
      # go through the accessor, like #simhash does
      ngrams.inject(0) { |sum, ngram| sum + freq[ngram] }
    end

    # Calculate the frequency weighted
    # simhash[http://matpalm.com/resemblance/simhash/] of the
    # +ngrams+.
    def simhash(ngrams)
      Bits.simhash32(freq, ngrams)
    end

    def inspect #:nodoc:
      "Matcher"
    end

    protected

    # Turn a list of user supplied opaque items into a list of
    # Elements (if necessary). Only the first item is inspected to
    # decide whether the list has already been converted.
    def import_list(list)
      return list if list.first.is_a?(Element)

      list.map { |opaque| element_for(opaque) }
    end

    # Turn a user's opaque item into an Element.
    def element_for(opaque)
      Element.new(self, opaque)
    end

    # Puts to $stderr if options[:verbose]
    def vputs(s)
      $stderr.puts s if options[:verbose]
    end

    # Like each, but with a progress bar if options[:verbose]. Yields
    # every item of +array+ and returns +array+.
    def veach(title, array)
      return array.each { |i| yield(i) } unless options[:verbose]

      # Create the bar before entering begin/ensure. If the constructor
      # raises, there is nothing to finish, and the original error must
      # not be masked by calling finish on nil.
      pb = ProgressBar.new(title, array.length)
      begin
        array.each do |i|
          yield(i)
          pb.inc
        end
      ensure
        pb.finish
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require "set"
2
+
3
module Simhilarity
  # Matcher that scores one pair of items at a time.
  class Single < Matcher
    # Takes the same options as Matcher#initialize.
    def initialize(options = {})
      super
    end

    # Calculate the similarity score for the pair +a+ and +b+. Scores
    # range from 0 to 1, with 1 being a perfect match and 0 being a
    # terrible match. For best results, call #corpus= first.
    def score(a, b)
      left = element_for(a)
      right = element_for(b)
      Candidate.new(self, left, right).score
    end
  end
end
@@ -0,0 +1,4 @@
1
module Simhilarity
  # Version string of the simhilarity gem.
  VERSION = "1.0.0"
end
@@ -0,0 +1,8 @@
1
+ require "simhilarity/bits"
2
+ require "simhilarity/candidate"
3
+ require "simhilarity/element"
4
+ require "simhilarity/matcher"
5
+ require "simhilarity/version"
6
+
7
+ require "simhilarity/bulk"
8
+ require "simhilarity/single"
@@ -0,0 +1,27 @@
1
+ require "simhilarity/version"
2
+
3
# Gem specification for simhilarity. File lists come from git, so the
# gem has to be built from inside a git checkout.
Gem::Specification.new do |spec|
  # identity
  spec.name = "simhilarity"
  spec.version = Simhilarity::VERSION
  spec.platform = Gem::Platform::RUBY
  spec.required_ruby_version = ">= 1.9.0"

  # authorship and descriptions
  spec.authors = ["Adam Doppelt"]
  spec.email = ["amd@gurge.com"]
  spec.homepage = "http://github.com/gurgeous/simhilarity"
  spec.summary = "Simhilarity - measure text similarity using frequency weighted ngrams."
  spec.description = "Measure text similarity using frequency weighted ngrams."

  spec.rubyforge_project = "simhilarity"

  # dependencies
  %w[bk progressbar].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
  %w[awesome_print rake rdoc].each do |gem_name|
    spec.add_development_dependency gem_name
  end

  # packaged files, as tracked by git
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- test/*`.split("\n")
  tracked_bins = `git ls-files -- bin/*`.split("\n")

  spec.files = tracked_files
  spec.test_files = tracked_tests
  spec.executables = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
data/test/harness ADDED
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+
5
+ require "awesome_print"
6
+ require "csv"
7
+ require "ostruct"
8
+ require "simhilarity"
9
+
10
+ #
11
+ # This is a test harness for measuring the speed and accuracy of the
12
+ # different searchers. Feel free to ignore.
13
+ #
14
+
15
+ #
16
+ # Results on an i5 3ghz, with 500 needles and 10,000 haystacks:
17
+ #
18
+ #
19
+
20
# Benchmark driver: builds a Simhilarity::Bulk matcher, loads a dataset
# and prints one tab separated row per candidate strategy.
class Harness
  # Load the large dataset, wrap every item in an Element, set the
  # corpus and immediately run the benchmarks.
  def initialize
    @matcher = Simhilarity::Bulk.new

    # load data
    $stderr.puts "Loading..."
    @data = dataset_large
    # element_for is not public on the matcher, hence send
    @data.needles = @data.needles.map { |i| @matcher.send(:element_for, i) }
    @data.haystack = @data.haystack.map { |i| @matcher.send(:element_for, i) }
    @matcher.corpus = @data.needles + @data.haystack

    # exclude this initialization from our benchmarks
    @matcher.corpus.each(&:simhash)

    $stderr.puts "Harness using needles/haystack = #{@data.needles.length}/#{@data.haystack.length}..."
    run
  end

  # Read needles and haystack from large_needles.txt and
  # large_haystack.txt (one item per line, relative to the current
  # directory). This dataset carries no known matches.
  def dataset_large
    needles = File.readlines("large_needles.txt").map(&:chomp)
    haystack = File.readlines("large_haystack.txt").map(&:chomp)
    OpenStruct.new(needles: needles, haystack: haystack)
  end

  # Read sample.csv, where column one is a needle and column two a
  # haystack item. Rows with both columns are recorded as known matches.
  def dataset_small
    needles, haystack, matches = [], [], {}
    CSV.read("sample.csv").each do |cols|
      n, h = *cols
      needles << n if n
      haystack << h if h
      matches[n] = h if n && h
    end
    OpenStruct.new(needles: needles, haystack: haystack, matches: matches)
  end

  # all
  # ngrams: 5,4,3
  # simhash: 5,6,7,8,9

  #
  # compare the various candidate methods
  #

  # Print the header row, then one report row per configuration.
  def run
    # header
    cols = [:title, :candidates, :s30, :s40, :s50, :c_tm, :s_tm, :correct]
    puts cols.join("\t")

    # simhash: 5..9
    @matcher.options[:candidates] = :simhash
    5.upto(9).each do |i|
      @matcher.options[:simhash_max_hamming] = i
      report("simhash #{i}")
    end

    # ngrams: 5..3
    @matcher.options[:candidates] = :ngrams
    5.downto(3).each do |i|
      @matcher.options[:ngram_overlaps] = i
      report("ngrams #{i}")
    end

    # all
    @matcher.options[:candidates] = :all
    report("all")
  end

  # Time candidate generation and winner selection for the matcher's
  # current options and print one tab separated row labelled +title+.
  # NOTE(review): winners are assumed to be [needle, haystack, score]
  # triples, judging by how they are destructured below - confirm
  # against Simhilarity::Bulk.
  def report(title)
    # candidates
    tm1 = Time.now
    candidates = @matcher.send(:candidates, @data.needles, @data.haystack)
    tm1 = Time.now - tm1

    # winners
    tm2 = Time.now
    winners = @matcher.send(:winners, @data.needles, candidates)
    tm2 = Time.now - tm2
    winners = winners.sort_by { |_n, _h, score| -(score || 0) }

    # how many winners clear each score threshold
    s30 = winners.count { |_n, _h, score| score && score > 0.3 }
    s40 = winners.count { |_n, _h, score| score && score > 0.4 }
    s50 = winners.count { |_n, _h, score| score && score > 0.5 }

    tm1 = sprintf("%.3f", tm1)
    tm2 = sprintf("%.3f", tm2)

    cols = [title, candidates.length, s30, s40, s50, tm1, tm2]
    # only datasets that provide known matches get the :correct column
    if @data.matches
      correct = winners.select { |n, h, _score| @data.matches[n] == h }
      correct = correct.length.to_f / @data.needles.length
      correct = sprintf("%.3f", correct)
      cols << correct
    end
    puts cols.join("\t")
    $stdout.flush
  end

  # Print every winner scoring above 0.5. From the second call on, also
  # print the high quality matches that the previous call did not have.
  def dump_results(winners)
    high_quality = winners.select { |_n, _h, score| score && score > 0.5 }

    # full report
    high_quality.each do |n, h, score|
      printf("%4.2f %-35s %-35s\n", score || 0, n, h)
    end

    # which high quality matches were added?
    if @last
      puts
      added = high_quality - @last
      added.each do |n, h, score|
        printf("%4.2f %-35s %-35s\n", score || 0, n, h)
      end
    end
    @last = high_quality
  end
end
136
+
137
# Run from the directory that contains this script, so the relative
# data file names used by the harness resolve, then start the harness.
script_dir = File.dirname(File.expand_path(__FILE__))
Dir.chdir(script_dir)
Harness.new
data/test/identity.txt ADDED
@@ -0,0 +1 @@
1
+ hello, world!