simhilarity 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/simhilarity/bits.rb +0 -32
- data/lib/simhilarity/matcher.rb +38 -4
- data/lib/simhilarity/version.rb +1 -1
- metadata +2 -2
data/lib/simhilarity/bits.rb
CHANGED
@@ -26,37 +26,5 @@ module Simhilarity
|
|
26
26
|
b = (x >> 0) & 0xffff
|
27
27
|
HAMMING16[a] + HAMMING16[b]
|
28
28
|
end
|
29
|
-
|
30
|
-
# can't rely on ruby hash, because it's not consistent across
|
31
|
-
# sessions. Let's just use MD5.
|
32
|
-
def self.nhash(ngram)
|
33
|
-
@hashes ||= { }
|
34
|
-
@hashes[ngram] ||= Digest::MD5.hexdigest(ngram).to_i(16)
|
35
|
-
end
|
36
|
-
|
37
|
-
# Calculate the frequency weighted
|
38
|
-
# simhash[http://matpalm.com/resemblance/simhash/] of the
|
39
|
-
# +ngrams+.
|
40
|
-
def self.simhash32(freq, ngrams)
|
41
|
-
# array of bit sums
|
42
|
-
bits = Array.new(32, 0)
|
43
|
-
|
44
|
-
# walk bits of ngram's hash, increase/decrease bit sums
|
45
|
-
ngrams.each do |ngram|
|
46
|
-
f = freq[ngram]
|
47
|
-
hash = nhash(ngram)
|
48
|
-
(0...32).each do |i|
|
49
|
-
bits[i] += (((hash >> i) & 1) == 1) ? f : -f
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
# calculate simhash based on whether bit sums are negative or
|
54
|
-
# positive
|
55
|
-
simhash = 0
|
56
|
-
(0...32).each do |bit|
|
57
|
-
simhash |= (1 << bit) if bits[bit] > 0
|
58
|
-
end
|
59
|
-
simhash
|
60
|
-
end
|
61
29
|
end
|
62
30
|
end
|
data/lib/simhilarity/matcher.rb
CHANGED
@@ -37,7 +37,7 @@ module Simhilarity
|
|
37
37
|
self.normalizer = options[:normalizer]
|
38
38
|
self.ngrammer = options[:ngrammer]
|
39
39
|
|
40
|
-
|
40
|
+
reset_corpus
|
41
41
|
end
|
42
42
|
|
43
43
|
# Set the corpus. Calculates ngram frequencies (#freq) for future
|
@@ -45,6 +45,8 @@ module Simhilarity
|
|
45
45
|
def corpus=(corpus)
|
46
46
|
@corpus = corpus
|
47
47
|
|
48
|
+
reset_corpus
|
49
|
+
|
48
50
|
# calculate ngram counts for the corpus
|
49
51
|
counts = Hash.new(0)
|
50
52
|
veach("Corpus", import_list(corpus)) do |element|
|
@@ -54,10 +56,9 @@ module Simhilarity
|
|
54
56
|
end
|
55
57
|
|
56
58
|
# turn counts into inverse frequencies
|
57
|
-
self.freq = Hash.new(1)
|
58
59
|
total = counts.values.inject(&:+).to_f
|
59
60
|
counts.each do |ngram, count|
|
60
|
-
|
61
|
+
@freq[ngram] = ((total / count) * 10).round
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
@@ -114,7 +115,16 @@ module Simhilarity
|
|
114
115
|
# simhash[http://matpalm.com/resemblance/simhash/] of the
|
115
116
|
# +ngrams+.
|
116
117
|
def simhash(ngrams)
|
117
|
-
|
118
|
+
# map each ngram to its bitsums
|
119
|
+
sums = ngrams.map { |i| simhash_bitsums(i) }
|
120
|
+
# transpose and calculate final sum for each bit
|
121
|
+
bits = sums.transpose.map { |values| values.inject(&:+) }
|
122
|
+
# wherever we have a positive sum, the simhash bit is 1
|
123
|
+
simhash = 0
|
124
|
+
bits.each_with_index do |i, index|
|
125
|
+
simhash |= (1 << index) if i > 0
|
126
|
+
end
|
127
|
+
simhash
|
118
128
|
end
|
119
129
|
|
120
130
|
def inspect #:nodoc:
|
@@ -137,6 +147,30 @@ module Simhilarity
|
|
137
147
|
Element.new(self, opaque)
|
138
148
|
end
|
139
149
|
|
150
|
+
def reset_corpus
|
151
|
+
@freq = Hash.new(1)
|
152
|
+
@bitsums = { }
|
153
|
+
end
|
154
|
+
|
155
|
+
# calculate the simhash bitsums for this +ngram+, as part of
|
156
|
+
# calculating the simhash. We can cache this because it only
|
157
|
+
# depends on the freq and ngram.
|
158
|
+
def simhash_bitsums(ngram)
|
159
|
+
@bitsums[ngram] ||= begin
|
160
|
+
# hash the ngram using a consistent hash (ruby's hash changes
|
161
|
+
# across sessions)
|
162
|
+
hash = Digest::MD5.hexdigest(ngram).to_i(16)
|
163
|
+
|
164
|
+
# map hash bits, 1 ? f : -f
|
165
|
+
f = freq[ngram]
|
166
|
+
array = Array.new(32, 0)
|
167
|
+
(0...32).each do |i|
|
168
|
+
array[i] = (((hash >> i) & 1) == 1) ? f : -f
|
169
|
+
end
|
170
|
+
array
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
140
174
|
# Puts if options[:verbose]
|
141
175
|
def vputs(s)
|
142
176
|
$stderr.puts s if options[:verbose]
|
data/lib/simhilarity/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhilarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -140,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
140
|
version: '0'
|
141
141
|
segments:
|
142
142
|
- 0
|
143
|
-
hash: -
|
143
|
+
hash: -2275181007329764325
|
144
144
|
requirements: []
|
145
145
|
rubyforge_project: simhilarity
|
146
146
|
rubygems_version: 1.8.24
|