simhilarity 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/simhilarity/bits.rb +0 -32
- data/lib/simhilarity/matcher.rb +38 -4
- data/lib/simhilarity/version.rb +1 -1
- metadata +2 -2
data/lib/simhilarity/bits.rb
CHANGED
@@ -26,37 +26,5 @@ module Simhilarity
|
|
26
26
|
b = (x >> 0) & 0xffff
|
27
27
|
HAMMING16[a] + HAMMING16[b]
|
28
28
|
end
|
29
|
-
|
30
|
-
# can't rely on ruby hash, because it's not consistent across
|
31
|
-
# sessions. Let's just use MD5.
|
32
|
-
def self.nhash(ngram)
|
33
|
-
@hashes ||= { }
|
34
|
-
@hashes[ngram] ||= Digest::MD5.hexdigest(ngram).to_i(16)
|
35
|
-
end
|
36
|
-
|
37
|
-
# Calculate the frequency weighted
|
38
|
-
# simhash[http://matpalm.com/resemblance/simhash/] of the
|
39
|
-
# +ngrams+.
|
40
|
-
def self.simhash32(freq, ngrams)
|
41
|
-
# array of bit sums
|
42
|
-
bits = Array.new(32, 0)
|
43
|
-
|
44
|
-
# walk bits of ngram's hash, increase/decrease bit sums
|
45
|
-
ngrams.each do |ngram|
|
46
|
-
f = freq[ngram]
|
47
|
-
hash = nhash(ngram)
|
48
|
-
(0...32).each do |i|
|
49
|
-
bits[i] += (((hash >> i) & 1) == 1) ? f : -f
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
# calculate simhash based on whether bit sums are negative or
|
54
|
-
# positive
|
55
|
-
simhash = 0
|
56
|
-
(0...32).each do |bit|
|
57
|
-
simhash |= (1 << bit) if bits[bit] > 0
|
58
|
-
end
|
59
|
-
simhash
|
60
|
-
end
|
61
29
|
end
|
62
30
|
end
|
data/lib/simhilarity/matcher.rb
CHANGED
@@ -37,7 +37,7 @@ module Simhilarity
|
|
37
37
|
self.normalizer = options[:normalizer]
|
38
38
|
self.ngrammer = options[:ngrammer]
|
39
39
|
|
40
|
-
|
40
|
+
reset_corpus
|
41
41
|
end
|
42
42
|
|
43
43
|
# Set the corpus. Calculates ngram frequencies (#freq) for future
|
@@ -45,6 +45,8 @@ module Simhilarity
|
|
45
45
|
def corpus=(corpus)
|
46
46
|
@corpus = corpus
|
47
47
|
|
48
|
+
reset_corpus
|
49
|
+
|
48
50
|
# calculate ngram counts for the corpus
|
49
51
|
counts = Hash.new(0)
|
50
52
|
veach("Corpus", import_list(corpus)) do |element|
|
@@ -54,10 +56,9 @@ module Simhilarity
|
|
54
56
|
end
|
55
57
|
|
56
58
|
# turn counts into inverse frequencies
|
57
|
-
self.freq = Hash.new(1)
|
58
59
|
total = counts.values.inject(&:+).to_f
|
59
60
|
counts.each do |ngram, count|
|
60
|
-
|
61
|
+
@freq[ngram] = ((total / count) * 10).round
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
@@ -114,7 +115,16 @@ module Simhilarity
|
|
114
115
|
# simhash[http://matpalm.com/resemblance/simhash/] of the
|
115
116
|
# +ngrams+.
|
116
117
|
def simhash(ngrams)
|
117
|
-
|
118
|
+
# map each ngram to its bitsums
|
119
|
+
sums = ngrams.map { |i| simhash_bitsums(i) }
|
120
|
+
# transpose and calculate final sum for each bit
|
121
|
+
bits = sums.transpose.map { |values| values.inject(&:+) }
|
122
|
+
# wherever we have a positive sum, the simhash bit is 1
|
123
|
+
simhash = 0
|
124
|
+
bits.each_with_index do |i, index|
|
125
|
+
simhash |= (1 << index) if i > 0
|
126
|
+
end
|
127
|
+
simhash
|
118
128
|
end
|
119
129
|
|
120
130
|
def inspect #:nodoc:
|
@@ -137,6 +147,30 @@ module Simhilarity
|
|
137
147
|
Element.new(self, opaque)
|
138
148
|
end
|
139
149
|
|
150
|
+
def reset_corpus
|
151
|
+
@freq = Hash.new(1)
|
152
|
+
@bitsums = { }
|
153
|
+
end
|
154
|
+
|
155
|
+
# calculate the simhash bitsums for this +ngram+, as part of
|
156
|
+
# calculating the simhash. We can cache this because it only
|
157
|
+
# depends on the freq and ngram.
|
158
|
+
def simhash_bitsums(ngram)
|
159
|
+
@bitsums[ngram] ||= begin
|
160
|
+
# hash the ngram using a consistent hash (ruby's hash changes
|
161
|
+
# across sessions)
|
162
|
+
hash = Digest::MD5.hexdigest(ngram).to_i(16)
|
163
|
+
|
164
|
+
# map hash bits, 1 ? f : -f
|
165
|
+
f = freq[ngram]
|
166
|
+
array = Array.new(32, 0)
|
167
|
+
(0...32).each do |i|
|
168
|
+
array[i] = (((hash >> i) & 1) == 1) ? f : -f
|
169
|
+
end
|
170
|
+
array
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
140
174
|
# Puts if options[:verbose]
|
141
175
|
def vputs(s)
|
142
176
|
$stderr.puts s if options[:verbose]
|
data/lib/simhilarity/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhilarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -140,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
140
|
version: '0'
|
141
141
|
segments:
|
142
142
|
- 0
|
143
|
-
hash: -
|
143
|
+
hash: -2275181007329764325
|
144
144
|
requirements: []
|
145
145
|
rubyforge_project: simhilarity
|
146
146
|
rubygems_version: 1.8.24
|