simhilarity 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,37 +26,5 @@ module Simhilarity
26
26
  b = (x >> 0) & 0xffff
27
27
  HAMMING16[a] + HAMMING16[b]
28
28
  end
29
-
30
- # can't rely on ruby hash, because it's not consistent across
31
- # sessions. Let's just use MD5.
32
- def self.nhash(ngram)
33
- @hashes ||= { }
34
- @hashes[ngram] ||= Digest::MD5.hexdigest(ngram).to_i(16)
35
- end
36
-
37
- # Calculate the frequency weighted
38
- # simhash[http://matpalm.com/resemblance/simhash/] of the
39
- # +ngrams+.
40
- def self.simhash32(freq, ngrams)
41
- # array of bit sums
42
- bits = Array.new(32, 0)
43
-
44
- # walk bits of ngram's hash, increase/decrease bit sums
45
- ngrams.each do |ngram|
46
- f = freq[ngram]
47
- hash = nhash(ngram)
48
- (0...32).each do |i|
49
- bits[i] += (((hash >> i) & 1) == 1) ? f : -f
50
- end
51
- end
52
-
53
- # calculate simhash based on whether bit sums are negative or
54
- # positive
55
- simhash = 0
56
- (0...32).each do |bit|
57
- simhash |= (1 << bit) if bits[bit] > 0
58
- end
59
- simhash
60
- end
61
29
  end
62
30
  end
@@ -37,7 +37,7 @@ module Simhilarity
37
37
  self.normalizer = options[:normalizer]
38
38
  self.ngrammer = options[:ngrammer]
39
39
 
40
- self.freq = Hash.new(1)
40
+ reset_corpus
41
41
  end
42
42
 
43
43
  # Set the corpus. Calculates ngram frequencies (#freq) for future
@@ -45,6 +45,8 @@ module Simhilarity
45
45
  def corpus=(corpus)
46
46
  @corpus = corpus
47
47
 
48
+ reset_corpus
49
+
48
50
  # calculate ngram counts for the corpus
49
51
  counts = Hash.new(0)
50
52
  veach("Corpus", import_list(corpus)) do |element|
@@ -54,10 +56,9 @@ module Simhilarity
54
56
  end
55
57
 
56
58
  # turn counts into inverse frequencies
57
- self.freq = Hash.new(1)
58
59
  total = counts.values.inject(&:+).to_f
59
60
  counts.each do |ngram, count|
60
- self.freq[ngram] = total / count
61
+ @freq[ngram] = ((total / count) * 10).round
61
62
  end
62
63
  end
63
64
 
@@ -114,7 +115,16 @@ module Simhilarity
114
115
  # simhash[http://matpalm.com/resemblance/simhash/] of the
115
116
  # +ngrams+.
116
117
  def simhash(ngrams)
117
- Bits.simhash32(freq, ngrams)
118
+ # map each ngram to its bitsums
119
+ sums = ngrams.map { |i| simhash_bitsums(i) }
120
+ # transpose and calculate final sum for each bit
121
+ bits = sums.transpose.map { |values| values.inject(&:+) }
122
+ # wherever we have a positive sum, the simhash bit is 1
123
+ simhash = 0
124
+ bits.each_with_index do |i, index|
125
+ simhash |= (1 << index) if i > 0
126
+ end
127
+ simhash
118
128
  end
119
129
 
120
130
  def inspect #:nodoc:
@@ -137,6 +147,30 @@ module Simhilarity
137
147
  Element.new(self, opaque)
138
148
  end
139
149
 
150
+ def reset_corpus
151
+ @freq = Hash.new(1)
152
+ @bitsums = { }
153
+ end
154
+
155
+ # calculate the simhash bitsums for this +ngram+, as part of
156
+ # calculating the simhash. We can cache this because it only
157
+ # depends on the freq and ngram.
158
+ def simhash_bitsums(ngram)
159
+ @bitsums[ngram] ||= begin
160
+ # hash the ngram using a consistent hash (ruby's hash changes
161
+ # across sessions)
162
+ hash = Digest::MD5.hexdigest(ngram).to_i(16)
163
+
164
+ # map hash bits, 1 ? f : -f
165
+ f = freq[ngram]
166
+ array = Array.new(32, 0)
167
+ (0...32).each do |i|
168
+ array[i] = (((hash >> i) & 1) == 1) ? f : -f
169
+ end
170
+ array
171
+ end
172
+ end
173
+
140
174
  # Puts if options[:verbose]
141
175
  def vputs(s)
142
176
  $stderr.puts s if options[:verbose]
@@ -1,4 +1,4 @@
1
1
  module Simhilarity
2
2
  # Gem version
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhilarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -140,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
140
140
  version: '0'
141
141
  segments:
142
142
  - 0
143
- hash: -1497809244519171705
143
+ hash: -2275181007329764325
144
144
  requirements: []
145
145
  rubyforge_project: simhilarity
146
146
  rubygems_version: 1.8.24