simhilarity 1.0.0 → 1.0.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -26,37 +26,5 @@ module Simhilarity
26
26
  b = (x >> 0) & 0xffff
27
27
  HAMMING16[a] + HAMMING16[b]
28
28
  end
29
-
30
- # can't rely on ruby hash, because it's not consistent across
31
- # sessions. Let's just use MD5.
32
- def self.nhash(ngram)
33
- @hashes ||= { }
34
- @hashes[ngram] ||= Digest::MD5.hexdigest(ngram).to_i(16)
35
- end
36
-
37
- # Calculate the frequency weighted
38
- # simhash[http://matpalm.com/resemblance/simhash/] of the
39
- # +ngrams+.
40
- def self.simhash32(freq, ngrams)
41
- # array of bit sums
42
- bits = Array.new(32, 0)
43
-
44
- # walk bits of ngram's hash, increase/decrease bit sums
45
- ngrams.each do |ngram|
46
- f = freq[ngram]
47
- hash = nhash(ngram)
48
- (0...32).each do |i|
49
- bits[i] += (((hash >> i) & 1) == 1) ? f : -f
50
- end
51
- end
52
-
53
- # calculate simhash based on whether bit sums are negative or
54
- # positive
55
- simhash = 0
56
- (0...32).each do |bit|
57
- simhash |= (1 << bit) if bits[bit] > 0
58
- end
59
- simhash
60
- end
61
29
  end
62
30
  end
@@ -37,7 +37,7 @@ module Simhilarity
37
37
  self.normalizer = options[:normalizer]
38
38
  self.ngrammer = options[:ngrammer]
39
39
 
40
- self.freq = Hash.new(1)
40
+ reset_corpus
41
41
  end
42
42
 
43
43
  # Set the corpus. Calculates ngram frequencies (#freq) for future
@@ -45,6 +45,8 @@ module Simhilarity
45
45
  def corpus=(corpus)
46
46
  @corpus = corpus
47
47
 
48
+ reset_corpus
49
+
48
50
  # calculate ngram counts for the corpus
49
51
  counts = Hash.new(0)
50
52
  veach("Corpus", import_list(corpus)) do |element|
@@ -54,10 +56,9 @@ module Simhilarity
54
56
  end
55
57
 
56
58
  # turn counts into inverse frequencies
57
- self.freq = Hash.new(1)
58
59
  total = counts.values.inject(&:+).to_f
59
60
  counts.each do |ngram, count|
60
- self.freq[ngram] = total / count
61
+ @freq[ngram] = ((total / count) * 10).round
61
62
  end
62
63
  end
63
64
 
@@ -114,7 +115,16 @@ module Simhilarity
114
115
  # simhash[http://matpalm.com/resemblance/simhash/] of the
115
116
  # +ngrams+.
116
117
  def simhash(ngrams)
117
- Bits.simhash32(freq, ngrams)
118
+ # map each ngram to its bitsums
119
+ sums = ngrams.map { |i| simhash_bitsums(i) }
120
+ # transpose and calculate final sum for each bit
121
+ bits = sums.transpose.map { |values| values.inject(&:+) }
122
+ # wherever we have a positive sum, the simhash bit is 1
123
+ simhash = 0
124
+ bits.each_with_index do |i, index|
125
+ simhash |= (1 << index) if i > 0
126
+ end
127
+ simhash
118
128
  end
119
129
 
120
130
  def inspect #:nodoc:
@@ -137,6 +147,30 @@ module Simhilarity
137
147
  Element.new(self, opaque)
138
148
  end
139
149
 
150
+ def reset_corpus
151
+ @freq = Hash.new(1)
152
+ @bitsums = { }
153
+ end
154
+
155
+ # calculate the simhash bitsums for this +ngram+, as part of
156
+ # calculating the simhash. We can cache this because it only
157
+ # depends on the freq and ngram.
158
+ def simhash_bitsums(ngram)
159
+ @bitsums[ngram] ||= begin
160
+ # hash the ngram using a consistent hash (ruby's hash changes
161
+ # across sessions)
162
+ hash = Digest::MD5.hexdigest(ngram).to_i(16)
163
+
164
+ # map hash bits, 1 ? f : -f
165
+ f = freq[ngram]
166
+ array = Array.new(32, 0)
167
+ (0...32).each do |i|
168
+ array[i] = (((hash >> i) & 1) == 1) ? f : -f
169
+ end
170
+ array
171
+ end
172
+ end
173
+
140
174
  # Puts if options[:verbose]
141
175
  def vputs(s)
142
176
  $stderr.puts s if options[:verbose]
@@ -1,4 +1,4 @@
1
1
  module Simhilarity
2
2
  # Gem version
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhilarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -140,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
140
140
  version: '0'
141
141
  segments:
142
142
  - 0
143
- hash: -1497809244519171705
143
+ hash: -2275181007329764325
144
144
  requirements: []
145
145
  rubyforge_project: simhilarity
146
146
  rubygems_version: 1.8.24