simhash 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/simhash.rb +26 -13
  2. metadata +4 -4
data/lib/simhash.rb CHANGED
@@ -18,14 +18,29 @@ module Simhash
18
18
 
19
19
  def self.hash(tokens, options={})
20
20
  hashbits = options[:hashbits] || 64
21
- token_min_size = options[:token_min_size].to_i
22
21
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
23
- stop_sentenses = options[:stop_sentenses]
24
-
22
+
25
23
  v = [0] * hashbits
26
24
  masks = v.dup
27
25
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
28
26
 
27
+ self.each_filtered_token(tokens, options) do |token|
28
+ hashed_token = token.send(hashing_method, hashbits).to_i
29
+ hashbits.times do |i|
30
+ v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
31
+ end
32
+ end
33
+
34
+ fingerprint = 0
35
+
36
+ hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
37
+
38
+ fingerprint
39
+ end
40
+
41
+ def self.each_filtered_token(tokens, options={})
42
+ token_min_size = options[:token_min_size].to_i
43
+ stop_sentenses = options[:stop_sentenses]
29
44
  tokens.each do |token|
30
45
  # cutting punctuation (\302\240 is unbreakable space)
31
46
  token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
@@ -39,17 +54,15 @@ module Simhash
39
54
  next if stop_sentenses && stop_sentenses.include?(" #{token} ")
40
55
 
41
56
  next if token.size.zero? || token.mb_chars.size < token_min_size
42
- hashed_token = token.send(hashing_method, hashbits).to_i
43
- hashbits.times do |i|
44
- v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
45
- end
46
- end
47
-
48
- fingerprint = 0
49
-
50
- hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
51
57
 
52
- fingerprint
58
+ yield token
59
+ end
60
+ end
61
+
62
+ def self.filtered_tokens(tokens, options={})
63
+ filtered_tokens = []
64
+ self.each_filtered_token(tokens, options) { |token| filtered_tokens << token }
65
+ filtered_tokens
53
66
  end
54
67
 
55
68
  def self.hm
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 2
10
- version: 0.2.2
9
+ - 3
10
+ version: 0.2.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-07 00:00:00 +04:00
18
+ date: 2011-01-20 00:00:00 +03:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency