simhash 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/simhash.rb +26 -13
  2. metadata +4 -4
data/lib/simhash.rb CHANGED
@@ -18,14 +18,29 @@ module Simhash
18
18
 
19
19
  def self.hash(tokens, options={})
20
20
  hashbits = options[:hashbits] || 64
21
- token_min_size = options[:token_min_size].to_i
22
21
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD
23
- stop_sentenses = options[:stop_sentenses]
24
-
22
+
25
23
  v = [0] * hashbits
26
24
  masks = v.dup
27
25
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
28
26
 
27
+ self.each_filtered_token(tokens, options) do |token|
28
+ hashed_token = token.send(hashing_method, hashbits).to_i
29
+ hashbits.times do |i|
30
+ v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
31
+ end
32
+ end
33
+
34
+ fingerprint = 0
35
+
36
+ hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
37
+
38
+ fingerprint
39
+ end
40
+
41
+ def self.each_filtered_token(tokens, options={})
42
+ token_min_size = options[:token_min_size].to_i
43
+ stop_sentenses = options[:stop_sentenses]
29
44
  tokens.each do |token|
30
45
  # cutting punctuation (\302\240 is unbreakable space)
31
46
  token = token.gsub(/(\s|\d|\W|\302\240| *— *|[«»\…\-\–\—]| )+/u,' ') if !options[:preserve_punctuation]
@@ -39,17 +54,15 @@ module Simhash
39
54
  next if stop_sentenses && stop_sentenses.include?(" #{token} ")
40
55
 
41
56
  next if token.size.zero? || token.mb_chars.size < token_min_size
42
- hashed_token = token.send(hashing_method, hashbits).to_i
43
- hashbits.times do |i|
44
- v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
45
- end
46
- end
47
-
48
- fingerprint = 0
49
-
50
- hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
51
57
 
52
- fingerprint
58
+ yield token
59
+ end
60
+ end
61
+
62
+ def self.filtered_tokens(tokens, options={})
63
+ filtered_tokens = []
64
+ self.each_filtered_token(tokens, options) { |token| filtered_tokens << token }
65
+ filtered_tokens
53
66
  end
54
67
 
55
68
  def self.hm
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 2
10
- version: 0.2.2
9
+ - 3
10
+ version: 0.2.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alex Gusev
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-07 00:00:00 +04:00
18
+ date: 2011-01-20 00:00:00 +03:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency