simhash2 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +29 -0
- data/lib/simhash2.rb +30 -13
- data/lib/simhash2/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ef3ff21591cb53fba63b03c7623c6cadb3874c6d2f67ac9e817c3533b9f9c3d8
|
4
|
+
data.tar.gz: aca99aa445defc207028a58948df20a293abdae66a2a3bf201a96bf9ccf19d49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f9d2b12eaad8fd994a3e2f68aef4f75aa0b34358f2ca685e33371374dadbc3041046acb4999073a90d1a7d8d4346dde3e9ed79d6f6448aead28420227b0057cc
|
7
|
+
data.tar.gz: 8f978c0061d29706363b8c8dc63efee5d7cc9403ca2482173b09ba11bed256e1c2e7255d5adbeb40407648ff5efd19f6095131582c6944d5a625ecbbeabf3afe
|
data/README.md
CHANGED
@@ -35,3 +35,32 @@ simhash2 = Simhash.generate(str2) # => 13921220612431195624
|
|
35
35
|
Simhash.hamming_distance(simhash1, simhash2) # => 8
|
36
36
|
```
|
37
37
|
|
38
|
+
## Performance
|
39
|
+
|
40
|
+
Thanks to some performance optimizations by [JayTeeSF](https://github.com/JayTeeSF), this gem generally performs better than `bookmate/simhash`, especially when working with longer strings with lots of tokens.
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
test_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
|
44
|
+
|
45
|
+
def test_simhash (x)
|
46
|
+
x.simhash # bookmate/simhash
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_simhash2 (x)
|
50
|
+
Simhash.generate(x) # this gem
|
51
|
+
end
|
52
|
+
|
53
|
+
n = 5000
|
54
|
+
Benchmark.bm do |x|
|
55
|
+
x.report("simhash") { for i in 1..n; test_simhash(test_str); end }
|
56
|
+
x.report("simhash2") { for i in 1..n; test_simhash2(test_str); end }
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
Results:
|
61
|
+
|
62
|
+
```
|
63
|
+
user system total real
|
64
|
+
simhash 5.109375 0.093750 5.203125 ( 5.199069)
|
65
|
+
simhash2 4.109375 0.000000 4.109375 ( 4.108586)
|
66
|
+
```
|
data/lib/simhash2.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative 'simhash2/version'
|
2
2
|
|
3
3
|
module Simhash
|
4
4
|
extend self
|
@@ -17,19 +17,20 @@ module Simhash
|
|
17
17
|
end
|
18
18
|
|
19
19
|
# Generates the simhash for a string.
#
# The string is tokenized on whitespace and the resulting tokens are handed
# to generate_from_tokens together with the caller's options.
#
# @param str [String] text to hash
# @param options [Hash] passed through unchanged to generate_from_tokens
# @return the value produced by generate_from_tokens for the token list
def generate(str, options = {})
  # Tokenization point: replace this split with a shingling step if the hash
  # should be built from shingles instead of single words.
  # split(' ') ignores leading whitespace, so text that begins with a space
  # or newline does not contribute an empty first token (split(/\s+/) did).
  generate_from_tokens(str.split(' '), options)
end
|
22
24
|
|
23
25
|
def generate_from_tokens(tokens, options = {})
|
24
|
-
filter_tokens(tokens, OPTIONS.merge(options))
|
25
|
-
|
26
26
|
v = [0] * HASHBITS
|
27
|
-
|
28
27
|
masks = v.dup
|
29
28
|
masks.each_with_index { |_e, i| masks[i] = (1 << i) }
|
30
29
|
|
31
|
-
|
32
|
-
|
30
|
+
filter_tokens(tokens, OPTIONS.merge(options)) do |token|
|
31
|
+
h = simple_string_hash(token, HASHBITS)
|
32
|
+
#warn "simple_string_hash (for: #{token.inspect}): #{h.inspect}"
|
33
|
+
|
33
34
|
HASHBITS.times do |i|
|
34
35
|
v[i] += (h & masks[i]).zero? ? -1 : +1
|
35
36
|
end
|
@@ -65,12 +66,28 @@ module Simhash
|
|
65
66
|
x.to_i
|
66
67
|
end
|
67
68
|
|
68
|
-
def filter_tokens(tokens, options)
|
69
|
-
|
70
|
-
tokens.
|
71
|
-
|
72
|
-
|
73
|
-
|
69
|
+
# Normalizes raw tokens and drops the ones that should not be hashed.
#
# Each token is downcased and stripped of non-word characters (/\W+/). A
# normalized token is discarded when it is shorter than
# options[:min_token_length] or is listed in options[:stop_words]. Survivors
# are passed through String#stem when options[:stemming] is truthy (#stem is
# not defined in this file; presumably supplied by a stemmer dependency), and
# duplicates are removed when options[:unique] is truthy.
#
# @param tokens [Array<String>] raw tokens; overwritten in place with the
#   filtered result when no block is given, left untouched otherwise
# @param options [Hash] :min_token_length, :stop_words, :stemming, :unique;
#   any of these keys may be omitted
# @yield [token] each surviving token, in order
# @return [Array<String>] the surviving tokens (the `tokens` object itself
#   when no block is given)
def filter_tokens(tokens, options, &block)
  # A missing :min_token_length used to raise on the Integer/nil comparison;
  # treat it as "no minimum", the same way the other optional keys behave.
  min_length = options[:min_token_length] || 0

  # Decide once whether stop-word filtering is active at all.
  stop_words = options[:stop_words]
  stop_words = nil if stop_words && stop_words.empty?

  kept = tokens.each_with_object([]) do |raw, acc|
    word = raw.downcase.gsub(/\W+/, '')
    next if word.length < min_length
    # Stop words are matched against the normalized, un-stemmed form.
    next if stop_words && stop_words.include?(word)

    acc << (options[:stemming] ? word.stem : word)
  end
  kept.uniq! if options[:unique]

  return kept.each(&block) if block

  # No block: hand the result back through the caller's own array.
  tokens.replace(kept)
end
|
75
|
-
|
76
93
|
end
|
data/lib/simhash2/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhash2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Wong
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
79
|
version: '0'
|
80
80
|
requirements: []
|
81
81
|
rubyforge_project:
|
82
|
-
rubygems_version: 2.6
|
82
|
+
rubygems_version: 2.7.6
|
83
83
|
signing_key:
|
84
84
|
specification_version: 4
|
85
85
|
summary: A rewrite of the 'simhash' gem, which is an implementation of Moses Charikar's
|