simhash2 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +34 -5
- data/lib/simhash2/version.rb +1 -1
- data/lib/simhash2.rb +76 -68
- data/simhash2.gemspec +1 -1
- data/spec/simhash_spec.rb +60 -51
- data/spec/spec_helper.rb +0 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e8190a872a7ec498e0a500482e8814758de6cb4
|
4
|
+
data.tar.gz: b0ea69a076ab1a99584010b3c85b6c3d17d37eea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 290b5f9daf27c8d4a138e26aabc062b4c8896fb6506bb41da13ea5c1e44095286e82b9b34780b189d22fdae1e4f4d34d7dc0ce6438b9c47539584b82fb23a541
|
7
|
+
data.tar.gz: 5b961fbf6271a50571d557f628ef111ca7f9683378869bc9443635cb678b0a06942424287e5e6103702074f604a8ac39a06d2a2120c15b3bbfc26ee77d1cc5ab
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,37 @@
|
|
1
1
|
[Gem Version](https://badge.fury.io/rb/simhash2)
|
2
|
-
[Code Climate](https://codeclimate.com/github/armchairtheorist/simhash2)
|
3
|
+
[Build Status](https://travis-ci.org/armchairtheorist/simhash2)
|
4
|
+
[Coverage Status](https://coveralls.io/github/armchairtheorist/simhash2?branch=master)
|
5
5
|
|
6
|
-
#
|
6
|
+
# Simhash2
|
7
|
+
|
8
|
+
**Simhash2** is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby. The key differences are that this gem doesn't monkey patch the `String` and `Integer` classes, and configuration is also slightly easier. The simhash values generated by this gem with the default configuration should be identical to those generated by the Bookmate version.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Install the gem from RubyGems:
|
13
|
+
|
14
|
+
```bash
|
15
|
+
gem install simhash2
|
16
|
+
```
|
17
|
+
|
18
|
+
If you use Bundler, just add it to your Gemfile and run `bundle install`
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem 'simhash2'
|
22
|
+
```
|
23
|
+
|
24
|
+
I have only tested this gem on Ruby 2.4.1, but there shouldn't be any reason why it wouldn't work on earlier Ruby versions as well.
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
str1 = "I am the king of the world!"
|
30
|
+
str2 = "I am the queen of the world!"
|
31
|
+
|
32
|
+
simhash1 = Simhash.generate(str1) # => 86798109229625320
|
33
|
+
simhash2 = Simhash.generate(str2) # => 13921220612431195624
|
34
|
+
|
35
|
+
Simhash.hamming_distance(simhash1, simhash2) # => 8
|
36
|
+
```
|
7
37
|
|
8
|
-
This is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby.
|
data/lib/simhash2/version.rb
CHANGED
data/lib/simhash2.rb
CHANGED
@@ -1,68 +1,76 @@
|
|
1
|
-
require 'simhash2/version'
|
2
|
-
|
3
|
-
module Simhash
|
4
|
-
extend self
|
5
|
-
|
6
|
-
HASHBITS = 64
|
7
|
-
|
8
|
-
OPTIONS = {
|
9
|
-
min_token_length: 1,
|
10
|
-
unique: false,
|
11
|
-
stemming: false,
|
12
|
-
stop_words: []
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
simhash
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
1
|
+
require 'simhash2/version'
|
2
|
+
|
3
|
+
module Simhash
|
4
|
+
extend self
|
5
|
+
|
6
|
+
HASHBITS = 64
|
7
|
+
|
8
|
+
OPTIONS = {
|
9
|
+
min_token_length: 1,
|
10
|
+
unique: false,
|
11
|
+
stemming: false,
|
12
|
+
stop_words: []
|
13
|
+
}.freeze
|
14
|
+
|
15
|
+
def similarity(string1, string2, options = {})
|
16
|
+
return hash_similarity(generate(string1, options), generate(string2, options))
|
17
|
+
end
|
18
|
+
|
19
|
+
def generate(str, options = {})
|
20
|
+
generate_from_tokens(str.split(/\s+/), options)
|
21
|
+
end
|
22
|
+
|
23
|
+
def generate_from_tokens(tokens, options = {})
|
24
|
+
filter_tokens(tokens, OPTIONS.merge(options))
|
25
|
+
|
26
|
+
v = [0] * HASHBITS
|
27
|
+
|
28
|
+
masks = v.dup
|
29
|
+
masks.each_with_index { |_e, i| masks[i] = (1 << i) }
|
30
|
+
|
31
|
+
hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
|
32
|
+
hashes.each do |h|
|
33
|
+
HASHBITS.times do |i|
|
34
|
+
v[i] += (h & masks[i]).zero? ? -1 : +1
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
simhash = 0
|
39
|
+
HASHBITS.times { |i| simhash += 1 << i if v[i] >= 0 }
|
40
|
+
|
41
|
+
simhash
|
42
|
+
end
|
43
|
+
|
44
|
+
def hamming_distance(simhash1, simhash2)
|
45
|
+
(simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
|
46
|
+
end
|
47
|
+
|
48
|
+
def hash_similarity(left, right)
|
49
|
+
return (1.0 - (hamming_distance(left, right).to_f / HASHBITS))
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
def simple_string_hash(str, length)
|
55
|
+
return 0 if str == ''
|
56
|
+
|
57
|
+
x = str.bytes.first << 7
|
58
|
+
m = 1_000_003
|
59
|
+
mask = (1 << length) - 1
|
60
|
+
str.each_byte { |char| x = ((x * m) ^ char.to_i) & mask }
|
61
|
+
|
62
|
+
x ^= str.bytes.count
|
63
|
+
x = -2 if x == -1
|
64
|
+
|
65
|
+
x.to_i
|
66
|
+
end
|
67
|
+
|
68
|
+
def filter_tokens(tokens, options)
|
69
|
+
tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
|
70
|
+
tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
|
71
|
+
tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
|
72
|
+
tokens.map!(&:stem) if options[:stemming]
|
73
|
+
tokens.uniq! if options[:unique]
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
data/simhash2.gemspec
CHANGED
@@ -6,7 +6,7 @@ Gem::Specification.new do |spec|
|
|
6
6
|
spec.authors = ['Jonathan Wong']
|
7
7
|
spec.email = ['jonathan@armchairtheorist.com']
|
8
8
|
spec.summary = 'A rewrite of the \'simhash\' gem, which is an implementation of Moses Charikar\'s simhashes in Ruby.'
|
9
|
-
spec.homepage = '
|
9
|
+
spec.homepage = 'https://github.com/armchairtheorist/simhash2'
|
10
10
|
spec.license = 'MIT'
|
11
11
|
|
12
12
|
spec.files = `git ls-files`.split("\n")
|
data/spec/simhash_spec.rb
CHANGED
@@ -1,51 +1,60 @@
|
|
1
|
-
require '
|
2
|
-
require 'fast-stemmer'
|
3
|
-
|
4
|
-
describe Simhash do
|
5
|
-
it 'should generate the same simhash for the same string, and a different simhash for a different string' do
|
6
|
-
str1 = 'I like going to the beach'
|
7
|
-
str2 = 'I like going to the beach'
|
8
|
-
str3 = 'I like going to the mall'
|
9
|
-
|
10
|
-
expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
|
11
|
-
expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'should
|
15
|
-
str1 =
|
16
|
-
str2 = '
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
expect(Simhash.generate(str1,
|
35
|
-
expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
|
36
|
-
end
|
37
|
-
|
38
|
-
it "should respect the '
|
39
|
-
str1 = '
|
40
|
-
str2 = '
|
41
|
-
|
42
|
-
|
43
|
-
expect(Simhash.generate(str1,
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
1
|
+
require 'simhash2'
|
2
|
+
require 'fast-stemmer'
|
3
|
+
|
4
|
+
describe Simhash do
|
5
|
+
it 'should generate the same simhash for the same string, and a different simhash for a different string' do
|
6
|
+
str1 = 'I like going to the beach'
|
7
|
+
str2 = 'I like going to the beach'
|
8
|
+
str3 = 'I like going to the mall'
|
9
|
+
|
10
|
+
expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
|
11
|
+
expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should calculate the same similarity for the same string, and a lower similarity for a different string' do
|
15
|
+
str1 = 'I like going to the beach'
|
16
|
+
str2 = 'I like going to the beach'
|
17
|
+
str3 = 'I like going to the mall'
|
18
|
+
|
19
|
+
expect(Simhash.similarity(str1, str2)).to eq 1.0
|
20
|
+
expect(Simhash.similarity(str1, str3)).to be < 1.0
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should strip punctuation and capitalization properly' do
|
24
|
+
str1 = "Hello, nurse! How's it going today... my man?"
|
25
|
+
str2 = 'hello nurse hows it going today my man'
|
26
|
+
|
27
|
+
expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should respect the 'unique' option" do
|
31
|
+
str1 = 'apple pear'
|
32
|
+
str2 = 'apple apple apple pear'
|
33
|
+
|
34
|
+
expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
|
35
|
+
expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should respect the 'stop_words' option" do
|
39
|
+
str1 = 'I like the man on the moon.'
|
40
|
+
str2 = 'like man moon'
|
41
|
+
stop_words = %w[i the on]
|
42
|
+
|
43
|
+
expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
|
44
|
+
expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should respect the 'stemming' option" do
|
48
|
+
str1 = 'My crazy cars have crazy minds!'
|
49
|
+
str2 = 'My crazi car have crazi mind!'
|
50
|
+
|
51
|
+
expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
|
52
|
+
expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should calculate hamming distances correctly' do
|
56
|
+
expect(Simhash.hamming_distance(2, 2)).to eq 0
|
57
|
+
expect(Simhash.hamming_distance(2, 3)).to eq 1
|
58
|
+
expect(Simhash.hamming_distance(255, 197)).to eq 4
|
59
|
+
end
|
60
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
require 'coveralls'
|
2
2
|
Coveralls.wear!
|
3
3
|
|
4
|
-
require 'simhash'
|
5
|
-
|
6
4
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
7
5
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
8
6
|
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simhash2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Wong
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -59,7 +59,7 @@ files:
|
|
59
59
|
- simhash2.gemspec
|
60
60
|
- spec/simhash_spec.rb
|
61
61
|
- spec/spec_helper.rb
|
62
|
-
homepage:
|
62
|
+
homepage: https://github.com/armchairtheorist/simhash2
|
63
63
|
licenses:
|
64
64
|
- MIT
|
65
65
|
metadata: {}
|