simhash2 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4649489bf0c74cadd1dc2387578fc76994d3bdf6
4
- data.tar.gz: 3eb87d8c24c0b476278b7c7fc18a8555c1d3215e
3
+ metadata.gz: 8e8190a872a7ec498e0a500482e8814758de6cb4
4
+ data.tar.gz: b0ea69a076ab1a99584010b3c85b6c3d17d37eea
5
5
  SHA512:
6
- metadata.gz: b4802c746e43f91b99b6cc6f17f7a2757666596b9b70104eae7b4581dbf63a878ac4741b10ba7686bc6d2b7749132c4bf2cc94bca8416614521dac80c1f5f8df
7
- data.tar.gz: d5e1cec7950aef8d40f81518644554cfc6e23af590c8249b7adf1feef56c13db452548611e182f6449ed4182367a14c3ca5c6a288e34c40f32e63889b3b4b168
6
+ metadata.gz: 290b5f9daf27c8d4a138e26aabc062b4c8896fb6506bb41da13ea5c1e44095286e82b9b34780b189d22fdae1e4f4d34d7dc0ce6438b9c47539584b82fb23a541
7
+ data.tar.gz: 5b961fbf6271a50571d557f628ef111ca7f9683378869bc9443635cb678b0a06942424287e5e6103702074f604a8ac39a06d2a2120c15b3bbfc26ee77d1cc5ab
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
1
  source 'http://rubygems.org'
2
2
 
3
- # Specify your gem's dependencies in simhash2.gemspec
3
+ # Specify your gem's dependencies in the .gemspec file
4
4
  gemspec
5
5
 
6
6
  gem 'coveralls', require: false
data/README.md CHANGED
@@ -1,8 +1,37 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/simhash2.svg)](https://badge.fury.io/rb/simhash2)
2
- [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash)
3
- [![Build Status](https://travis-ci.org/armchairtheorist/simhash.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash)
4
- [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash?branch=master)
2
+ [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash2/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash2)
3
+ [![Build Status](https://travis-ci.org/armchairtheorist/simhash2.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash2)
4
+ [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash2/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash2?branch=master)
5
5
 
6
- # Simhash
6
+ # Simhash2
7
+
8
+ **Simhash2** is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby. The key differences are that this gem doesn't monkey-patch the `String` and `Integer` classes, and that configuration is slightly easier. The simhash values generated by this gem with the default configuration should be identical to those generated by the Bookmate version.
9
+
10
+ ## Installation
11
+
12
+ Install the gem from RubyGems:
13
+
14
+ ```bash
15
+ gem install simhash2
16
+ ```
17
+
18
+ If you use Bundler, add it to your Gemfile and run `bundle install`:
19
+
20
+ ```ruby
21
+ gem 'simhash2'
22
+ ```
23
+
24
+ I have only tested this gem on Ruby 2.4.1, but there is no reason it shouldn't work on earlier Ruby versions as well.
25
+
26
+ ## Usage
27
+
28
+ ```ruby
29
+ str1 = "I am the king of the world!"
30
+ str2 = "I am the queen of the world!"
31
+
32
+ simhash1 = Simhash.generate(str1) # => 86798109229625320
33
+ simhash2 = Simhash.generate(str2) # => 13921220612431195624
34
+
35
+ Simhash.hamming_distance(simhash1, simhash2) # => 8
36
+ ```
7
37
 
8
- This is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby.
@@ -1,3 +1,3 @@
1
1
  module Simhash
2
- VERSION = '0.0.2'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
data/lib/simhash2.rb CHANGED
@@ -1,68 +1,76 @@
1
- require 'simhash2/version'
2
-
3
- module Simhash
4
- extend self
5
-
6
- HASHBITS = 64
7
-
8
- OPTIONS = {
9
- min_token_length: 1,
10
- unique: false,
11
- stemming: false,
12
- stop_words: []
13
-
14
- }.freeze
15
-
16
- def generate(str, options = {})
17
- generate_from_tokens(str.split(/\s+/), options)
18
- end
19
-
20
- def generate_from_tokens(tokens, options = {})
21
- filter_tokens(tokens, OPTIONS.merge(options))
22
-
23
- v = [0] * HASHBITS
24
-
25
- masks = v.dup
26
- masks.each_with_index { |_e, i| masks[i] = (1 << i) }
27
-
28
- hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
29
- hashes.each do |h|
30
- HASHBITS.times do |i|
31
- v[i] += (h & masks[i]).zero? ? -1 : +1
32
- end
33
- end
34
-
35
- simhash = 0
36
- HASHBITS.times { |i| simhash += 1 << i if v[i] >= 0 }
37
-
38
- simhash
39
- end
40
-
41
- def hamming_distance(simhash1, simhash2)
42
- (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
43
- end
44
-
45
- private
46
-
47
- def simple_string_hash(str, length)
48
- return 0 if str == ''
49
-
50
- x = str.bytes.first << 7
51
- m = 1_000_003
52
- mask = (1 << length) - 1
53
- str.each_byte { |char| x = ((x * m) ^ char.to_i) & mask }
54
-
55
- x ^= str.bytes.count
56
- x = -2 if x == -1
57
-
58
- x.to_i
59
- end
60
-
61
- def filter_tokens(tokens, options)
62
- tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
63
- tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
64
- tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
65
- tokens.map!(&:stem) if options[:stemming]
66
- tokens.uniq! if options[:unique]
67
- end
68
- end
1
+ require 'simhash2/version'
2
+
3
+ module Simhash
4
+ extend self
5
+
6
+ HASHBITS = 64
7
+
8
+ OPTIONS = {
9
+ min_token_length: 1,
10
+ unique: false,
11
+ stemming: false,
12
+ stop_words: []
13
+ }.freeze
14
+
15
+ def similarity(string1, string2, options = {})
16
+ return hash_similarity(generate(string1, options), generate(string2, options))
17
+ end
18
+
19
+ def generate(str, options = {})
20
+ generate_from_tokens(str.split(/\s+/), options)
21
+ end
22
+
23
+ def generate_from_tokens(tokens, options = {})
24
+ filter_tokens(tokens, OPTIONS.merge(options))
25
+
26
+ v = [0] * HASHBITS
27
+
28
+ masks = v.dup
29
+ masks.each_with_index { |_e, i| masks[i] = (1 << i) }
30
+
31
+ hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
32
+ hashes.each do |h|
33
+ HASHBITS.times do |i|
34
+ v[i] += (h & masks[i]).zero? ? -1 : +1
35
+ end
36
+ end
37
+
38
+ simhash = 0
39
+ HASHBITS.times { |i| simhash += 1 << i if v[i] >= 0 }
40
+
41
+ simhash
42
+ end
43
+
44
+ def hamming_distance(simhash1, simhash2)
45
+ (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
46
+ end
47
+
48
+ def hash_similarity(left, right)
49
+ return (1.0 - (hamming_distance(left, right).to_f / HASHBITS))
50
+ end
51
+
52
+ private
53
+
54
+ def simple_string_hash(str, length)
55
+ return 0 if str == ''
56
+
57
+ x = str.bytes.first << 7
58
+ m = 1_000_003
59
+ mask = (1 << length) - 1
60
+ str.each_byte { |char| x = ((x * m) ^ char.to_i) & mask }
61
+
62
+ x ^= str.bytes.count
63
+ x = -2 if x == -1
64
+
65
+ x.to_i
66
+ end
67
+
68
+ def filter_tokens(tokens, options)
69
+ tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
70
+ tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
71
+ tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
72
+ tokens.map!(&:stem) if options[:stemming]
73
+ tokens.uniq! if options[:unique]
74
+ end
75
+
76
+ end
data/simhash2.gemspec CHANGED
@@ -6,7 +6,7 @@ Gem::Specification.new do |spec|
6
6
  spec.authors = ['Jonathan Wong']
7
7
  spec.email = ['jonathan@armchairtheorist.com']
8
8
  spec.summary = 'A rewrite of the \'simhash\' gem, which is an implementation of Moses Charikar\'s simhashes in Ruby.'
9
- spec.homepage = 'http://github.com/armchairtheorist/simhash'
9
+ spec.homepage = 'https://github.com/armchairtheorist/simhash2'
10
10
  spec.license = 'MIT'
11
11
 
12
12
  spec.files = `git ls-files`.split("\n")
data/spec/simhash_spec.rb CHANGED
@@ -1,51 +1,60 @@
1
- require 'spec_helper'
2
- require 'fast-stemmer'
3
-
4
- describe Simhash do
5
- it 'should generate the same simhash for the same string, and a different simhash for a different string' do
6
- str1 = 'I like going to the beach'
7
- str2 = 'I like going to the beach'
8
- str3 = 'I like going to the mall'
9
-
10
- expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
11
- expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
12
- end
13
-
14
- it 'should strip punctuation and capitalization properly' do
15
- str1 = "Hello, nurse! How's it going today... my man?"
16
- str2 = 'hello nurse hows it going today my man'
17
-
18
- expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
19
- end
20
-
21
- it "should respect the 'unique' option" do
22
- str1 = 'apple pear'
23
- str2 = 'apple apple apple pear'
24
-
25
- expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
26
- expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
27
- end
28
-
29
- it "should respect the 'stop_words' option" do
30
- str1 = 'I like the man on the moon.'
31
- str2 = 'like man moon'
32
- stop_words = %w[i the on]
33
-
34
- expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
35
- expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
36
- end
37
-
38
- it "should respect the 'stemming' option" do
39
- str1 = 'My crazy cars have crazy minds!'
40
- str2 = 'My crazi car have crazi mind!'
41
-
42
- expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
43
- expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
44
- end
45
-
46
- it 'should calculate hamming distances correctly' do
47
- expect(Simhash.hamming_distance(2, 2)).to eq 0
48
- expect(Simhash.hamming_distance(2, 3)).to eq 1
49
- expect(Simhash.hamming_distance(255, 197)).to eq 4
50
- end
51
- end
1
+ require 'simhash2'
2
+ require 'fast-stemmer'
3
+
4
+ describe Simhash do
5
+ it 'should generate the same simhash for the same string, and a different simhash for a different string' do
6
+ str1 = 'I like going to the beach'
7
+ str2 = 'I like going to the beach'
8
+ str3 = 'I like going to the mall'
9
+
10
+ expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
11
+ expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
12
+ end
13
+
14
+ it 'should calculate the same similarity for the same string, and a lower similarity for a different string' do
15
+ str1 = 'I like going to the beach'
16
+ str2 = 'I like going to the beach'
17
+ str3 = 'I like going to the mall'
18
+
19
+ expect(Simhash.similarity(str1, str2)).to eq 1.0
20
+ expect(Simhash.similarity(str1, str3)).to be < 1.0
21
+ end
22
+
23
+ it 'should strip punctuation and capitalization properly' do
24
+ str1 = "Hello, nurse! How's it going today... my man?"
25
+ str2 = 'hello nurse hows it going today my man'
26
+
27
+ expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
28
+ end
29
+
30
+ it "should respect the 'unique' option" do
31
+ str1 = 'apple pear'
32
+ str2 = 'apple apple apple pear'
33
+
34
+ expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
35
+ expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
36
+ end
37
+
38
+ it "should respect the 'stop_words' option" do
39
+ str1 = 'I like the man on the moon.'
40
+ str2 = 'like man moon'
41
+ stop_words = %w[i the on]
42
+
43
+ expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
44
+ expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
45
+ end
46
+
47
+ it "should respect the 'stemming' option" do
48
+ str1 = 'My crazy cars have crazy minds!'
49
+ str2 = 'My crazi car have crazi mind!'
50
+
51
+ expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
52
+ expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
53
+ end
54
+
55
+ it 'should calculate hamming distances correctly' do
56
+ expect(Simhash.hamming_distance(2, 2)).to eq 0
57
+ expect(Simhash.hamming_distance(2, 3)).to eq 1
58
+ expect(Simhash.hamming_distance(255, 197)).to eq 4
59
+ end
60
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  require 'coveralls'
2
2
  Coveralls.wear!
3
3
 
4
- require 'simhash'
5
-
6
4
  # This file was generated by the `rspec --init` command. Conventionally, all
7
5
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
8
6
  # The generated `.rspec` file contains `--require spec_helper` which will cause
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Wong
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-12 00:00:00.000000000 Z
11
+ date: 2017-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -59,7 +59,7 @@ files:
59
59
  - simhash2.gemspec
60
60
  - spec/simhash_spec.rb
61
61
  - spec/spec_helper.rb
62
- homepage: http://github.com/armchairtheorist/simhash
62
+ homepage: https://github.com/armchairtheorist/simhash2
63
63
  licenses:
64
64
  - MIT
65
65
  metadata: {}