simhash2 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4649489bf0c74cadd1dc2387578fc76994d3bdf6
4
- data.tar.gz: 3eb87d8c24c0b476278b7c7fc18a8555c1d3215e
3
+ metadata.gz: 8e8190a872a7ec498e0a500482e8814758de6cb4
4
+ data.tar.gz: b0ea69a076ab1a99584010b3c85b6c3d17d37eea
5
5
  SHA512:
6
- metadata.gz: b4802c746e43f91b99b6cc6f17f7a2757666596b9b70104eae7b4581dbf63a878ac4741b10ba7686bc6d2b7749132c4bf2cc94bca8416614521dac80c1f5f8df
7
- data.tar.gz: d5e1cec7950aef8d40f81518644554cfc6e23af590c8249b7adf1feef56c13db452548611e182f6449ed4182367a14c3ca5c6a288e34c40f32e63889b3b4b168
6
+ metadata.gz: 290b5f9daf27c8d4a138e26aabc062b4c8896fb6506bb41da13ea5c1e44095286e82b9b34780b189d22fdae1e4f4d34d7dc0ce6438b9c47539584b82fb23a541
7
+ data.tar.gz: 5b961fbf6271a50571d557f628ef111ca7f9683378869bc9443635cb678b0a06942424287e5e6103702074f604a8ac39a06d2a2120c15b3bbfc26ee77d1cc5ab
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
1
  source 'http://rubygems.org'
2
2
 
3
- # Specify your gem's dependencies in simhash2.gemspec
3
+ # Specify your gem's dependencies in the .gemspec file
4
4
  gemspec
5
5
 
6
6
  gem 'coveralls', require: false
data/README.md CHANGED
@@ -1,8 +1,37 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/simhash2.svg)](https://badge.fury.io/rb/simhash2)
2
- [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash)
3
- [![Build Status](https://travis-ci.org/armchairtheorist/simhash.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash)
4
- [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash?branch=master)
2
+ [![Code Climate](https://codeclimate.com/github/armchairtheorist/simhash2/badges/gpa.svg)](https://codeclimate.com/github/armchairtheorist/simhash2)
3
+ [![Build Status](https://travis-ci.org/armchairtheorist/simhash2.svg?branch=master)](https://travis-ci.org/armchairtheorist/simhash2)
4
+ [![Coverage Status](https://coveralls.io/repos/github/armchairtheorist/simhash2/badge.svg?branch=master)](https://coveralls.io/github/armchairtheorist/simhash2?branch=master)
5
5
 
6
- # Simhash
6
+ # Simhash2
7
+
8
+ **Simhash2** is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby. The key differences are that this gem doesn't monkey-patch the `String` and `Integer` classes, and that configuration is slightly easier. With the default configuration, the simhash values generated by this gem should be identical to those generated by the Bookmate version.
9
+
10
+ ## Installation
11
+
12
+ Install the gem from RubyGems:
13
+
14
+ ```bash
15
+ gem install simhash2
16
+ ```
17
+
18
+ If you use Bundler, add the following line to your Gemfile and run `bundle install`:
19
+
20
+ ```ruby
21
+ gem 'simhash2'
22
+ ```
23
+
24
+ I have only tested this gem on Ruby 2.4.1, but there shouldn't be any reason why it wouldn't work on earlier Ruby versions as well.
25
+
26
+ ## Usage
27
+
28
+ ```ruby
29
+ str1 = "I am the king of the world!"
30
+ str2 = "I am the queen of the world!"
31
+
32
+ simhash1 = Simhash.generate(str1) # => 86798109229625320
33
+ simhash2 = Simhash.generate(str2) # => 13921220612431195624
34
+
35
+ Simhash.hamming_distance(simhash1, simhash2) # => 8
36
+ ```
7
37
 
8
- This is a rewrite of the [bookmate/simhash](https://github.com/bookmate/simhash) gem, which is an implementation of Moses Charikar's simhashes in Ruby.
data/lib/simhash2/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Simhash
2
- VERSION = '0.0.2'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
data/lib/simhash2.rb CHANGED
@@ -1,68 +1,76 @@
1
- require 'simhash2/version'
2
-
3
- module Simhash
4
- extend self
5
-
6
- HASHBITS = 64
7
-
8
- OPTIONS = {
9
- min_token_length: 1,
10
- unique: false,
11
- stemming: false,
12
- stop_words: []
13
-
14
- }.freeze
15
-
16
- def generate(str, options = {})
17
- generate_from_tokens(str.split(/\s+/), options)
18
- end
19
-
20
- def generate_from_tokens(tokens, options = {})
21
- filter_tokens(tokens, OPTIONS.merge(options))
22
-
23
- v = [0] * HASHBITS
24
-
25
- masks = v.dup
26
- masks.each_with_index { |_e, i| masks[i] = (1 << i) }
27
-
28
- hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
29
- hashes.each do |h|
30
- HASHBITS.times do |i|
31
- v[i] += (h & masks[i]).zero? ? -1 : +1
32
- end
33
- end
34
-
35
- simhash = 0
36
- HASHBITS.times { |i| simhash += 1 << i if v[i] >= 0 }
37
-
38
- simhash
39
- end
40
-
41
- def hamming_distance(simhash1, simhash2)
42
- (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
43
- end
44
-
45
- private
46
-
47
- def simple_string_hash(str, length)
48
- return 0 if str == ''
49
-
50
- x = str.bytes.first << 7
51
- m = 1_000_003
52
- mask = (1 << length) - 1
53
- str.each_byte { |char| x = ((x * m) ^ char.to_i) & mask }
54
-
55
- x ^= str.bytes.count
56
- x = -2 if x == -1
57
-
58
- x.to_i
59
- end
60
-
61
- def filter_tokens(tokens, options)
62
- tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
63
- tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
64
- tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
65
- tokens.map!(&:stem) if options[:stemming]
66
- tokens.uniq! if options[:unique]
67
- end
68
- end
1
+ require 'simhash2/version'
2
+
3
+ module Simhash
4
+ extend self
5
+
6
+ HASHBITS = 64
7
+
8
+ OPTIONS = {
9
+ min_token_length: 1,
10
+ unique: false,
11
+ stemming: false,
12
+ stop_words: []
13
+ }.freeze
14
+
15
+ def similarity(string1, string2, options = {})
16
+ return hash_similarity(generate(string1, options), generate(string2, options))
17
+ end
18
+
19
+ def generate(str, options = {})
20
+ generate_from_tokens(str.split(/\s+/), options)
21
+ end
22
+
23
+ def generate_from_tokens(tokens, options = {})
24
+ filter_tokens(tokens, OPTIONS.merge(options))
25
+
26
+ v = [0] * HASHBITS
27
+
28
+ masks = v.dup
29
+ masks.each_with_index { |_e, i| masks[i] = (1 << i) }
30
+
31
+ hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
32
+ hashes.each do |h|
33
+ HASHBITS.times do |i|
34
+ v[i] += (h & masks[i]).zero? ? -1 : +1
35
+ end
36
+ end
37
+
38
+ simhash = 0
39
+ HASHBITS.times { |i| simhash += 1 << i if v[i] >= 0 }
40
+
41
+ simhash
42
+ end
43
+
44
+ def hamming_distance(simhash1, simhash2)
45
+ (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
46
+ end
47
+
48
+ def hash_similarity(left, right)
49
+ return (1.0 - (hamming_distance(left, right).to_f / HASHBITS))
50
+ end
51
+
52
+ private
53
+
54
+ def simple_string_hash(str, length)
55
+ return 0 if str == ''
56
+
57
+ x = str.bytes.first << 7
58
+ m = 1_000_003
59
+ mask = (1 << length) - 1
60
+ str.each_byte { |char| x = ((x * m) ^ char.to_i) & mask }
61
+
62
+ x ^= str.bytes.count
63
+ x = -2 if x == -1
64
+
65
+ x.to_i
66
+ end
67
+
68
+ def filter_tokens(tokens, options)
69
+ tokens.map! { |e| e.downcase.gsub(/\W+/, '') }
70
+ tokens.reject! { |e| e.nil? || e.length < options[:min_token_length] }
71
+ tokens.reject! { |e| options[:stop_words].include?(e) } unless options[:stop_words].nil? || options[:stop_words].empty?
72
+ tokens.map!(&:stem) if options[:stemming]
73
+ tokens.uniq! if options[:unique]
74
+ end
75
+
76
+ end
data/simhash2.gemspec CHANGED
@@ -6,7 +6,7 @@ Gem::Specification.new do |spec|
6
6
  spec.authors = ['Jonathan Wong']
7
7
  spec.email = ['jonathan@armchairtheorist.com']
8
8
  spec.summary = 'A rewrite of the \'simhash\' gem, which is an implementation of Moses Charikar\'s simhashes in Ruby.'
9
- spec.homepage = 'http://github.com/armchairtheorist/simhash'
9
+ spec.homepage = 'https://github.com/armchairtheorist/simhash2'
10
10
  spec.license = 'MIT'
11
11
 
12
12
  spec.files = `git ls-files`.split("\n")
data/spec/simhash_spec.rb CHANGED
@@ -1,51 +1,60 @@
1
- require 'spec_helper'
2
- require 'fast-stemmer'
3
-
4
- describe Simhash do
5
- it 'should generate the same simhash for the same string, and a different simhash for a different string' do
6
- str1 = 'I like going to the beach'
7
- str2 = 'I like going to the beach'
8
- str3 = 'I like going to the mall'
9
-
10
- expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
11
- expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
12
- end
13
-
14
- it 'should strip punctuation and capitalization properly' do
15
- str1 = "Hello, nurse! How's it going today... my man?"
16
- str2 = 'hello nurse hows it going today my man'
17
-
18
- expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
19
- end
20
-
21
- it "should respect the 'unique' option" do
22
- str1 = 'apple pear'
23
- str2 = 'apple apple apple pear'
24
-
25
- expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
26
- expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
27
- end
28
-
29
- it "should respect the 'stop_words' option" do
30
- str1 = 'I like the man on the moon.'
31
- str2 = 'like man moon'
32
- stop_words = %w[i the on]
33
-
34
- expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
35
- expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
36
- end
37
-
38
- it "should respect the 'stemming' option" do
39
- str1 = 'My crazy cars have crazy minds!'
40
- str2 = 'My crazi car have crazi mind!'
41
-
42
- expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
43
- expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
44
- end
45
-
46
- it 'should calculate hamming distances correctly' do
47
- expect(Simhash.hamming_distance(2, 2)).to eq 0
48
- expect(Simhash.hamming_distance(2, 3)).to eq 1
49
- expect(Simhash.hamming_distance(255, 197)).to eq 4
50
- end
51
- end
1
+ require 'simhash2'
2
+ require 'fast-stemmer'
3
+
4
+ describe Simhash do
5
+ it 'should generate the same simhash for the same string, and a different simhash for a different string' do
6
+ str1 = 'I like going to the beach'
7
+ str2 = 'I like going to the beach'
8
+ str3 = 'I like going to the mall'
9
+
10
+ expect(Simhash.generate(str1)).to eq Simhash.generate(str2)
11
+ expect(Simhash.generate(str1)).not_to eq Simhash.generate(str3)
12
+ end
13
+
14
+ it 'should calculate the same similarity for the same string, and a lower similarity for a different string' do
15
+ str1 = 'I like going to the beach'
16
+ str2 = 'I like going to the beach'
17
+ str3 = 'I like going to the mall'
18
+
19
+ expect(Simhash.similarity(str1, str2)).to eq 1.0
20
+ expect(Simhash.similarity(str1, str3)).to be < 1.0
21
+ end
22
+
23
+ it 'should strip punctuation and capitalization properly' do
24
+ str1 = "Hello, nurse! How's it going today... my man?"
25
+ str2 = 'hello nurse hows it going today my man'
26
+
27
+ expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
28
+ end
29
+
30
+ it "should respect the 'unique' option" do
31
+ str1 = 'apple pear'
32
+ str2 = 'apple apple apple pear'
33
+
34
+ expect(Simhash.generate(str1, unique: true)).to eq Simhash.generate(str2, unique: true)
35
+ expect(Simhash.generate(str1, unique: false)).not_to eq Simhash.generate(str2, unique: false)
36
+ end
37
+
38
+ it "should respect the 'stop_words' option" do
39
+ str1 = 'I like the man on the moon.'
40
+ str2 = 'like man moon'
41
+ stop_words = %w[i the on]
42
+
43
+ expect(Simhash.generate(str1, stop_words: stop_words)).to eq Simhash.generate(str2, stop_words: stop_words)
44
+ expect(Simhash.generate(str1)).not_to eq Simhash.generate(str2)
45
+ end
46
+
47
+ it "should respect the 'stemming' option" do
48
+ str1 = 'My crazy cars have crazy minds!'
49
+ str2 = 'My crazi car have crazi mind!'
50
+
51
+ expect(Simhash.generate(str1, stemming: true)).to eq Simhash.generate(str2, stemming: true)
52
+ expect(Simhash.generate(str1, stemming: false)).not_to eq Simhash.generate(str2, stemming: false)
53
+ end
54
+
55
+ it 'should calculate hamming distances correctly' do
56
+ expect(Simhash.hamming_distance(2, 2)).to eq 0
57
+ expect(Simhash.hamming_distance(2, 3)).to eq 1
58
+ expect(Simhash.hamming_distance(255, 197)).to eq 4
59
+ end
60
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  require 'coveralls'
2
2
  Coveralls.wear!
3
3
 
4
- require 'simhash'
5
-
6
4
  # This file was generated by the `rspec --init` command. Conventionally, all
7
5
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
8
6
  # The generated `.rspec` file contains `--require spec_helper` which will cause
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simhash2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Wong
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-12 00:00:00.000000000 Z
11
+ date: 2017-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -59,7 +59,7 @@ files:
59
59
  - simhash2.gemspec
60
60
  - spec/simhash_spec.rb
61
61
  - spec/spec_helper.rb
62
- homepage: http://github.com/armchairtheorist/simhash
62
+ homepage: https://github.com/armchairtheorist/simhash2
63
63
  licenses:
64
64
  - MIT
65
65
  metadata: {}