minhash 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0a991d33257686b6ef59fc014ec0d2c89ecfdadc
4
+ data.tar.gz: cb6fc0a7e12924bf9b04f602f4be49a402ecc486
5
+ SHA512:
6
+ metadata.gz: 0a34f7fa09f634100dc48aae9ad37fa1368098d1c93157001d2e8fc12028a0320fc89fa12596ce437ee4662e67aba3d94e293f29a237beff0ab5366a303e300a
7
+ data.tar.gz: 95008befcd4a1ea8863b8ca12dfe3724dd0496d45d59bc80c646809291df58ae653b5050719d133bb4a71779acb82ecceb5654f88ed3e465aaec12c5e766474e
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems over HTTPS so gem downloads are not exposed to tampering
# in transit (plain http:// offers no transport security).
source "https://rubygems.org"

# Dependencies are declared in the gemspec.
gemspec
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2014 Roland Swingler
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,21 @@
1
+ = minhash
2
+
3
+ An implementation of the Minhash algorithm in Ruby.
4
+
5
+ For more details, see Chapter 3 of http://www.mmds.org/
6
+
7
+ == Contributing to minhash
8
+
9
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
10
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
11
+ * Fork the project.
12
+ * Start a feature/bugfix branch.
13
+ * Commit and push until you are happy with your contribution.
14
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
15
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
16
+
17
+ == Copyright
18
+
19
+ Copyright (c) 2014 Roland Swingler. See LICENSE.txt for
20
+ further details.
21
+
@@ -0,0 +1,111 @@
1
+ require 'murmurhash3'
2
+ require 'set'
3
+
4
module Minhash
  # Generates the k-shingles for a String.
  #
  # For example, with the string "12345" and k = 3 the shingles are
  # "123", "234" and "345".
  #
  #   k_shingles("ABCDE", 3)
  #   # => ["ABC", "BCD", "CDE"]
  #
  # Tabs, newlines and carriage returns are replaced by spaces, and
  # runs of spaces are squeezed into a single space, before shingling.
  #
  # If the normalized text is shorter than k, the normalized text is
  # returned as the only shingle. k must be positive (a k of 0 raises
  # ArgumentError from each_cons).
  #
  # You want to select k to be large enough to be able to
  # distinguish chunks of text adequately - a value of around 9 is
  # sensible for reasonable sized chunks of standard written text.
  def self.k_shingles(text, k)
    normalized = text.tr("\t\n\r", " ").squeeze(" ")

    # The length check has to happen after normalization: squeezing
    # whitespace can shrink the text below k, and each_cons would then
    # yield no shingles at all.
    return [normalized] if normalized.size < k

    normalized.each_char.each_cons(k).map(&:join)
  end

  # Generates a Set of tokenized 32-bit integer k-shingles.
  #
  #   tokenized_k_shingles("ABCDE", 3)
  #   # => #<Set: {1136772405, 3561005568, 944681077}>
  #
  # MurmurHash3 is used by default; if you want to use a different
  # hash function, pass a block:
  #
  #   tokenized_k_shingles("ABCDE", 3) do |shingle|
  #     # obviously a terrible hash function, just an example.
  #     shingle[0].ord
  #   end
  #   # => #<Set: {65, 66, 67}>
  def self.tokenized_k_shingles(text, k, &hasher)
    hasher ||= lambda { |s| MurmurHash3::Native32.murmur3_32_str_hash(s) }
    k_shingles(text, k).map { |shingle| hasher.call(shingle) }.to_set
  end

  # Returns the jaccard similarity between two sets of shingles /
  # tokens, as a Float between 0.0 and 1.0.
  #
  # Two empty sets are treated as identical (1.0) rather than
  # producing NaN from 0 / 0.0.
  def self.jaccard_similarity(a, b)
    union_size = (a | b).size
    return 1.0 if union_size.zero?

    (a & b).size / union_size.to_f
  end

  # Returns the approximate jaccard similarity of 2 sets, given their
  # signatures.
  #
  # This is the fraction of positions in +a+ whose value equals the
  # value at the same position in +b+. Both signatures are expected to
  # have the same length (i.e. to come from the same Algorithm).
  #
  # Two empty signatures give 1.0; an empty +a+ against a non-empty
  # +b+ gives 0.0 (instead of NaN).
  def self.approximate_similarity(a, b)
    return(b.empty? ? 1.0 : 0.0) if a.empty?

    a.each_index.count { |i| a[i] == b[i] } / a.length.to_f
  end

  # Mixin to extend String with k-shingle functions.
  module StringExtensions
    # Generates the k-shingles for this String.
    #
    # See Minhash.k_shingles
    def k_shingles(k)
      Minhash.k_shingles(self, k)
    end

    # Generates the tokenized k-shingles for this String.
    #
    # See Minhash.tokenized_k_shingles
    def tokenized_k_shingles(k, &block)
      Minhash.tokenized_k_shingles(self, k, &block)
    end
  end

  # The Minhash signature algorithm.
  #
  # See section 3.3 of the http://www.mmds.org/ book:
  # http://infolab.stanford.edu/~ullman/mmds/ch3.pdf
  #
  # Simple XORs of random integer bit masks are used as the hash
  # functions.
  class Algorithm
    # Returns the (frozen) bit masks used to implement the hash
    # functions.
    attr_reader :masks

    # Creates a new instance of the algorithm, with the given bit
    # masks.
    #
    # The masks are copied before being frozen, so the Array passed in
    # by the caller is left unfrozen.
    def initialize(masks)
      @masks = masks.dup.freeze
    end

    # Creates a new instance of the algorithm with +length+ random bit
    # masks, each in the range 0...2**32.
    def self.create(length)
      # rand(n) excludes n itself, so 2**32 is the bound that covers
      # every 32-bit value.
      new(Array.new(length) { rand(2**32) })
    end

    # Returns the minhash signature for a set of tokens: for each
    # mask, the minimum of every token XORed with that mask.
    #
    # The signature has one entry per mask. If +tokens+ is empty every
    # entry is nil.
    def signature(tokens)
      @masks.map { |mask| tokens.map { |token| token ^ mask }.min }
    end
  end
end
107
+
108
# Mix the k-shingle helpers into every String, so that
# "text".k_shingles(k) and "text".tokenized_k_shingles(k) are available.
String.send(:include, Minhash::StringExtensions)
111
+
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the Minhash module: shingling, tokenizing, signatures and
# the two similarity measures.
describe "Minhash" do
  describe "generating k-shingles of a string" do
    it "k-shingles text" do
      expect( "12345".k_shingles(3) ).to eql(["123", "234", "345"])
    end

    it "generates a single shingle when the string is smaller than k" do
      expect( "12".k_shingles(3) ).to eql(["12"])
    end

    it "replaces whitespace characters with a space" do
      expect( "\t2\t4\n".k_shingles(3) ).to eql([" 2 ", "2 4", " 4 "])
    end

    it "squeezes consecutive whitespace characters" do
      expect("22 \t \n".k_shingles(3)).to eql(["22 "])
    end

    it "generates tokenized k-shingles" do
      expect("ABCDE".tokenized_k_shingles(3)).
        to eql(Set.new([1136772405, 3561005568, 944681077]))
    end

    it "tokenizes shingles with a custom hash function" do
      expect("ABCDE".tokenized_k_shingles(3) {|s| s[0].ord }).
        to eql(Set.new([65,66,67]))
    end
  end

  describe "generating a Minhash signature for text" do
    it "returns a signature of the tokens" do
      algorithm = Minhash::Algorithm.new([1,2])
      expect(algorithm.signature([1, 2, 3])).to eql([0, 0])
    end

    it "can create X masks internally" do
      algorithm = Minhash::Algorithm.create(20)
      expect(algorithm.masks.size).to eql(20)
      signature = algorithm.signature([1])
      expect(signature.size).to eql(20)
      # Fixnum was folded into Integer in Ruby 2.4 and the constant was
      # removed in 3.2; is_a?(Integer) is true for integers on both old
      # and new Rubies.
      expect(signature.all? {|i| i.is_a?(Integer) }).to eql(true)
    end
  end

  it "returns the jaccard similarity of 2 sets" do
    a = Set.new([1,2,3])
    b = Set.new([2,3,4])
    expect(Minhash.jaccard_similarity(a,b)).to eql(0.5)
  end

  it "returns the approximate similarity for 2 signatures" do
    a = [1,2,3,4]
    b = [2,2,3,4]
    expect(Minhash.approximate_similarity(a,b)).to eql(0.75)
  end
end
@@ -0,0 +1,28 @@
1
require 'simplecov'

# Extends SimpleCov's configuration with a helper that discards every
# filter registered so far.
module SimpleCov::Configuration
  def clean_filters
    @filters = []
  end
end

# Start from an empty filter list, then load the 'test_frameworks'
# adapter.
SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is only collected when the COVERAGE environment variable is
# set.
if ENV["COVERAGE"]
  SimpleCov.start do
    add_filter "/.rvm/"
  end
end

# Put the spec directory first on the load path, followed by lib/.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(spec_dir, File.join(spec_dir, '..', 'lib'))

require 'rspec'
require 'minhash'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each { |support_file| require support_file }

RSpec.configure do |config|
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: minhash
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Roland Swingler
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: murmurhash3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1'
41
+ description: Minhash algorithm implementation
42
+ email: roland.swingler@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files:
46
+ - LICENSE.txt
47
+ - README.rdoc
48
+ files:
49
+ - ".document"
50
+ - ".rspec"
51
+ - Gemfile
52
+ - LICENSE.txt
53
+ - README.rdoc
54
+ - lib/minhash.rb
55
+ - spec/minhash_spec.rb
56
+ - spec/spec_helper.rb
57
+ homepage: http://github.com/knaveofdiamonds/minhash
58
+ licenses:
59
+ - MIT
60
+ metadata: {}
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubyforge_project:
77
+ rubygems_version: 2.2.2
78
+ signing_key:
79
+ specification_version: 4
80
+ summary: Minhash algorithm implementation in ruby
81
+ test_files: []
82
+ has_rdoc: