simple_similarity 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in simple_similarity.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Ori Pekelman
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,79 @@
1
+ # SimpleSimilarity
2
+
3
+ This is an implementation of a very trivial, and very costly, algorithm to find
4
+ Similarity in a collection of items. It calculates a distance between the
5
+ length of the items compressed each on its own, and compressed when concatenated.
6
+ The theory is sound.
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'simple_similarity'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install simple_similarity
21
+
22
+ ## Usage
23
+
24
+ Get the distance between "Hello" and "World"; it always takes two arguments.
25
+
26
+ SimpleSimilarity::SimpleSimilarity.distance("Hello","World")
27
+
28
+ $ 0.125
29
+
30
+ SimpleSimilarity::SimpleSimilarity.case_insensitive_distance("Hello","hello")
31
+
32
+ $ 1.0
33
+
34
+ Get a distance matrix, the first argument is an array of elements, the second
35
+ is a cutoff ratio for the detected similarities (0.5 is quite similar, above
36
+ 0.8 we are basically identical, depending on the length of the original strings..)
37
+
38
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, hell, world), 0.5, true, false)
39
+
40
+ $ => {
41
+ :matrix=>[
42
+ [nil, 0.5833333333333333, 0.11764705882352944],
43
+ [nil, nil, 0.125],
44
+ [nil, nil, nil]
45
+ ],
46
+ :similar_terms=>[{
47
+ :first_term=>[0, "hello,"],
48
+ :second_term=>[1, "hell,"],
49
+ :score=>0.5833333333333333}]}
50
+
51
+ ### Caveat :
52
+ 1. this algorithm costs (n^2/2) - n in time and n^2 in memory
53
+ >So if n= 100,000
54
+ > there will be 4,999,900,000 calculations
55
+ 2. setting only_hits to true returns only the hits, not the matrix, which reduces memory usage.
56
+ ### Note:
57
+ the matrix can be used later for clustering....
58
+
59
+ ## Testing
60
+ spec:
61
+
62
+ bundle exec rspec spec
63
+
64
+
65
+ Benchmark:
66
+
67
+ On my machine comparing 100 items takes around 8 seconds:
68
+
69
+ bundle exec ruby spec/benchmark.rb
70
+
71
+ If what you want to compare are longer documents, swap the
72
+
73
+ ## Contributing
74
+
75
+ 1. Fork it
76
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
77
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
78
+ 4. Push to the branch (`git push origin my-new-feature`)
79
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,78 @@
1
+ require "simple_similarity/version"
2
+ require "zlib"
3
+
4
module SimpleSimilarity
  # Compression-based similarity scoring.
  #
  # Two values are considered similar when deflating their concatenation
  # yields a stream that is noticeably shorter than the sum of the two
  # individually deflated streams.
  class SimpleSimilarity
    # Constant overhead removed from every deflated length before the ratio
    # is computed.
    # NOTE(review): the original comment said "8 characters is the minimal
    # zlib compressed stream" while the code subtracts 7. The value 7 is kept
    # because changing it would alter every score this gem produces.
    MINIMAL_ZIP_LENGTH = 7

    # Inputs shorter than this are repeated before compression.
    SHORT_INPUT_LENGTH = 40

    # Number of repetitions applied to short inputs.
    SHORT_INPUT_REPEAT = 10

    # Scores the similarity of +a+ and +b+ by comparing
    # length(deflate(a)) + length(deflate(b)) to length(deflate(a + b)).
    #
    # Both arguments are coerced with +to_s+.
    #
    # Returns a Float: 1.0 when both strings are equal, 0.0 when exactly one
    # of them is empty, otherwise ((azip + bzip) / aplusbzip) - 1.
    def self.distance(a, b)
      a = a.to_s
      b = b.to_s

      # Optimization: equal strings need no compression at all.
      return 1.0 if a == b

      # Optimization: a and b differ here, so if either one is empty the
      # other is not and they are treated as totally different.
      return 0.0 if a.empty? || b.empty?

      # This is hacky, but useful: when strings are too short the compression
      # is not sufficient, so both inputs are repeated.
      if a.length < SHORT_INPUT_LENGTH || b.length < SHORT_INPUT_LENGTH
        a *= SHORT_INPUT_REPEAT
        b *= SHORT_INPUT_REPEAT
      end

      azip = Zlib.deflate(a).length - MINIMAL_ZIP_LENGTH
      bzip = Zlib.deflate(b).length - MINIMAL_ZIP_LENGTH
      aplusbzip = Zlib.deflate(a + b).length - MINIMAL_ZIP_LENGTH

      # The -1 is there because the raw ratio is about 1.0 for totally
      # different inputs and about 2.0 for identical ones.
      ((azip + bzip).to_f / aplusbzip) - 1
    end

    # Same as +distance+, but both arguments are first normalized: coerced
    # with +to_s+, downcased, and every non-alphanumeric character replaced
    # by a space.
    def self.case_insensitive_distance(a, b)
      distance(normalize(a), normalize(b))
    end

    # Compares every pair (i, j) with i < j of +arr+, i.e. n * (n - 1) / 2
    # comparisons for n elements. When the matrix is requested it holds
    # n * n cells.
    #
    # arr              - indexable collection of items to compare
    # cutoff           - minimum score for a pair to be reported as similar
    # case_insensitive - when true, use +case_insensitive_distance+
    # only_hits        - when true (the default) no matrix is built and
    #                    :matrix is an empty array
    #
    # Returns a Hash with two keys:
    #   :matrix        - upper-triangular score matrix (cells on and below
    #                    the diagonal stay nil), or [] when only_hits is true
    #   :similar_terms - Array of Hashes
    #                    {:first_term => [i, arr[i]],
    #                     :second_term => [j, arr[j]],
    #                     :score => score} for every pair scoring >= cutoff
    #
    # The matrix can be used later for clustering. Example from the original
    # author (exact scores depend on the zlib output):
    #
    #                     "hello", "hello", "world", "hello world", "hello cruel world"
    # "hello"             [nil, 1.0, 0.125, 0.41176470588235303, 0.28],
    # "hello"             [nil, nil, 0.125, 0.41176470588235303, 0.28],
    # "world"             [nil, nil, nil, 0.41176470588235303, 0.28],
    # "hello world"       [nil, nil, nil, nil, 0.52],
    # "hello cruel world" [nil, nil, nil, nil, nil]
    def self.distance_matrix(arr, cutoff = 0.5, case_insensitive = false, only_hits = true)
      sims = []
      len = arr.length
      mat = only_hits ? [] : Array.new(len) { Array.new(len) }

      # Exclusive ranges: the original iterated i up to len inclusive, which
      # only worked because the inner range was empty on that last pass.
      (0...len).each do |i|
        ((i + 1)...len).each do |j|
          dist = if case_insensitive
                   case_insensitive_distance(arr[i], arr[j])
                 else
                   distance(arr[i], arr[j])
                 end

          if dist >= cutoff
            sims.push({ :first_term => [i, arr[i]], :second_term => [j, arr[j]], :score => dist })
          end
          mat[i][j] = dist unless only_hits
        end
      end

      { :matrix => mat, :similar_terms => sims }
    end

    # Coerces +value+ to a String, downcases it and blanks out every
    # non-alphanumeric character.
    def self.normalize(value)
      value.to_s.downcase.gsub(/[^[:alnum:]]/, ' ')
    end
    private_class_method :normalize
  end
end
@@ -0,0 +1,3 @@
1
+ module SimpleSimilarity
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'simple_similarity/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "simple_similarity"
8
+ gem.version = SimpleSimilarity::VERSION
9
+ gem.authors = ["Ori Pekelman"]
10
+ gem.email = ["ori@pekelman.com"]
11
+ gem.description = %q{A simple (slow but beautiful) algorithm to find similar things}
12
+ gem.summary = %q{A simple (slow but beautiful) algorithm to find similar things}
13
+ gem.homepage = "https://github.com/OriPekelman/simple_similarity"
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+ gem.add_development_dependency "rspec", "~> 2.6"
20
+ gem.add_development_dependency "pry"
21
+ gem.add_development_dependency "awesome_print"
22
+ gem.add_development_dependency "colorize"
23
+ end
data/spec/benchmark.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'simple_similarity'
2
+ require "benchmark"
3
+
4
+ a = []
5
+ CHARS = (?a..?z).to_a + ["e","e","e","e","e","a","a","a","a","a","a","a","a","a","a","a","a"]
6
+ 1000.times {a.push(5.times.inject("") {|s, i| s << CHARS[rand(CHARS.size)]})}
7
+
8
+ time = Benchmark.measure do
9
+ puts SimpleSimilarity::SimpleSimilarity.distance_matrix(a, 0.5, false, true)[:similar_terms]
10
+ end
11
+ puts "Without matrix"
12
+ puts time
13
+
14
+ time = Benchmark.measure do
15
+ puts SimpleSimilarity::SimpleSimilarity.distance_matrix(a, 0.5, false, false)[:similar_terms]
16
+ end
17
+ puts "With matrix"
18
+ puts time
@@ -0,0 +1,41 @@
1
+ require 'simple_similarity'
2
+
3
+ describe SimpleSimilarity::SimpleSimilarity do
4
+ it "Hello is identical to Hello" do
5
+ SimpleSimilarity::SimpleSimilarity.distance("Hello","Hello").should eql(1.0)
6
+ end
7
+
8
+ it "Hello is not similar to world" do
9
+ SimpleSimilarity::SimpleSimilarity.distance("Hello","World").should eql(0.125)
10
+ end
11
+
12
+ it "Hello is somewhat similar hello" do
13
+ SimpleSimilarity::SimpleSimilarity.distance("Hello","hello").should eql(0.5)
14
+ end
15
+
16
+ it "Hello is identical to hello when case insensitive" do
17
+ SimpleSimilarity::SimpleSimilarity.case_insensitive_distance("Hello","hello").should eql(1.0)
18
+ end
19
+
20
+ it "from : hello, Hello, hell, world, cruel, crude with a cutoff of 0.5 we should have two couples" do
21
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, Hello, hell, world, cruel, crud), 0.5)[:similar_terms].length.should eql(2)
22
+ end
23
+
24
+ it "from : hello, Hello, hell, world, cruel, crude with a cutoff of 0.5 we should have three couples case insensitive" do
25
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, Hello, hell, world, cruel, crud), 0.5, true)[:similar_terms].length.should eql(3)
26
+ end
27
+
28
+ it "from : hello world, Hello world, hell world, world, cruel, crude with a cutoff of 1.0 we should have seven couples case insensitive" do
29
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello world, Hello world, hell world, world, cruel, crude), 1.0, true)[:similar_terms].length.should eql(7)
30
+ end
31
+
32
+ it "setting only_hits to false gives back a matrix" do
33
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, world), 1.0, true, false)[:matrix].length.should eql(2)
34
+ end
35
+
36
+ it "by default does not gives an empty matrix" do
37
+ SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, world))[:matrix].length.should eql(0)
38
+ end
39
+
40
+
41
+ end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple_similarity
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ori Pekelman
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.6'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.6'
30
+ - !ruby/object:Gem::Dependency
31
+ name: pry
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: awesome_print
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: colorize
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: A simple (slow but beautiful) algorithm to find similar things
79
+ email:
80
+ - ori@pekelman.com
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .gitignore
86
+ - Gemfile
87
+ - LICENSE.txt
88
+ - README.md
89
+ - Rakefile
90
+ - lib/simple_similarity.rb
91
+ - lib/simple_similarity/version.rb
92
+ - simple_similarity.gemspec
93
+ - spec/benchmark.rb
94
+ - spec/simple_similarity_spec.rb
95
+ homepage: https://github.com/OriPekelman/simple_similarity
96
+ licenses: []
97
+ post_install_message:
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ! '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ! '>='
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ requirements: []
114
+ rubyforge_project:
115
+ rubygems_version: 1.8.24
116
+ signing_key:
117
+ specification_version: 3
118
+ summary: A simple (slow but beautiful) algorithm to find similar things
119
+ test_files:
120
+ - spec/benchmark.rb
121
+ - spec/simple_similarity_spec.rb
122
+ has_rdoc: