simple_similarity 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +79 -0
- data/Rakefile +1 -0
- data/lib/simple_similarity.rb +78 -0
- data/lib/simple_similarity/version.rb +3 -0
- data/simple_similarity.gemspec +23 -0
- data/spec/benchmark.rb +18 -0
- data/spec/simple_similarity_spec.rb +41 -0
- metadata +122 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Ori Pekelman
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
# SimpleSimilarity
|
2
|
+
|
3
|
+
This is an implementation of a very trivial, and very costly, algorithm to find
|
4
|
+
Similarity in a collection of items. It calculates a distance between the
|
5
|
+
length of the items compressed each on its own, and compressed when concatenated.
|
6
|
+
The theory is sound.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
gem 'simple_similarity'
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install simple_similarity
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
Get the distance between "Hello" and "World"; the method always takes two arguments.
|
25
|
+
|
26
|
+
SimpleSimilarity::SimpleSimilarity.distance("Hello","World")
|
27
|
+
|
28
|
+
$ 0.125
|
29
|
+
|
30
|
+
SimpleSimilarity::SimpleSimilarity.case_insensitive_distance("Hello","hello")
|
31
|
+
|
32
|
+
$ 1.0
|
33
|
+
|
34
|
+
Get a distance matrix, the first argument is an array of elements, the second
|
35
|
+
is a cutoff ratio for the detected similarities (0.5 is quite similar, above
|
36
|
+
0.8 we are basically identical, depending on the length of the original strings..)
|
37
|
+
|
38
|
+
SimpleSimilarity::SimpleSimilarity.distance_matrix(%w( hello, hell, world), 0.5, true, false)
|
39
|
+
|
40
|
+
$ => {
|
41
|
+
:matrix=>[
|
42
|
+
[nil, 0.5833333333333333, 0.11764705882352944],
|
43
|
+
[nil, nil, 0.125],
|
44
|
+
[nil, nil, nil]
|
45
|
+
],
|
46
|
+
:similar_terms=>[{
|
47
|
+
:first_term=>[0, "hello,"],
|
48
|
+
:second_term=>[1, "hell,"],
|
49
|
+
:score=>0.5833333333333333}]}
|
50
|
+
|
51
|
+
### Caveat :
|
52
|
+
1. this algorithm costs (n^2/2) - n in time and n^2 in memory
|
53
|
+
>So if n= 100,000
|
54
|
+
> there will be 4,999,900,000 calculations
|
55
|
+
2. setting only_hits to true returns only the hits (not the matrix), which reduces memory usage.
|
56
|
+
### Note:
|
57
|
+
the matrix can be used later for clustering....
|
58
|
+
|
59
|
+
## Testing
|
60
|
+
spec:
|
61
|
+
|
62
|
+
bundle exec rspec spec
|
63
|
+
|
64
|
+
|
65
|
+
Benchmark:
|
66
|
+
|
67
|
+
On my machine comparing 100 items takes around 8 seconds:
|
68
|
+
|
69
|
+
bundle exec ruby spec/benchmark.rb
|
70
|
+
|
71
|
+
If what you want to compare are longer documents, swap the
|
72
|
+
|
73
|
+
## Contributing
|
74
|
+
|
75
|
+
1. Fork it
|
76
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
77
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
78
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
79
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "simple_similarity/version"
|
2
|
+
require "zlib"
|
3
|
+
|
4
|
+
module SimpleSimilarity
  # Compression-based similarity scoring.
  #
  # Two values are considered similar when compressing them together takes
  # noticeably less space than compressing each one separately. Despite the
  # method names, the returned "distance" is a similarity score: higher means
  # more alike (identical inputs score 1.0, an empty input against a non-empty
  # one scores 0.0).
  class SimpleSimilarity
    # Constant overhead removed from every compressed length. The original
    # comment cites 8 bytes as the minimal zlib stream while the code subtracts
    # 7; the value is kept as-is because the documented scores (for example
    # 0.125 for "Hello"/"World") depend on it.
    MINIMAL_ZIP_LENGTH = 7

    # Inputs shorter than this are repeated before compression, because very
    # short strings do not compress enough to give a meaningful ratio.
    SHORT_STRING_LENGTH = 40

    # How many times short inputs are repeated.
    SHORT_STRING_REPEAT = 10

    # Everything that is not a letter or a digit.
    NON_ALNUM = /[^[:alnum:]]/

    # Calculates similarity by comparing the length of compressed(a) plus the
    # length of compressed(b) to the length of compressed(a and b concatenated).
    #
    # Both arguments are converted with +to_s+ first, so nil, symbols and
    # numbers are accepted.
    #
    # Returns a Float: 1.0 when both strings are equal, 0.0 when exactly one of
    # them is empty, otherwise the compression ratio minus 1.
    def self.distance(a, b)
      a = a.to_s
      b = b.to_s

      # Optimization: equal strings need no compression at all.
      return 1.0 if a == b

      # Optimization: the strings differ here, so if either one is empty the
      # other is not, and they are considered totally different.
      return 0.0 if a.empty? || b.empty?

      # This is hacky, but useful: when either string is too short, both are
      # repeated so the compressor has something to work with.
      if a.length < SHORT_STRING_LENGTH || b.length < SHORT_STRING_LENGTH
        a *= SHORT_STRING_REPEAT
        b *= SHORT_STRING_REPEAT
      end

      separate = compressed_size(a) + compressed_size(b)
      combined = compressed_size(a + b)

      # The -1 is there because totally different inputs give a ratio of about
      # 1.00 and totally identical ones a ratio of about 2.00.
      (separate.to_f / combined.to_f) - 1
    end

    # Same as +distance+, but both values are normalized first: converted to a
    # string, downcased, and every non-alphanumeric character replaced by a
    # space. The +to_s+ conversion keeps this method consistent with
    # +distance+, which also accepts nil and non-String values.
    def self.case_insensitive_distance(a, b)
      distance(normalize(a), normalize(b))
    end

    # Returns a distance matrix as well as the matches at or above a supplied
    # threshold.
    #
    # arr              - indexable collection of items to compare pairwise
    # cutoff           - minimum score for a pair to be reported as similar
    # case_insensitive - when truthy, scores with case_insensitive_distance
    # only_hits        - when truthy (the default) no matrix is built and
    #                    :matrix is an empty array
    #
    # Returns a Hash with two keys:
    #   :matrix        - len x len array; only cells with row < column are
    #                    filled, the rest stay nil
    #   :similar_terms - array of {:first_term=>[index, item],
    #                    :second_term=>[index, item], :score=>Float}
    #
    # Caveat: every pair is compared once, so the cost grows quadratically in
    # time, and the matrix needs n^2 cells in memory. With n = 100,000 that is
    # roughly five billion calculations. The matrix can be used later for
    # clustering.
    #
    #                      "hello", "hello", "world", "hello world", "hello cruel world"
    # "hello"             [nil, 1.0, 0.125, 0.41176470588235303, 0.28],
    # "hello"             [nil, nil, 0.125, 0.41176470588235303, 0.28],
    # "world"             [nil, nil, nil,   0.41176470588235303, 0.28],
    # "hello world"       [nil, nil, nil,   nil,                 0.52],
    # "hello cruel world" [nil, nil, nil,   nil,                 nil]
    def self.distance_matrix(arr, cutoff = 0.5, case_insensitive = false, only_hits = true)
      len = arr.length
      sims = []
      mat = only_hits ? [] : Array.new(len) { Array.new(len) }

      # Exclusive ranges: visit each unordered pair (i, j) with i < j once and
      # never index past the end of arr.
      (0...len).each do |i|
        ((i + 1)...len).each do |j|
          dist = if case_insensitive
                   case_insensitive_distance(arr[i], arr[j])
                 else
                   distance(arr[i], arr[j])
                 end

          if dist >= cutoff
            sims.push({ :first_term => [i, arr[i]], :second_term => [j, arr[j]], :score => dist })
          end
          mat[i][j] = dist unless only_hits
        end
      end

      { :matrix => mat, :similar_terms => sims }
    end

    # Length of the zlib-compressed string minus the constant overhead.
    def self.compressed_size(str)
      Zlib.deflate(str).length - MINIMAL_ZIP_LENGTH
    end
    private_class_method :compressed_size

    # Converts a value to a lower-case string with non-alphanumerics blanked.
    def self.normalize(value)
      value.to_s.downcase.gsub(NON_ALNUM, ' ')
    end
    private_class_method :normalize
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'simple_similarity/version'
|
5
|
+
|
6
|
+
# Packaging metadata for the simple_similarity gem.
Gem::Specification.new do |gem|
  gem.name          = "simple_similarity"
  gem.version       = SimpleSimilarity::VERSION
  gem.authors       = ["Ori Pekelman"]
  gem.email         = ["ori@pekelman.com"]
  gem.description   = %q{A simple (slow but beautiful) algorithm to find similar things}
  gem.summary       = %q{A simple (slow but beautiful) algorithm to find similar things}
  gem.homepage      = "https://github.com/OriPekelman/simple_similarity"
  # The gem ships an MIT LICENSE.txt; declare it so the built metadata no
  # longer reports an empty license list.
  gem.license       = "MIT"

  # The file list comes from git, so the gem has to be built from a git
  # checkout. $/ is the input record separator (a newline by default).
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only dependencies.
  gem.add_development_dependency "rspec", "~> 2.6"
  gem.add_development_dependency "pry"
  gem.add_development_dependency "awesome_print"
  gem.add_development_dependency "colorize"
end
|
data/spec/benchmark.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'simple_similarity'
|
2
|
+
require "benchmark"
|
3
|
+
|
4
|
+
# Alphabet used to build the random words; "e" and "a" are listed extra times
# so they are drawn more often than the other letters.
CHARS = (?a..?z).to_a + ["e","e","e","e","e","a","a","a","a","a","a","a","a","a","a","a","a"]

# 1000 random five-letter words.
words = Array.new(1000) do
  Array.new(5) { CHARS[rand(CHARS.size)] }.join
end

# Time the pairwise comparison twice: first collecting only the hits, then
# also building the full matrix. Each run prints the similar terms, a label
# and the measured time.
[[true, "Without matrix"], [false, "With matrix"]].each do |only_hits, label|
  elapsed = Benchmark.measure do
    puts SimpleSimilarity::SimpleSimilarity.distance_matrix(words, 0.5, false, only_hits)[:similar_terms]
  end
  puts label
  puts elapsed
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'simple_similarity'
|
2
|
+
|
3
|
+
# Behavioural examples for the similarity scoring and the distance matrix.
#
# NOTE: %w( ... ) splits on whitespace only, so the commas in the word lists
# below are part of the tokens ("hello," rather than "hello"), and a phrase
# such as "hello world," becomes two separate tokens. The expected counts were
# written against that tokenization, so the lists are left untouched.
describe SimpleSimilarity::SimpleSimilarity do
  it "Hello is identical to Hello" do
    described_class.distance("Hello","Hello").should eql(1.0)
  end

  it "Hello is not similar to world" do
    described_class.distance("Hello","World").should eql(0.125)
  end

  it "Hello is somewhat similar hello" do
    described_class.distance("Hello","hello").should eql(0.5)
  end

  it "Hello is identical to hello when case insensitive" do
    described_class.case_insensitive_distance("Hello","hello").should eql(1.0)
  end

  it "from : hello, Hello, hell, world, cruel, crud with a cutoff of 0.5 we should have two couples" do
    described_class.distance_matrix(%w( hello, Hello, hell, world, cruel, crud), 0.5)[:similar_terms].length.should eql(2)
  end

  it "from : hello, Hello, hell, world, cruel, crud with a cutoff of 0.5 we should have three couples case insensitive" do
    described_class.distance_matrix(%w( hello, Hello, hell, world, cruel, crud), 0.5, true)[:similar_terms].length.should eql(3)
  end

  # Nine tokens after splitting: the four "world," tokens give six identical
  # pairs and "hello"/"Hello" gives the seventh once case is ignored.
  it "from : hello world, Hello world, hell world, world, cruel, crude with a cutoff of 1.0 we should have seven couples case insensitive" do
    described_class.distance_matrix(%w( hello world, Hello world, hell world, world, cruel, crude), 1.0, true)[:similar_terms].length.should eql(7)
  end

  it "setting only_hits to false gives back a matrix" do
    described_class.distance_matrix(%w( hello, world), 1.0, true, false)[:matrix].length.should eql(2)
  end

  # only_hits defaults to true, so no matrix is built.
  it "by default gives an empty matrix" do
    described_class.distance_matrix(%w( hello, world))[:matrix].length.should eql(0)
  end
end
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simple_similarity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ori Pekelman
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.6'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.6'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: pry
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: awesome_print
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: colorize
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A simple (slow but beautiful) algorithm to find similar things
|
79
|
+
email:
|
80
|
+
- ori@pekelman.com
|
81
|
+
executables: []
|
82
|
+
extensions: []
|
83
|
+
extra_rdoc_files: []
|
84
|
+
files:
|
85
|
+
- .gitignore
|
86
|
+
- Gemfile
|
87
|
+
- LICENSE.txt
|
88
|
+
- README.md
|
89
|
+
- Rakefile
|
90
|
+
- lib/simple_similarity.rb
|
91
|
+
- lib/simple_similarity/version.rb
|
92
|
+
- simple_similarity.gemspec
|
93
|
+
- spec/benchmark.rb
|
94
|
+
- spec/simple_similarity_spec.rb
|
95
|
+
homepage: https://github.com/OriPekelman/simple_similarity
|
96
|
+
licenses: []
|
97
|
+
post_install_message:
|
98
|
+
rdoc_options: []
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ! '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - ! '>='
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
requirements: []
|
114
|
+
rubyforge_project:
|
115
|
+
rubygems_version: 1.8.24
|
116
|
+
signing_key:
|
117
|
+
specification_version: 3
|
118
|
+
summary: A simple (slow but beautiful) algorithm to find similar things
|
119
|
+
test_files:
|
120
|
+
- spec/benchmark.rb
|
121
|
+
- spec/simple_similarity_spec.rb
|
122
|
+
has_rdoc:
|