string-similarity 1.1.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +7 -3
- data/CHANGELOG.md +7 -1
- data/README.md +18 -5
- data/lib/string/similarity.rb +107 -127
- data/lib/string/similarity/version.rb +1 -1
- data/lib/string/similarity_refinements.rb +22 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 308c3664b419f777c0492b103cb9901e108455c0
|
4
|
+
data.tar.gz: 5e2af01712dc0a08c37b8dd4cbc1f60a5883cb38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a739214fa67e112e179e9b744e9e8afa4c728d963a0d9ef70bbe3cbbe8abdc8485eef391670963ed050ed723e849ce59bddd672c543c8b12a8c330544800f09e
|
7
|
+
data.tar.gz: f6c7b317034c2b9c324cdbda88e33fac62e592663676735168017ef3c8ab23f247fe21a4e3289f202271bcbcb639f50c917cddcd47a26cee7e4ec68177235596
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
**2.0.0** (2016-02-19)
|
2
|
+
|
3
|
+
* removed: core extensions on `String`
|
4
|
+
* added: refinements for `String` (see README!)
|
5
|
+
|
6
|
+
|
1
7
|
**1.1.1** (2016-02-19)
|
2
8
|
|
3
|
-
* added: `require 'string-similarity'` now works
|
9
|
+
* added: `require 'string-similarity'` now works as well.
|
data/README.md
CHANGED
@@ -43,26 +43,39 @@ String::Similarity.cosine 'mine', 'thyne'
|
|
43
43
|
String::Similarity.cosine 'foo', 'foo'
|
44
44
|
# => 1.0
|
45
45
|
|
46
|
-
# or call on a string directly
|
47
|
-
'string'.cosine_similarity_to 'strong'
|
48
|
-
# => 0.8333333333333335
|
49
|
-
|
50
46
|
|
51
47
|
# Same for Levenshtein:
|
52
48
|
String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
|
53
|
-
'kitten'.levenshtein_distance_to('sitting')
|
54
49
|
# => 3
|
55
50
|
String::Similarity.levenshtein('foo', 'far') # or ...
|
51
|
+
# => 0.5
|
52
|
+
```
|
53
|
+
|
54
|
+
If you want, you can use [Refinements](http://ruby-doc.org/core-2.3.0/doc/syntax/refinements_rdoc.html) to add the functionality to the `String` class:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
using String::SimilarityRefinements
|
58
|
+
|
59
|
+
'string'.cosine_similarity_to 'strong'
|
60
|
+
# => 0.8333333333333335
|
61
|
+
|
62
|
+
'kitten'.levenshtein_distance_to('sitting')
|
63
|
+
# => 3
|
64
|
+
|
56
65
|
'far'.levenshtein_similarity_to('foo')
|
57
66
|
# => 0.5
|
58
67
|
```
|
59
68
|
|
69
|
+
(See this free [Ruby Tapas Episode](http://www.rubytapas.com/episodes/250-Refinements) if you are not familiar with Refinements.)
|
70
|
+
|
60
71
|
## Development
|
61
72
|
|
62
73
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
63
74
|
|
64
75
|
To install this gem onto your local machine, run `bundle exec rake install`.
|
65
76
|
|
77
|
+
This project uses [Semantic Versioning](http://semver.org/).
|
78
|
+
|
66
79
|
## Contributing
|
67
80
|
|
68
81
|
1. Fork it ( https://github.com/mhutter/string-similarity/fork )
|
data/lib/string/similarity.rb
CHANGED
@@ -1,142 +1,122 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
#
|
12
|
-
#
|
13
|
-
|
14
|
-
|
2
|
+
require 'string/similarity_refinements'
|
3
|
+
|
4
|
+
# +String::Similarity+ provides various methods for
|
5
|
+
# calculating string distances.
|
6
|
+
module String::Similarity
|
7
|
+
# Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
8
|
+
# Cosine similarity} of two strings.
|
9
|
+
#
|
10
|
+
# For an explanation of the Cosine similarity of two strings read
|
11
|
+
# {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
|
12
|
+
#
|
13
|
+
# @param str1 [String] first string
|
14
|
+
# @param str2 [String] second string
|
15
|
+
# @return [Float] cosine similarity of the two arguments.
|
16
|
+
# - +1.0+ if the strings are identical
|
17
|
+
# - +0.0+ if the strings are completely different
|
18
|
+
# - +0.0+ if one of the strings is empty
|
19
|
+
def self.cosine(str1, str2)
|
20
|
+
return 1.0 if str1 == str2
|
21
|
+
return 0.0 if str1.empty? || str2.empty?
|
22
|
+
|
23
|
+
# convert both texts to vectors
|
24
|
+
v1 = vector(str1)
|
25
|
+
v2 = vector(str2)
|
26
|
+
|
27
|
+
# calculate the dot product
|
28
|
+
dot_product = dot(v1, v2)
|
29
|
+
|
30
|
+
# calculate the magnitude
|
31
|
+
magnitude = mag(v1.values) * mag(v2.values)
|
32
|
+
dot_product / magnitude
|
15
33
|
end
|
16
34
|
|
17
|
-
#
|
18
|
-
#
|
19
|
-
|
20
|
-
|
35
|
+
# Calculate the Levenshtein similarity for two strings.
|
36
|
+
#
|
37
|
+
# This is basically the reciprocal of the levenshtein_distance, i.e.
|
38
|
+
# 1 / levenshtein_distance(str1, str2)
|
39
|
+
#
|
40
|
+
# @param str1 [String] first string
|
41
|
+
# @param str2 [String] second string
|
42
|
+
# @return [Float] levenshtein similarity of the two arguments.
|
43
|
+
# - +1.0+ if the strings are identical
|
44
|
+
# - +0.0+ if one of the strings is empty
|
45
|
+
# @see #levenshtein_distance
|
46
|
+
def self.levenshtein(str1, str2)
|
47
|
+
return 1.0 if str1.eql?(str2)
|
48
|
+
return 0.0 if str1.empty? || str2.empty?
|
49
|
+
1.0 / levenshtein_distance(str1, str2)
|
21
50
|
end
|
22
51
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
#
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
dot_product / magnitude
|
52
|
-
end
|
53
|
-
|
54
|
-
# Calculate the Levenshtein similarity for two strings.
|
55
|
-
#
|
56
|
-
# This is basically the inversion of the levenshtein_distance, i.e.
|
57
|
-
# 1 / levenshtein_distance(str1, str2)
|
58
|
-
#
|
59
|
-
# @param str1 [String] first string
|
60
|
-
# @param str2 [String] second string
|
61
|
-
# @return [Float] levenshtein similarity of the two arguments.
|
62
|
-
# - +1.0+ if the strings are identical
|
63
|
-
# - +0.0+ if one of the strings is empty
|
64
|
-
# @see #levenshtein_distance
|
65
|
-
def self.levenshtein(str1, str2)
|
66
|
-
return 1.0 if str1.eql?(str2)
|
67
|
-
return 0.0 if str1.empty? || str2.empty?
|
68
|
-
1.0 / levenshtein_distance(str1, str2)
|
69
|
-
end
|
70
|
-
|
71
|
-
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
72
|
-
# Levenshtein distance} of two strings.
|
73
|
-
#
|
74
|
-
# @param str1 [String] first string
|
75
|
-
# @param str2 [String] second string
|
76
|
-
# @return [Fixnum] edit distance between the two strings
|
77
|
-
# - +0+ if the strings are identical
|
78
|
-
def self.levenshtein_distance(str1, str2)
|
79
|
-
# base cases
|
80
|
-
result = base_case?(str1, str2)
|
81
|
-
return result if result
|
82
|
-
|
83
|
-
# Initialize cost-matrix rows
|
84
|
-
previous = (0..str2.length).to_a
|
85
|
-
current = []
|
86
|
-
|
87
|
-
(0...str1.length).each do |i|
|
88
|
-
# first element is always the edit distance from an empty string.
|
89
|
-
current[0] = i + 1
|
90
|
-
(0...str2.length).each do |j|
|
91
|
-
current[j + 1] = [
|
92
|
-
# insertion
|
93
|
-
current[j] + 1,
|
94
|
-
# deletion
|
95
|
-
previous[j + 1] + 1,
|
96
|
-
# substitution or no operation
|
97
|
-
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
98
|
-
].min
|
99
|
-
end
|
100
|
-
previous = current.dup
|
52
|
+
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
53
|
+
# Levenshtein distance} of two strings.
|
54
|
+
#
|
55
|
+
# @param str1 [String] first string
|
56
|
+
# @param str2 [String] second string
|
57
|
+
# @return [Fixnum] edit distance between the two strings
|
58
|
+
# - +0+ if the strings are identical
|
59
|
+
def self.levenshtein_distance(str1, str2)
|
60
|
+
# base cases
|
61
|
+
result = base_case?(str1, str2)
|
62
|
+
return result if result
|
63
|
+
|
64
|
+
# Initialize cost-matrix rows
|
65
|
+
previous = (0..str2.length).to_a
|
66
|
+
current = []
|
67
|
+
|
68
|
+
(0...str1.length).each do |i|
|
69
|
+
# first element is always the edit distance from an empty string.
|
70
|
+
current[0] = i + 1
|
71
|
+
(0...str2.length).each do |j|
|
72
|
+
current[j + 1] = [
|
73
|
+
# insertion
|
74
|
+
current[j] + 1,
|
75
|
+
# deletion
|
76
|
+
previous[j + 1] + 1,
|
77
|
+
# substitution or no operation
|
78
|
+
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
79
|
+
].min
|
101
80
|
end
|
102
|
-
|
103
|
-
current[str2.length]
|
81
|
+
previous = current.dup
|
104
82
|
end
|
105
83
|
|
106
|
-
|
84
|
+
current[str2.length]
|
85
|
+
end
|
107
86
|
|
108
|
-
|
109
|
-
return 0 if str1.eql?(str2)
|
110
|
-
return str2.length if str1.empty?
|
111
|
-
return str1.length if str2.empty?
|
112
|
-
false
|
113
|
-
end
|
87
|
+
private
|
114
88
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
v = Hash.new(0)
|
122
|
-
str.each_char { |c| v[c] += 1 }
|
123
|
-
v
|
124
|
-
end
|
89
|
+
def self.base_case?(str1, str2)
|
90
|
+
return 0 if str1.eql?(str2)
|
91
|
+
return str2.length if str1.empty?
|
92
|
+
return str1.length if str2.empty?
|
93
|
+
false
|
94
|
+
end
|
125
95
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
96
|
+
# create a vector from +str+
|
97
|
+
#
|
98
|
+
# @example
|
99
|
+
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
100
|
+
# v1["x"] # => 0
|
101
|
+
def self.vector(str)
|
102
|
+
v = Hash.new(0)
|
103
|
+
str.each_char { |c| v[c] += 1 }
|
104
|
+
v
|
105
|
+
end
|
134
106
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
107
|
+
# calculate the dot product of +vector1+ and +vector2+
|
108
|
+
def self.dot(vector1, vector2)
|
109
|
+
product = 0
|
110
|
+
vector1.each do |k, v|
|
111
|
+
product += v * vector2[k]
|
140
112
|
end
|
113
|
+
product
|
114
|
+
end
|
115
|
+
|
116
|
+
# calculate the magnitude for +vector+
|
117
|
+
def self.mag(vector)
|
118
|
+
# calculate the sum of squares
|
119
|
+
sq = vector.inject(0) { |a, e| a + e**2 }
|
120
|
+
Math.sqrt(sq)
|
141
121
|
end
|
142
122
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# provide refinements for the String class
|
2
|
+
module String::SimilarityRefinements
|
3
|
+
refine String do
|
4
|
+
# Returns the cosine similarity to +other+
|
5
|
+
# @see String::Similarity.cosine
|
6
|
+
def cosine_similarity_to(other)
|
7
|
+
String::Similarity.cosine(self, other)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns the Levenshtein distance to +other+
|
11
|
+
# @see String::Similarity.levenshtein_distance
|
12
|
+
def levenshtein_distance_to(other)
|
13
|
+
String::Similarity.levenshtein_distance(self, other)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns the Levenshtein similarity to +other+
|
17
|
+
# @see String::Similarity.levenshtein
|
18
|
+
def levenshtein_similarity_to(other)
|
19
|
+
String::Similarity.levenshtein(self, other)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
@@ -100,6 +100,7 @@ files:
|
|
100
100
|
- lib/string-similarity.rb
|
101
101
|
- lib/string/similarity.rb
|
102
102
|
- lib/string/similarity/version.rb
|
103
|
+
- lib/string/similarity_refinements.rb
|
103
104
|
- string-similarity.gemspec
|
104
105
|
homepage: https://github.com/mhutter/string-similarity
|
105
106
|
licenses:
|