string-similarity 1.1.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +7 -3
- data/CHANGELOG.md +7 -1
- data/README.md +18 -5
- data/lib/string/similarity.rb +107 -127
- data/lib/string/similarity/version.rb +1 -1
- data/lib/string/similarity_refinements.rb +22 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 308c3664b419f777c0492b103cb9901e108455c0
|
4
|
+
data.tar.gz: 5e2af01712dc0a08c37b8dd4cbc1f60a5883cb38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a739214fa67e112e179e9b744e9e8afa4c728d963a0d9ef70bbe3cbbe8abdc8485eef391670963ed050ed723e849ce59bddd672c543c8b12a8c330544800f09e
|
7
|
+
data.tar.gz: f6c7b317034c2b9c324cdbda88e33fac62e592663676735168017ef3c8ab23f247fe21a4e3289f202271bcbcb639f50c917cddcd47a26cee7e4ec68177235596
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
**2.0.0** (2016-02-19)
|
2
|
+
|
3
|
+
* removed: core extensions on `String`
|
4
|
+
* added: refinements for `String` (see README!)
|
5
|
+
|
6
|
+
|
1
7
|
**1.1.1** (2016-02-19)
|
2
8
|
|
3
|
-
* added: `require 'string-similarity'` now works
|
9
|
+
* added: `require 'string-similarity'` now works as well.
|
data/README.md
CHANGED
@@ -43,26 +43,39 @@ String::Similarity.cosine 'mine', 'thyne'
|
|
43
43
|
String::Similarity.cosine 'foo', 'foo'
|
44
44
|
# => 1.0
|
45
45
|
|
46
|
-
# or call on a string directly
|
47
|
-
'string'.cosine_similarity_to 'strong'
|
48
|
-
# => 0.8333333333333335
|
49
|
-
|
50
46
|
|
51
47
|
# Same for Levenshtein:
|
52
48
|
String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
|
53
|
-
'kitten'.levenshtein_distance_to('sitting')
|
54
49
|
# => 3
|
55
50
|
String::Similarity.levenshtein('foo', 'far') # or ...
|
51
|
+
# => 0.5
|
52
|
+
```
|
53
|
+
|
54
|
+
If you want, you can use [Refinements](http://ruby-doc.org/core-2.3.0/doc/syntax/refinements_rdoc.html) to add the functionality to the `String` class:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
using String::SimilarityRefinements
|
58
|
+
|
59
|
+
'string'.cosine_similarity_to 'strong'
|
60
|
+
# => 0.8333333333333335
|
61
|
+
|
62
|
+
'kitten'.levenshtein_distance_to('sitting')
|
63
|
+
# => 3
|
64
|
+
|
56
65
|
'far'.levenshtein_similarity_to('foo')
|
57
66
|
# => 0.5
|
58
67
|
```
|
59
68
|
|
69
|
+
(See this free [Ruby Tapas Episode](http://www.rubytapas.com/episodes/250-Refinements) if you don't know Refinements)
|
70
|
+
|
60
71
|
## Development
|
61
72
|
|
62
73
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
63
74
|
|
64
75
|
To install this gem onto your local machine, run `bundle exec rake install`.
|
65
76
|
|
77
|
+
This project uses [Semantic Versioning](http://semver.org/).
|
78
|
+
|
66
79
|
## Contributing
|
67
80
|
|
68
81
|
1. Fork it ( https://github.com/mhutter/string-similarity/fork )
|
data/lib/string/similarity.rb
CHANGED
@@ -1,142 +1,122 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
#
|
12
|
-
#
|
13
|
-
|
14
|
-
|
2
|
+
require 'string/similarity_refinements'
|
3
|
+
|
4
|
+
# +String::Similarity+ provides various methods for
|
5
|
+
# calculating string distances.
|
6
|
+
module String::Similarity
|
7
|
+
# Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
8
|
+
# Cosine similarity} of two strings.
|
9
|
+
#
|
10
|
+
# For an explanation of the Cosine similarity of two strings read
|
11
|
+
# {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
|
12
|
+
#
|
13
|
+
# @param str1 [String] first string
|
14
|
+
# @param str2 [String] second string
|
15
|
+
# @return [Float] cosine similarity of the two arguments.
|
16
|
+
# - +1.0+ if the strings are identical
|
17
|
+
# - +0.0+ if the strings are completely different
|
18
|
+
# - +0.0+ if one of the strings is empty
|
19
|
+
def self.cosine(str1, str2)
|
20
|
+
return 1.0 if str1 == str2
|
21
|
+
return 0.0 if str1.empty? || str2.empty?
|
22
|
+
|
23
|
+
# convert both texts to vectors
|
24
|
+
v1 = vector(str1)
|
25
|
+
v2 = vector(str2)
|
26
|
+
|
27
|
+
# calculate the dot product
|
28
|
+
dot_product = dot(v1, v2)
|
29
|
+
|
30
|
+
# calculate the magnitude
|
31
|
+
magnitude = mag(v1.values) * mag(v2.values)
|
32
|
+
dot_product / magnitude
|
15
33
|
end
|
16
34
|
|
17
|
-
#
|
18
|
-
#
|
19
|
-
|
20
|
-
|
35
|
+
# Calculate the Levenshtein similarity for two strings.
|
36
|
+
#
|
37
|
+
# This is basically the inversion of the levenshtein_distance, i.e.
|
38
|
+
# 1 / levenshtein_distance(str1, str2)
|
39
|
+
#
|
40
|
+
# @param str1 [String] first string
|
41
|
+
# @param str2 [String] second string
|
42
|
+
# @return [Float] levenshtein similarity of the two arguments.
|
43
|
+
# - +1.0+ if the strings are identical
|
44
|
+
# - +0.0+ if one of the strings is empty
|
45
|
+
# @see #levenshtein_distance
|
46
|
+
def self.levenshtein(str1, str2)
|
47
|
+
return 1.0 if str1.eql?(str2)
|
48
|
+
return 0.0 if str1.empty? || str2.empty?
|
49
|
+
1.0 / levenshtein_distance(str1, str2)
|
21
50
|
end
|
22
51
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
#
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
dot_product / magnitude
|
52
|
-
end
|
53
|
-
|
54
|
-
# Calculate the Levenshtein similarity for two strings.
|
55
|
-
#
|
56
|
-
# This is basically the inversion of the levenshtein_distance, i.e.
|
57
|
-
# 1 / levenshtein_distance(str1, str2)
|
58
|
-
#
|
59
|
-
# @param str1 [String] first string
|
60
|
-
# @param str2 [String] second string
|
61
|
-
# @return [Float] levenshtein similarity of the two arguments.
|
62
|
-
# - +1.0+ if the strings are identical
|
63
|
-
# - +0.0+ if one of the strings is empty
|
64
|
-
# @see #levenshtein_distance
|
65
|
-
def self.levenshtein(str1, str2)
|
66
|
-
return 1.0 if str1.eql?(str2)
|
67
|
-
return 0.0 if str1.empty? || str2.empty?
|
68
|
-
1.0 / levenshtein_distance(str1, str2)
|
69
|
-
end
|
70
|
-
|
71
|
-
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
72
|
-
# Levenshtein distance} of two strings.
|
73
|
-
#
|
74
|
-
# @param str1 [String] first string
|
75
|
-
# @param str2 [String] second string
|
76
|
-
# @return [Fixnum] edit distance between the two strings
|
77
|
-
# - +0+ if the strings are identical
|
78
|
-
def self.levenshtein_distance(str1, str2)
|
79
|
-
# base cases
|
80
|
-
result = base_case?(str1, str2)
|
81
|
-
return result if result
|
82
|
-
|
83
|
-
# Initialize cost-matrix rows
|
84
|
-
previous = (0..str2.length).to_a
|
85
|
-
current = []
|
86
|
-
|
87
|
-
(0...str1.length).each do |i|
|
88
|
-
# first element is always the edit distance from an empty string.
|
89
|
-
current[0] = i + 1
|
90
|
-
(0...str2.length).each do |j|
|
91
|
-
current[j + 1] = [
|
92
|
-
# insertion
|
93
|
-
current[j] + 1,
|
94
|
-
# deletion
|
95
|
-
previous[j + 1] + 1,
|
96
|
-
# substitution or no operation
|
97
|
-
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
98
|
-
].min
|
99
|
-
end
|
100
|
-
previous = current.dup
|
52
|
+
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
53
|
+
# Levenshtein distance} of two strings.
|
54
|
+
#
|
55
|
+
# @param str1 [String] first string
|
56
|
+
# @param str2 [String] second string
|
57
|
+
# @return [Fixnum] edit distance between the two strings
|
58
|
+
# - +0+ if the strings are identical
|
59
|
+
def self.levenshtein_distance(str1, str2)
|
60
|
+
# base cases
|
61
|
+
result = base_case?(str1, str2)
|
62
|
+
return result if result
|
63
|
+
|
64
|
+
# Initialize cost-matrix rows
|
65
|
+
previous = (0..str2.length).to_a
|
66
|
+
current = []
|
67
|
+
|
68
|
+
(0...str1.length).each do |i|
|
69
|
+
# first element is always the edit distance from an empty string.
|
70
|
+
current[0] = i + 1
|
71
|
+
(0...str2.length).each do |j|
|
72
|
+
current[j + 1] = [
|
73
|
+
# insertion
|
74
|
+
current[j] + 1,
|
75
|
+
# deletion
|
76
|
+
previous[j + 1] + 1,
|
77
|
+
# substitution or no operation
|
78
|
+
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
79
|
+
].min
|
101
80
|
end
|
102
|
-
|
103
|
-
current[str2.length]
|
81
|
+
previous = current.dup
|
104
82
|
end
|
105
83
|
|
106
|
-
|
84
|
+
current[str2.length]
|
85
|
+
end
|
107
86
|
|
108
|
-
|
109
|
-
return 0 if str1.eql?(str2)
|
110
|
-
return str2.length if str1.empty?
|
111
|
-
return str1.length if str2.empty?
|
112
|
-
false
|
113
|
-
end
|
87
|
+
private
|
114
88
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
v = Hash.new(0)
|
122
|
-
str.each_char { |c| v[c] += 1 }
|
123
|
-
v
|
124
|
-
end
|
89
|
+
def self.base_case?(str1, str2)
|
90
|
+
return 0 if str1.eql?(str2)
|
91
|
+
return str2.length if str1.empty?
|
92
|
+
return str1.length if str2.empty?
|
93
|
+
false
|
94
|
+
end
|
125
95
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
96
|
+
# create a vector from +str+
|
97
|
+
#
|
98
|
+
# @example
|
99
|
+
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
100
|
+
# v1["x"] # => 0
|
101
|
+
def self.vector(str)
|
102
|
+
v = Hash.new(0)
|
103
|
+
str.each_char { |c| v[c] += 1 }
|
104
|
+
v
|
105
|
+
end
|
134
106
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
107
|
+
# calculate the dot product of +vector1+ and +vector2+
|
108
|
+
def self.dot(vector1, vector2)
|
109
|
+
product = 0
|
110
|
+
vector1.each do |k, v|
|
111
|
+
product += v * vector2[k]
|
140
112
|
end
|
113
|
+
product
|
114
|
+
end
|
115
|
+
|
116
|
+
# calculate the magnitude for +vector+
|
117
|
+
def self.mag(vector)
|
118
|
+
# calculate the sum of squares
|
119
|
+
sq = vector.inject(0) { |a, e| a + e**2 }
|
120
|
+
Math.sqrt(sq)
|
141
121
|
end
|
142
122
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# provide refinements for the String class
|
2
|
+
module String::SimilarityRefinements
|
3
|
+
refine String do
|
4
|
+
# Returns the cosine similarity to +other+
|
5
|
+
# @see String::Similarity.cosine
|
6
|
+
def cosine_similarity_to(other)
|
7
|
+
String::Similarity.cosine(self, other)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns the Levenshtein distance to +other+
|
11
|
+
# @see String::Similarity.levenshtein_distance
|
12
|
+
def levenshtein_distance_to(other)
|
13
|
+
String::Similarity.levenshtein_distance(self, other)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns the Levenshtein similarity to +other+
|
17
|
+
# @see String::Similarity.levenshtein
|
18
|
+
def levenshtein_similarity_to(other)
|
19
|
+
String::Similarity.levenshtein(self, other)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
@@ -100,6 +100,7 @@ files:
|
|
100
100
|
- lib/string-similarity.rb
|
101
101
|
- lib/string/similarity.rb
|
102
102
|
- lib/string/similarity/version.rb
|
103
|
+
- lib/string/similarity_refinements.rb
|
103
104
|
- string-similarity.gemspec
|
104
105
|
homepage: https://github.com/mhutter/string-similarity
|
105
106
|
licenses:
|