string-similarity 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/Gemfile +6 -1
- data/README.md +9 -0
- data/bin/setup +7 -4
- data/lib/string/similarity.rb +77 -10
- data/lib/string/similarity/version.rb +2 -1
- data/string-similarity.gemspec +7 -3
- metadata +9 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b89926f7398486de02d94f5f1d8742834a95d54f
|
4
|
+
data.tar.gz: 0493548d9494e4855b648551848da77fbef6945d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8af14a673396cf5b7b20df9e279621e4301fb5c8c52f14b1a8aae3b096d1ad5a480ce6e4cc64c0b5ba6bef1c0fe72f6b530f8a10aef21bfe1c7f932d00007e3a
|
7
|
+
data.tar.gz: a89455cf5807fb41749d4f4b86e4642e50cf1ff8620d278e8e3b43c2abf6cd3454e740a81337e1fa7d7d3c76704a17b271a1cd61558362d37e109ca01835be6b
|
data/.rspec
CHANGED
data/Gemfile
CHANGED
@@ -3,4 +3,9 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in string-similarity.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
|
6
|
+
group :test do
|
7
|
+
gem 'codeclimate-test-reporter', require: nil
|
8
|
+
gem 'guard'
|
9
|
+
gem 'guard-rspec'
|
10
|
+
gem 'growl'
|
11
|
+
end
|
data/README.md
CHANGED
@@ -46,6 +46,15 @@ String::Similarity.cosine 'foo', 'foo'
|
|
46
46
|
# or call on a string directly
|
47
47
|
'string'.cosine_similarity_to 'strong'
|
48
48
|
# => 0.8333333333333335
|
49
|
+
|
50
|
+
|
51
|
+
# Same for Levenshtein:
|
52
|
+
String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
|
53
|
+
'kitten'.levenshtein_distance_to('sitting')
|
54
|
+
# => 3
|
55
|
+
String::Similarity.levenshtein('foo', 'far') # or ...
|
56
|
+
'far'.levenshtein_similarity_to('foo')
|
57
|
+
# => 0.5
|
49
58
|
```
|
50
59
|
|
51
60
|
## Development
|
data/bin/setup
CHANGED
data/lib/string/similarity.rb
CHANGED
@@ -1,23 +1,36 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
2
|
|
3
3
|
class String
|
4
|
-
|
4
|
+
# Returns the cosine similarity to +other+
|
5
|
+
# @see String::Similarity#cosine
|
5
6
|
def cosine_similarity_to(other)
|
6
7
|
String::Similarity.cosine(self, other)
|
7
8
|
end
|
8
9
|
|
10
|
+
# Returns the Levenshtein distance to +other+
|
11
|
+
# @see String::Similarity.levenshtein_distance
|
12
|
+
def levenshtein_distance_to(other)
|
13
|
+
String::Similarity.levenshtein_distance(self, other)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns the Levenshtein similarity to +other+
|
17
|
+
# @see String::Similarity.levenshtein
|
18
|
+
def levenshtein_similarity_to(other)
|
19
|
+
String::Similarity.levenshtein(self, other)
|
20
|
+
end
|
21
|
+
|
9
22
|
# +String::Similarity+ provides various methods for
|
10
23
|
# calculating string distances.
|
11
24
|
module Similarity extend self
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
25
|
+
# Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
26
|
+
# Cosine similarity} of two strings.
|
27
|
+
#
|
28
|
+
# For an explanation of the Cosine similarity of two strings read
|
29
|
+
# {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
|
17
30
|
#
|
18
31
|
# @param str1 [String] first string
|
19
32
|
# @param str2 [String] second string
|
20
|
-
# @return [Float] cosine
|
33
|
+
# @return [Float] cosine similarity of the two arguments.
|
21
34
|
# - +1.0+ if the strings are identical
|
22
35
|
# - +0.0+ if the strings are completely different
|
23
36
|
# - +0.0+ if one of the strings is empty
|
@@ -36,12 +49,66 @@ class String
|
|
36
49
|
dot_product / magnitude
|
37
50
|
end
|
38
51
|
|
52
|
+
# Calculate the Levenshtein similarity for two strings.
|
53
|
+
#
|
54
|
+
# This is basically the reciprocal of the levenshtein_distance, i.e.
|
55
|
+
# 1 / levenshtein_distance(str1, str2)
|
56
|
+
#
|
57
|
+
# @param str1 [String] first string
|
58
|
+
# @param str2 [String] second string
|
59
|
+
# @return [Float] levenshtein similarity of the two arguments.
|
60
|
+
# - +1.0+ if the strings are identical
|
61
|
+
# - +0.0+ if one of the strings is empty
|
62
|
+
# @see #levenshtein_distance
|
63
|
+
def levenshtein(str1, str2)
|
64
|
+
return 1.0 if str1.eql?(str2)
|
65
|
+
return 0.0 if str1.empty? || str2.empty?
|
66
|
+
1.0 / levenshtein_distance(str1, str2)
|
67
|
+
end
|
68
|
+
|
69
|
+
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
70
|
+
# Levenshtein distance} of two strings.
|
71
|
+
#
|
72
|
+
# @param str1 [String] first string
|
73
|
+
# @param str2 [String] second string
|
74
|
+
# @return [Fixnum] edit distance between the two strings
|
75
|
+
# - +0+ if the strings are identical
|
76
|
+
def levenshtein_distance(str1, str2)
|
77
|
+
# base cases
|
78
|
+
return 0 if str1.eql?(str2)
|
79
|
+
return str2.length if str1.empty?
|
80
|
+
return str1.length if str2.empty?
|
81
|
+
|
82
|
+
# Initialize cost-matrix rows
|
83
|
+
previous = (0..str2.length).to_a
|
84
|
+
current = []
|
85
|
+
|
86
|
+
(0...str1.length).each do |i|
|
87
|
+
# first element is always the edit distance from an empty string.
|
88
|
+
current[0] = i + 1
|
89
|
+
(0...str2.length).each do |j|
|
90
|
+
current[j+1] = [
|
91
|
+
# insertion
|
92
|
+
current[j] + 1,
|
93
|
+
# deletion
|
94
|
+
previous[j+1] + 1,
|
95
|
+
# substitution or no operation
|
96
|
+
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
97
|
+
].min
|
98
|
+
end
|
99
|
+
previous = current.dup
|
100
|
+
end
|
101
|
+
|
102
|
+
current[str2.length]
|
103
|
+
end
|
104
|
+
|
39
105
|
private
|
40
106
|
|
41
107
|
# create a vector from +str+
|
42
108
|
#
|
43
109
|
# @example
|
44
|
-
# vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
110
|
+
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
111
|
+
# v1["x"] # => 0
|
45
112
|
def vector(str)
|
46
113
|
v = Hash.new(0)
|
47
114
|
str.each_char { |c| v[c] += 1 }
|
@@ -51,7 +118,7 @@ class String
|
|
51
118
|
# calculate the dot product of +vector1+ and +vector2+
|
52
119
|
def dot(vector1, vector2)
|
53
120
|
product = 0
|
54
|
-
vector1.each do |k,v|
|
121
|
+
vector1.each do |k, v|
|
55
122
|
product += v * vector2[k]
|
56
123
|
end
|
57
124
|
product
|
@@ -60,7 +127,7 @@ class String
|
|
60
127
|
# calculate the magnitude for +vector+
|
61
128
|
def mag(vector)
|
62
129
|
# calculate the sum of squares
|
63
|
-
sq = vector.inject(0) { |s,n| s + n**2 }
|
130
|
+
sq = vector.inject(0) { |s, n| s + n**2 }
|
64
131
|
Math.sqrt(sq)
|
65
132
|
end
|
66
133
|
end
|
data/string-similarity.gemspec
CHANGED
@@ -11,14 +11,18 @@ Gem::Specification.new do |spec|
|
|
11
11
|
|
12
12
|
spec.summary = %q{Various methods for calculating string similarities.}
|
13
13
|
spec.description = <<-EOT
|
14
|
+
== Description
|
15
|
+
|
14
16
|
This gem provides some methods for calculating similarities of two strings.
|
15
17
|
|
16
|
-
Currently implemented
|
18
|
+
=== Currently implemented
|
19
|
+
|
17
20
|
- Cosine similarity
|
21
|
+
- Levenshtein distance/similarity
|
22
|
+
|
23
|
+
=== Planned
|
18
24
|
|
19
|
-
Planned:
|
20
25
|
- Hamming similarity
|
21
|
-
- Levenshtein similarity
|
22
26
|
EOT
|
23
27
|
spec.homepage = 'https://github.com/mhutter/string-similarity'
|
24
28
|
spec.license = 'MIT'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,14 +67,18 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
description: |
|
70
|
+
== Description
|
71
|
+
|
70
72
|
This gem provides some methods for calculating similarities of two strings.
|
71
73
|
|
72
|
-
Currently implemented
|
74
|
+
=== Currently implemented
|
75
|
+
|
73
76
|
- Cosine similarity
|
77
|
+
- Levenshtein distance/similarity
|
78
|
+
|
79
|
+
=== Planned
|
74
80
|
|
75
|
-
Planned:
|
76
81
|
- Hamming similarity
|
77
|
-
- Levenshtein similarity
|
78
82
|
email:
|
79
83
|
- manuel@hutter.io
|
80
84
|
executables: []
|