string-similarity 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/Gemfile +6 -1
- data/README.md +9 -0
- data/bin/setup +7 -4
- data/lib/string/similarity.rb +77 -10
- data/lib/string/similarity/version.rb +2 -1
- data/string-similarity.gemspec +7 -3
- metadata +9 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b89926f7398486de02d94f5f1d8742834a95d54f
|
4
|
+
data.tar.gz: 0493548d9494e4855b648551848da77fbef6945d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8af14a673396cf5b7b20df9e279621e4301fb5c8c52f14b1a8aae3b096d1ad5a480ce6e4cc64c0b5ba6bef1c0fe72f6b530f8a10aef21bfe1c7f932d00007e3a
|
7
|
+
data.tar.gz: a89455cf5807fb41749d4f4b86e4642e50cf1ff8620d278e8e3b43c2abf6cd3454e740a81337e1fa7d7d3c76704a17b271a1cd61558362d37e109ca01835be6b
|
data/.rspec
CHANGED
data/Gemfile
CHANGED
@@ -3,4 +3,9 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in string-similarity.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
|
6
|
+
group :test do
|
7
|
+
gem 'codeclimate-test-reporter', require: nil
|
8
|
+
gem 'guard'
|
9
|
+
gem 'guard-rspec'
|
10
|
+
gem 'growl'
|
11
|
+
end
|
data/README.md
CHANGED
@@ -46,6 +46,15 @@ String::Similarity.cosine 'foo', 'foo'
|
|
46
46
|
# or call on a string directly
|
47
47
|
'string'.cosine_similarity_to 'strong'
|
48
48
|
# => 0.8333333333333335
|
49
|
+
|
50
|
+
|
51
|
+
# Same for Levenshtein:
|
52
|
+
String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
|
53
|
+
'kitten'.levenshtein_distance_to('sitting')
|
54
|
+
# => 3
|
55
|
+
String::Similarity.levenshtein('foo', 'far') # or ...
|
56
|
+
'far'.levenshtein_similarity_to('foo')
|
57
|
+
# => 0.5
|
49
58
|
```
|
50
59
|
|
51
60
|
## Development
|
data/bin/setup
CHANGED
data/lib/string/similarity.rb
CHANGED
@@ -1,23 +1,36 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
2
|
|
3
3
|
class String
|
4
|
-
|
4
|
+
# Returns the cosine similarity to +other+
|
5
|
+
# @see String::Similarity.cosine
|
5
6
|
def cosine_similarity_to(other)
|
6
7
|
String::Similarity.cosine(self, other)
|
7
8
|
end
|
8
9
|
|
10
|
+
# Returns the Levenshtein distance to +other+
|
11
|
+
# @see String::Similarity.levenshtein_distance
|
12
|
+
def levenshtein_distance_to(other)
|
13
|
+
String::Similarity.levenshtein_distance(self, other)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns the Levenshtein similarity to +other+
|
17
|
+
# @see String::Similarity.levenshtein
|
18
|
+
def levenshtein_similarity_to(other)
|
19
|
+
String::Similarity.levenshtein(self, other)
|
20
|
+
end
|
21
|
+
|
9
22
|
# +String::Similarity+ provides various methods for
|
10
23
|
# calculating string distances.
|
11
24
|
module Similarity extend self
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
25
|
+
# Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
26
|
+
# Cosine similarity} of two strings.
|
27
|
+
#
|
28
|
+
# For an explanation of the Cosine similarity of two strings read
|
29
|
+
# {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
|
17
30
|
#
|
18
31
|
# @param str1 [String] first string
|
19
32
|
# @param str2 [String] second string
|
20
|
-
# @return [Float] cosine
|
33
|
+
# @return [Float] cosine similarity of the two arguments.
|
21
34
|
# - +1.0+ if the strings are identical
|
22
35
|
# - +0.0+ if the strings are completely different
|
23
36
|
# - +0.0+ if one of the strings is empty
|
@@ -36,12 +49,66 @@ class String
|
|
36
49
|
dot_product / magnitude
|
37
50
|
end
|
38
51
|
|
52
|
+
# Calculate the Levenshtein similarity for two strings.
|
53
|
+
#
|
54
|
+
# This is basically the reciprocal of the levenshtein_distance, i.e.
|
55
|
+
# 1 / levenshtein_distance(str1, str2)
|
56
|
+
#
|
57
|
+
# @param str1 [String] first string
|
58
|
+
# @param str2 [String] second string
|
59
|
+
# @return [Float] levenshtein similarity of the two arguments.
|
60
|
+
# - +1.0+ if the strings are identical
|
61
|
+
# - +0.0+ if one of the strings is empty
|
62
|
+
# @see #levenshtein_distance
|
63
|
+
def levenshtein(str1, str2)
|
64
|
+
return 1.0 if str1.eql?(str2)
|
65
|
+
return 0.0 if str1.empty? || str2.empty?
|
66
|
+
1.0 / levenshtein_distance(str1, str2)
|
67
|
+
end
|
68
|
+
|
69
|
+
# Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
|
70
|
+
# Levenshtein distance} of two strings.
|
71
|
+
#
|
72
|
+
# @param str1 [String] first string
|
73
|
+
# @param str2 [String] second string
|
74
|
+
# @return [Fixnum] edit distance between the two strings
|
75
|
+
# - +0+ if the strings are identical
|
76
|
+
def levenshtein_distance(str1, str2)
|
77
|
+
# base cases
|
78
|
+
return 0 if str1.eql?(str2)
|
79
|
+
return str2.length if str1.empty?
|
80
|
+
return str1.length if str2.empty?
|
81
|
+
|
82
|
+
# Initialize cost-matrix rows
|
83
|
+
previous = (0..str2.length).to_a
|
84
|
+
current = []
|
85
|
+
|
86
|
+
(0...str1.length).each do |i|
|
87
|
+
# first element is always the edit distance from an empty string.
|
88
|
+
current[0] = i + 1
|
89
|
+
(0...str2.length).each do |j|
|
90
|
+
current[j+1] = [
|
91
|
+
# insertion
|
92
|
+
current[j] + 1,
|
93
|
+
# deletion
|
94
|
+
previous[j+1] + 1,
|
95
|
+
# substitution or no operation
|
96
|
+
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
97
|
+
].min
|
98
|
+
end
|
99
|
+
previous = current.dup
|
100
|
+
end
|
101
|
+
|
102
|
+
current[str2.length]
|
103
|
+
end
|
104
|
+
|
39
105
|
private
|
40
106
|
|
41
107
|
# create a vector from +str+
|
42
108
|
#
|
43
109
|
# @example
|
44
|
-
# vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
110
|
+
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
111
|
+
# v1["x"] # => 0
|
45
112
|
def vector(str)
|
46
113
|
v = Hash.new(0)
|
47
114
|
str.each_char { |c| v[c] += 1 }
|
@@ -51,7 +118,7 @@ class String
|
|
51
118
|
# calculate the dot product of +vector1+ and +vector2+
|
52
119
|
def dot(vector1, vector2)
|
53
120
|
product = 0
|
54
|
-
vector1.each do |k,v|
|
121
|
+
vector1.each do |k, v|
|
55
122
|
product += v * vector2[k]
|
56
123
|
end
|
57
124
|
product
|
@@ -60,7 +127,7 @@ class String
|
|
60
127
|
# calculate the magnitude for +vector+
|
61
128
|
def mag(vector)
|
62
129
|
# calculate the sum of squares
|
63
|
-
sq = vector.inject(0) { |s,n| s + n**2 }
|
130
|
+
sq = vector.inject(0) { |s, n| s + n**2 }
|
64
131
|
Math.sqrt(sq)
|
65
132
|
end
|
66
133
|
end
|
data/string-similarity.gemspec
CHANGED
@@ -11,14 +11,18 @@ Gem::Specification.new do |spec|
|
|
11
11
|
|
12
12
|
spec.summary = %q{Various methods for calculating string similarities.}
|
13
13
|
spec.description = <<-EOT
|
14
|
+
== Description
|
15
|
+
|
14
16
|
This gem provides some methods for calculating similarities of two strings.
|
15
17
|
|
16
|
-
Currently implemented
|
18
|
+
=== Currently implemented
|
19
|
+
|
17
20
|
- Cosine similarity
|
21
|
+
- Levenshtein distance/similarity
|
22
|
+
|
23
|
+
=== Planned
|
18
24
|
|
19
|
-
Planned:
|
20
25
|
- Hamming similarity
|
21
|
-
- Levenshtein similarity
|
22
26
|
EOT
|
23
27
|
spec.homepage = 'https://github.com/mhutter/string-similarity'
|
24
28
|
spec.license = 'MIT'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,14 +67,18 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
description: |
|
70
|
+
== Description
|
71
|
+
|
70
72
|
This gem provides some methods for calculating similarities of two strings.
|
71
73
|
|
72
|
-
Currently implemented
|
74
|
+
=== Currently implemented
|
75
|
+
|
73
76
|
- Cosine similarity
|
77
|
+
- Levenshtein distance/similarity
|
78
|
+
|
79
|
+
=== Planned
|
74
80
|
|
75
|
-
Planned:
|
76
81
|
- Hamming similarity
|
77
|
-
- Levenshtein similarity
|
78
82
|
email:
|
79
83
|
- manuel@hutter.io
|
80
84
|
executables: []
|