string-similarity 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +3 -0
- data/README.md +4 -4
- data/lib/string-similarity.rb +1 -0
- data/lib/string/similarity.rb +22 -14
- data/lib/string/similarity/version.rb +1 -1
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd97efdcd76434ae400e6382b55e54d44ce003d8
|
4
|
+
data.tar.gz: 1ca9a5eb0075b86d30afd03669226425161d510f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d904adf7b09fc53dadee2e47f3111f4bcce52d4cabfad7c1b7fee542d9af5c65298cbf7fa1be95d729dfb2672b21994c6f7e28ad0498a5f147311f7dc3e2418f
|
7
|
+
data.tar.gz: d849740e7fb49897439baf66f06f3cafd4677f01c6530bb212bb9b406c5c8c974716b3af562fb72c8bcffeaa1602a4d880bed35b5f5289381e01dd57200b313c
|
data/.rubocop.yml
ADDED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -10,9 +10,9 @@ Library for calculating the similarity of two strings.
|
|
10
10
|
|
11
11
|
## State
|
12
12
|
|
13
|
-
- Cosine
|
14
|
-
- Hamming
|
15
|
-
- Levenshtein
|
13
|
+
- [x] Cosine
|
14
|
+
- [ ] Hamming
|
15
|
+
- [x] Levenshtein
|
16
16
|
|
17
17
|
## Installation
|
18
18
|
|
@@ -33,7 +33,7 @@ Or install it yourself as:
|
|
33
33
|
## Usage
|
34
34
|
|
35
35
|
```ruby
|
36
|
-
require 'string
|
36
|
+
require 'string/similarity'
|
37
37
|
|
38
38
|
# Call the methods on the module
|
39
39
|
String::Similarity.cosine 'foo', 'bar'
|
data/lib/string-similarity.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# require 'string/similarity'
|
data/lib/string/similarity.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
2
|
|
3
|
+
# For convenience, String is extended by a couple of helper methods
|
3
4
|
class String
|
4
5
|
# Returns the cosine similarity to +other+
|
5
6
|
# @see String::Similarity#cosine
|
@@ -21,7 +22,7 @@ class String
|
|
21
22
|
|
22
23
|
# +String::Similarity+ provides various methods for
|
23
24
|
# calculating string distances.
|
24
|
-
module Similarity
|
25
|
+
module Similarity
|
25
26
|
# Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
26
27
|
# Cosine similarity} of two strings.
|
27
28
|
#
|
@@ -34,12 +35,13 @@ class String
|
|
34
35
|
# - +1.0+ if the strings are identical
|
35
36
|
# - +0.0+ if the strings are completely different
|
36
37
|
# - +0.0+ if one of the strings is empty
|
37
|
-
def cosine(str1, str2)
|
38
|
+
def self.cosine(str1, str2)
|
38
39
|
return 1.0 if str1 == str2
|
39
40
|
return 0.0 if str1.empty? || str2.empty?
|
40
41
|
|
41
42
|
# convert both texts to vectors
|
42
|
-
v1
|
43
|
+
v1 = vector(str1)
|
44
|
+
v2 = vector(str2)
|
43
45
|
|
44
46
|
# calculate the dot product
|
45
47
|
dot_product = dot(v1, v2)
|
@@ -60,7 +62,7 @@ class String
|
|
60
62
|
# - +1.0+ if the strings are identical
|
61
63
|
# - +0.0+ if one of the strings is empty
|
62
64
|
# @see #levenshtein_distance
|
63
|
-
def levenshtein(str1, str2)
|
65
|
+
def self.levenshtein(str1, str2)
|
64
66
|
return 1.0 if str1.eql?(str2)
|
65
67
|
return 0.0 if str1.empty? || str2.empty?
|
66
68
|
1.0 / levenshtein_distance(str1, str2)
|
@@ -73,11 +75,10 @@ class String
|
|
73
75
|
# @param str2 [String] second string
|
74
76
|
# @return [Fixnum] edit distance between the two strings
|
75
77
|
# - +0+ if the strings are identical
|
76
|
-
def levenshtein_distance(str1, str2)
|
78
|
+
def self.levenshtein_distance(str1, str2)
|
77
79
|
# base cases
|
78
|
-
|
79
|
-
return
|
80
|
-
return str1.length if str2.empty?
|
80
|
+
result = base_case?(str1, str2)
|
81
|
+
return result if result
|
81
82
|
|
82
83
|
# Initialize cost-matrix rows
|
83
84
|
previous = (0..str2.length).to_a
|
@@ -87,11 +88,11 @@ class String
|
|
87
88
|
# first element is always the edit distance from an empty string.
|
88
89
|
current[0] = i + 1
|
89
90
|
(0...str2.length).each do |j|
|
90
|
-
current[j+1] = [
|
91
|
+
current[j + 1] = [
|
91
92
|
# insertion
|
92
93
|
current[j] + 1,
|
93
94
|
# deletion
|
94
|
-
previous[j+1] + 1,
|
95
|
+
previous[j + 1] + 1,
|
95
96
|
# substitution or no operation
|
96
97
|
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
97
98
|
].min
|
@@ -104,19 +105,26 @@ class String
|
|
104
105
|
|
105
106
|
private
|
106
107
|
|
108
|
+
def self.base_case?(str1, str2)
|
109
|
+
return 0 if str1.eql?(str2)
|
110
|
+
return str2.length if str1.empty?
|
111
|
+
return str1.length if str2.empty?
|
112
|
+
false
|
113
|
+
end
|
114
|
+
|
107
115
|
# create a vector from +str+
|
108
116
|
#
|
109
117
|
# @example
|
110
118
|
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
111
119
|
# v1["x"] # => 0
|
112
|
-
def vector(str)
|
120
|
+
def self.vector(str)
|
113
121
|
v = Hash.new(0)
|
114
122
|
str.each_char { |c| v[c] += 1 }
|
115
123
|
v
|
116
124
|
end
|
117
125
|
|
118
126
|
# calculate the dot product of +vector1+ and +vector2+
|
119
|
-
def dot(vector1, vector2)
|
127
|
+
def self.dot(vector1, vector2)
|
120
128
|
product = 0
|
121
129
|
vector1.each do |k, v|
|
122
130
|
product += v * vector2[k]
|
@@ -125,9 +133,9 @@ class String
|
|
125
133
|
end
|
126
134
|
|
127
135
|
# calculate the magnitude for +vector+
|
128
|
-
def mag(vector)
|
136
|
+
def self.mag(vector)
|
129
137
|
# calculate the sum of squares
|
130
|
-
sq = vector.inject(0) { |
|
138
|
+
sq = vector.inject(0) { |a, e| a + e**2 }
|
131
139
|
Math.sqrt(sq)
|
132
140
|
end
|
133
141
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -87,7 +87,9 @@ extra_rdoc_files: []
|
|
87
87
|
files:
|
88
88
|
- ".gitignore"
|
89
89
|
- ".rspec"
|
90
|
+
- ".rubocop.yml"
|
90
91
|
- ".travis.yml"
|
92
|
+
- CHANGELOG.md
|
91
93
|
- Gemfile
|
92
94
|
- Guardfile
|
93
95
|
- LICENSE.txt
|
@@ -95,6 +97,7 @@ files:
|
|
95
97
|
- Rakefile
|
96
98
|
- bin/console
|
97
99
|
- bin/setup
|
100
|
+
- lib/string-similarity.rb
|
98
101
|
- lib/string/similarity.rb
|
99
102
|
- lib/string/similarity/version.rb
|
100
103
|
- string-similarity.gemspec
|
@@ -118,9 +121,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
121
|
version: '0'
|
119
122
|
requirements: []
|
120
123
|
rubyforge_project:
|
121
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.5.1
|
122
125
|
signing_key:
|
123
126
|
specification_version: 4
|
124
127
|
summary: Various methods for calculating string similarities.
|
125
128
|
test_files: []
|
126
|
-
has_rdoc:
|