string-similarity 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +3 -0
- data/README.md +4 -4
- data/lib/string-similarity.rb +1 -0
- data/lib/string/similarity.rb +22 -14
- data/lib/string/similarity/version.rb +1 -1
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd97efdcd76434ae400e6382b55e54d44ce003d8
|
4
|
+
data.tar.gz: 1ca9a5eb0075b86d30afd03669226425161d510f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d904adf7b09fc53dadee2e47f3111f4bcce52d4cabfad7c1b7fee542d9af5c65298cbf7fa1be95d729dfb2672b21994c6f7e28ad0498a5f147311f7dc3e2418f
|
7
|
+
data.tar.gz: d849740e7fb49897439baf66f06f3cafd4677f01c6530bb212bb9b406c5c8c974716b3af562fb72c8bcffeaa1602a4d880bed35b5f5289381e01dd57200b313c
|
data/.rubocop.yml
ADDED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -10,9 +10,9 @@ Library for calculating the similarity of two strings.
|
|
10
10
|
|
11
11
|
## State
|
12
12
|
|
13
|
-
- Cosine
|
14
|
-
- Hamming
|
15
|
-
- Levenshtein
|
13
|
+
- [x] Cosine
|
14
|
+
- [ ] Hamming
|
15
|
+
- [x] Levenshtein
|
16
16
|
|
17
17
|
## Installation
|
18
18
|
|
@@ -33,7 +33,7 @@ Or install it yourself as:
|
|
33
33
|
## Usage
|
34
34
|
|
35
35
|
```ruby
|
36
|
-
require 'string
|
36
|
+
require 'string/similarity'
|
37
37
|
|
38
38
|
# Call the methods on the module
|
39
39
|
String::Similarity.cosine 'foo', 'bar'
|
data/lib/string-similarity.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# require 'string/similarity'
|
data/lib/string/similarity.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'string/similarity/version'
|
2
2
|
|
3
|
+
# For convenience, String is extended by a couple of helper methods
|
3
4
|
class String
|
4
5
|
# Returns the cosine similarity to +other+
|
5
6
|
# @see String::Similarity#cosine
|
@@ -21,7 +22,7 @@ class String
|
|
21
22
|
|
22
23
|
# +String::Similarity+ provides various methods for
|
23
24
|
# calculating string distances.
|
24
|
-
module Similarity
|
25
|
+
module Similarity
|
25
26
|
# Calcuate the {https://en.wikipedia.org/wiki/Cosine_similarity
|
26
27
|
# Cosine similarity} of two strings.
|
27
28
|
#
|
@@ -34,12 +35,13 @@ class String
|
|
34
35
|
# - +1.0+ if the strings are identical
|
35
36
|
# - +0.0+ if the strings are completely different
|
36
37
|
# - +0.0+ if one of the strings is empty
|
37
|
-
def cosine(str1, str2)
|
38
|
+
def self.cosine(str1, str2)
|
38
39
|
return 1.0 if str1 == str2
|
39
40
|
return 0.0 if str1.empty? || str2.empty?
|
40
41
|
|
41
42
|
# convert both texts to vectors
|
42
|
-
v1
|
43
|
+
v1 = vector(str1)
|
44
|
+
v2 = vector(str2)
|
43
45
|
|
44
46
|
# calculate the dot product
|
45
47
|
dot_product = dot(v1, v2)
|
@@ -60,7 +62,7 @@ class String
|
|
60
62
|
# - +1.0+ if the strings are identical
|
61
63
|
# - +0.0+ if one of the strings is empty
|
62
64
|
# @see #levenshtein_distance
|
63
|
-
def levenshtein(str1, str2)
|
65
|
+
def self.levenshtein(str1, str2)
|
64
66
|
return 1.0 if str1.eql?(str2)
|
65
67
|
return 0.0 if str1.empty? || str2.empty?
|
66
68
|
1.0 / levenshtein_distance(str1, str2)
|
@@ -73,11 +75,10 @@ class String
|
|
73
75
|
# @param str2 [String] second string
|
74
76
|
# @return [Fixnum] edit distance between the two strings
|
75
77
|
# - +0+ if the strings are identical
|
76
|
-
def levenshtein_distance(str1, str2)
|
78
|
+
def self.levenshtein_distance(str1, str2)
|
77
79
|
# base cases
|
78
|
-
|
79
|
-
return
|
80
|
-
return str1.length if str2.empty?
|
80
|
+
result = base_case?(str1, str2)
|
81
|
+
return result if result
|
81
82
|
|
82
83
|
# Initialize cost-matrix rows
|
83
84
|
previous = (0..str2.length).to_a
|
@@ -87,11 +88,11 @@ class String
|
|
87
88
|
# first element is always the edit distance from an empty string.
|
88
89
|
current[0] = i + 1
|
89
90
|
(0...str2.length).each do |j|
|
90
|
-
current[j+1] = [
|
91
|
+
current[j + 1] = [
|
91
92
|
# insertion
|
92
93
|
current[j] + 1,
|
93
94
|
# deletion
|
94
|
-
previous[j+1] + 1,
|
95
|
+
previous[j + 1] + 1,
|
95
96
|
# substitution or no operation
|
96
97
|
previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
|
97
98
|
].min
|
@@ -104,19 +105,26 @@ class String
|
|
104
105
|
|
105
106
|
private
|
106
107
|
|
108
|
+
def self.base_case?(str1, str2)
|
109
|
+
return 0 if str1.eql?(str2)
|
110
|
+
return str2.length if str1.empty?
|
111
|
+
return str1.length if str2.empty?
|
112
|
+
false
|
113
|
+
end
|
114
|
+
|
107
115
|
# create a vector from +str+
|
108
116
|
#
|
109
117
|
# @example
|
110
118
|
# v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
|
111
119
|
# v1["x"] # => 0
|
112
|
-
def vector(str)
|
120
|
+
def self.vector(str)
|
113
121
|
v = Hash.new(0)
|
114
122
|
str.each_char { |c| v[c] += 1 }
|
115
123
|
v
|
116
124
|
end
|
117
125
|
|
118
126
|
# calculate the dot product of +vector1+ and +vector2+
|
119
|
-
def dot(vector1, vector2)
|
127
|
+
def self.dot(vector1, vector2)
|
120
128
|
product = 0
|
121
129
|
vector1.each do |k, v|
|
122
130
|
product += v * vector2[k]
|
@@ -125,9 +133,9 @@ class String
|
|
125
133
|
end
|
126
134
|
|
127
135
|
# calculate the magnitude for +vector+
|
128
|
-
def mag(vector)
|
136
|
+
def self.mag(vector)
|
129
137
|
# calculate the sum of squares
|
130
|
-
sq = vector.inject(0) { |
|
138
|
+
sq = vector.inject(0) { |a, e| a + e**2 }
|
131
139
|
Math.sqrt(sq)
|
132
140
|
end
|
133
141
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Manuel Hutter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -87,7 +87,9 @@ extra_rdoc_files: []
|
|
87
87
|
files:
|
88
88
|
- ".gitignore"
|
89
89
|
- ".rspec"
|
90
|
+
- ".rubocop.yml"
|
90
91
|
- ".travis.yml"
|
92
|
+
- CHANGELOG.md
|
91
93
|
- Gemfile
|
92
94
|
- Guardfile
|
93
95
|
- LICENSE.txt
|
@@ -95,6 +97,7 @@ files:
|
|
95
97
|
- Rakefile
|
96
98
|
- bin/console
|
97
99
|
- bin/setup
|
100
|
+
- lib/string-similarity.rb
|
98
101
|
- lib/string/similarity.rb
|
99
102
|
- lib/string/similarity/version.rb
|
100
103
|
- string-similarity.gemspec
|
@@ -118,9 +121,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
121
|
version: '0'
|
119
122
|
requirements: []
|
120
123
|
rubyforge_project:
|
121
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.5.1
|
122
125
|
signing_key:
|
123
126
|
specification_version: 4
|
124
127
|
summary: Various methods for calculating string similarities.
|
125
128
|
test_files: []
|
126
|
-
has_rdoc:
|