string-similarity 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0e99a22e06043cfc985e7cda581c9d5b2747ada
4
- data.tar.gz: dfc533af4d4b9fc4d138d0505193b51cc934098f
3
+ metadata.gz: b89926f7398486de02d94f5f1d8742834a95d54f
4
+ data.tar.gz: 0493548d9494e4855b648551848da77fbef6945d
5
5
  SHA512:
6
- metadata.gz: 84343e34af35640b92bb708fa3d87d96b35aa1d56892dd33ce1f8843edf0439a349a106b924f500b659d7a87d530116a70d9fbef87165502d83fda58a30c9807
7
- data.tar.gz: a458bf9bec226ec89549c7ae7041ee1476e8ba2c2ecac92bc71a89e14f16d7350ee5e00bce3fa206d541613ca620ed373356e6fa399ba34ec60bbe45a8f393e6
6
+ metadata.gz: 8af14a673396cf5b7b20df9e279621e4301fb5c8c52f14b1a8aae3b096d1ad5a480ce6e4cc64c0b5ba6bef1c0fe72f6b530f8a10aef21bfe1c7f932d00007e3a
7
+ data.tar.gz: a89455cf5807fb41749d4f4b86e4642e50cf1ff8620d278e8e3b43c2abf6cd3454e740a81337e1fa7d7d3c76704a17b271a1cd61558362d37e109ca01835be6b
data/.rspec CHANGED
@@ -1,2 +1,3 @@
1
1
  --color
2
2
  --require spec_helper
3
+ --format doc
data/Gemfile CHANGED
@@ -3,4 +3,9 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in string-similarity.gemspec
4
4
  gemspec
5
5
 
6
- gem 'codeclimate-test-reporter', group: :test, require: nil
6
+ group :test do
7
+ gem 'codeclimate-test-reporter', require: nil
8
+ gem 'guard'
9
+ gem 'guard-rspec'
10
+ gem 'growl'
11
+ end
data/README.md CHANGED
@@ -46,6 +46,15 @@ String::Similarity.cosine 'foo', 'foo'
46
46
  # or call on a string directly
47
47
  'string'.cosine_similarity_to 'strong'
48
48
  # => 0.8333333333333335
49
+
50
+
51
+ # Same for Levenshtein:
52
+ String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
53
+ 'kitten'.levenshtein_distance_to('sitting')
54
+ # => 3
55
+ String::Similarity.levenshtein('foo', 'far') # or ...
56
+ 'far'.levenshtein_similarity_to('foo')
57
+ # => 0.5
49
58
  ```
50
59
 
51
60
  ## Development
data/bin/setup CHANGED
@@ -1,5 +1,8 @@
1
- #!/bin/bash
2
- set -euo pipefail
3
- IFS=$'\n\t'
1
+ #!/bin/sh
2
+ # Install required dependencies and set up for development.
4
3
 
5
- bundle install
4
+ set -e
5
+
6
+ cd "$(dirname $0)/.."
7
+
8
+ bundle install && bundle clean
@@ -1,23 +1,36 @@
1
1
  require 'string/similarity/version'
2
2
 
3
3
  class String
4
-
4
+ # Returns the cosine similarity to +other+
5
+ # @see String::Similarity#cosine
5
6
  def cosine_similarity_to(other)
6
7
  String::Similarity.cosine(self, other)
7
8
  end
8
9
 
10
+ # Returns the Levenshtein distance to +other+
11
+ # @see String::Similarity.levenshtein_distance
12
+ def levenshtein_distance_to(other)
13
+ String::Similarity.levenshtein_distance(self, other)
14
+ end
15
+
16
+ # Returns the Levenshtein similarity to +other+
17
+ # @see String::Similarity.levenshtein
18
+ def levenshtein_similarity_to(other)
19
+ String::Similarity.levenshtein(self, other)
20
+ end
21
+
9
22
  # +String::Similarity+ provides various methods for
10
23
  # calculating string distances.
11
24
  module Similarity extend self
12
-
13
-
14
- # Calcuate the
15
- # {https://en.wikipedia.org/wiki/Cosine_similarity Cosine similarity}
16
- # of two strings.
25
+ # Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
26
+ # Cosine similarity} of two strings.
27
+ #
28
+ # For an explanation of the Cosine similarity of two strings read
29
+ # {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
17
30
  #
18
31
  # @param str1 [String] first string
19
32
  # @param str2 [String] second string
20
- # @return [Float] cosine distance of the two arguments.
33
+ # @return [Float] cosine similarity of the two arguments.
21
34
  # - +1.0+ if the strings are identical
22
35
  # - +0.0+ if the strings are completely different
23
36
  # - +0.0+ if one of the strings is empty
@@ -36,12 +49,66 @@ class String
36
49
  dot_product / magnitude
37
50
  end
38
51
 
52
+ # Calculate the Levenshtein similarity for two strings.
53
+ #
54
+ # This is basically the reciprocal of the levenshtein_distance, i.e.
55
+ # 1.0 / levenshtein_distance(str1, str2)
56
+ #
57
+ # @param str1 [String] first string
58
+ # @param str2 [String] second string
59
+ # @return [Float] levenshtein similarity of the two arguments.
60
+ # - +1.0+ if the strings are identical
61
+ # - +0.0+ if exactly one of the strings is empty
62
+ # @see #levenshtein_distance
63
+ def levenshtein(str1, str2)
64
+ return 1.0 if str1.eql?(str2)
65
+ return 0.0 if str1.empty? || str2.empty?
66
+ 1.0 / levenshtein_distance(str1, str2)
67
+ end
68
+
69
+ # Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
70
+ # Levenshtein distance} of two strings.
71
+ #
72
+ # @param str1 [String] first string
73
+ # @param str2 [String] second string
74
+ # @return [Integer] edit distance between the two strings
75
+ # - +0+ if the strings are identical
76
+ def levenshtein_distance(str1, str2)
77
+ # base cases
78
+ return 0 if str1.eql?(str2)
79
+ return str2.length if str1.empty?
80
+ return str1.length if str2.empty?
81
+
82
+ # Initialize cost-matrix rows
83
+ previous = (0..str2.length).to_a
84
+ current = []
85
+
86
+ (0...str1.length).each do |i|
87
+ # first element is always the edit distance from an empty string.
88
+ current[0] = i + 1
89
+ (0...str2.length).each do |j|
90
+ current[j+1] = [
91
+ # insertion
92
+ current[j] + 1,
93
+ # deletion
94
+ previous[j+1] + 1,
95
+ # substitution or no operation
96
+ previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
97
+ ].min
98
+ end
99
+ previous = current.dup
100
+ end
101
+
102
+ current[str2.length]
103
+ end
104
+
39
105
  private
40
106
 
41
107
  # create a vector from +str+
42
108
  #
43
109
  # @example
44
- # vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
110
+ # v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
111
+ # v1["x"] # => 0
45
112
  def vector(str)
46
113
  v = Hash.new(0)
47
114
  str.each_char { |c| v[c] += 1 }
@@ -51,7 +118,7 @@ class String
51
118
  # calculate the dot product of +vector1+ and +vector2+
52
119
  def dot(vector1, vector2)
53
120
  product = 0
54
- vector1.each do |k,v|
121
+ vector1.each do |k, v|
55
122
  product += v * vector2[k]
56
123
  end
57
124
  product
@@ -60,7 +127,7 @@ class String
60
127
  # calculate the magnitude for +vector+
61
128
  def mag(vector)
62
129
  # calculate the sum of squares
63
- sq = vector.inject(0) { |s,n| s + n**2 }
130
+ sq = vector.inject(0) { |s, n| s + n**2 }
64
131
  Math.sqrt(sq)
65
132
  end
66
133
  end
@@ -1,5 +1,6 @@
1
1
  class String
2
2
  module Similarity
3
- VERSION = '1.0.1'
3
+ # Gem version
4
+ VERSION = '1.1.0'
4
5
  end
5
6
  end
@@ -11,14 +11,18 @@ Gem::Specification.new do |spec|
11
11
 
12
12
  spec.summary = %q{Various methods for calculating string similarities.}
13
13
  spec.description = <<-EOT
14
+ == Description
15
+
14
16
  This gem provides some methods for calculating similarities of two strings.
15
17
 
16
- Currently implemented:
18
+ === Currently implemented
19
+
17
20
  - Cosine similarity
21
+ - Levenshtein distance/similarity
22
+
23
+ === Planned
18
24
 
19
- Planned:
20
25
  - Hamming similarity
21
- - Levenshtein similarity
22
26
  EOT
23
27
  spec.homepage = 'https://github.com/mhutter/string-similarity'
24
28
  spec.license = 'MIT'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Hutter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-04 00:00:00.000000000 Z
11
+ date: 2015-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,14 +67,18 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  description: |
70
+ == Description
71
+
70
72
  This gem provides some methods for calculating similarities of two strings.
71
73
 
72
- Currently implemented:
74
+ === Currently implemented
75
+
73
76
  - Cosine similarity
77
+ - Levenshtein distance/similarity
78
+
79
+ === Planned
74
80
 
75
- Planned:
76
81
  - Hamming similarity
77
- - Levenshtein similarity
78
82
  email:
79
83
  - manuel@hutter.io
80
84
  executables: []