string-similarity 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0e99a22e06043cfc985e7cda581c9d5b2747ada
4
- data.tar.gz: dfc533af4d4b9fc4d138d0505193b51cc934098f
3
+ metadata.gz: b89926f7398486de02d94f5f1d8742834a95d54f
4
+ data.tar.gz: 0493548d9494e4855b648551848da77fbef6945d
5
5
  SHA512:
6
- metadata.gz: 84343e34af35640b92bb708fa3d87d96b35aa1d56892dd33ce1f8843edf0439a349a106b924f500b659d7a87d530116a70d9fbef87165502d83fda58a30c9807
7
- data.tar.gz: a458bf9bec226ec89549c7ae7041ee1476e8ba2c2ecac92bc71a89e14f16d7350ee5e00bce3fa206d541613ca620ed373356e6fa399ba34ec60bbe45a8f393e6
6
+ metadata.gz: 8af14a673396cf5b7b20df9e279621e4301fb5c8c52f14b1a8aae3b096d1ad5a480ce6e4cc64c0b5ba6bef1c0fe72f6b530f8a10aef21bfe1c7f932d00007e3a
7
+ data.tar.gz: a89455cf5807fb41749d4f4b86e4642e50cf1ff8620d278e8e3b43c2abf6cd3454e740a81337e1fa7d7d3c76704a17b271a1cd61558362d37e109ca01835be6b
data/.rspec CHANGED
@@ -1,2 +1,3 @@
1
1
  --color
2
2
  --require spec_helper
3
+ --format doc
data/Gemfile CHANGED
@@ -3,4 +3,9 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in string-similarity.gemspec
4
4
  gemspec
5
5
 
6
- gem 'codeclimate-test-reporter', group: :test, require: nil
6
+ group :test do
7
+ gem 'codeclimate-test-reporter', require: nil
8
+ gem 'guard'
9
+ gem 'guard-rspec'
10
+ gem 'growl'
11
+ end
data/README.md CHANGED
@@ -46,6 +46,15 @@ String::Similarity.cosine 'foo', 'foo'
46
46
  # or call on a string directly
47
47
  'string'.cosine_similarity_to 'strong'
48
48
  # => 0.8333333333333335
49
+
50
+
51
+ # Same for Levenshtein:
52
+ String::Similarity.levenshtein_distance('kitten', 'sitting') # or ...
53
+ 'kitten'.levenshtein_distance_to('sitting')
54
+ # => 3
55
+ String::Similarity.levenshtein('foo', 'far') # or ...
56
+ 'far'.levenshtein_similarity_to('foo')
57
+ # => 0.5
49
58
  ```
50
59
 
51
60
  ## Development
data/bin/setup CHANGED
@@ -1,5 +1,8 @@
1
- #!/bin/bash
2
- set -euo pipefail
3
- IFS=$'\n\t'
1
+ #!/bin/sh
2
+ # Install required dependencies and set up for development.
4
3
 
5
- bundle install
4
+ set -e
5
+
6
+ cd "$(dirname $0)/.."
7
+
8
+ bundle install && bundle clean
@@ -1,23 +1,36 @@
1
1
  require 'string/similarity/version'
2
2
 
3
3
  class String
4
-
4
+ # Returns the cosine similarity to +other+
5
+ # @see String::Similarity.cosine
5
6
  def cosine_similarity_to(other)
6
7
  String::Similarity.cosine(self, other)
7
8
  end
8
9
 
10
+ # Returns the Levenshtein distance to +other+
11
+ # @see String::Similarity.levenshtein_distance
12
+ def levenshtein_distance_to(other)
13
+ String::Similarity.levenshtein_distance(self, other)
14
+ end
15
+
16
+ # Returns the Levenshtein similarity to +other+
17
+ # @see String::Similarity.levenshtein
18
+ def levenshtein_similarity_to(other)
19
+ String::Similarity.levenshtein(self, other)
20
+ end
21
+
9
22
  # +String::Similarity+ provides various methods for
10
23
  # calculating string distances.
11
24
  module Similarity extend self
12
-
13
-
14
- # Calcuate the
15
- # {https://en.wikipedia.org/wiki/Cosine_similarity Cosine similarity}
16
- # of two strings.
25
+ # Calculate the {https://en.wikipedia.org/wiki/Cosine_similarity
26
+ # Cosine similarity} of two strings.
27
+ #
28
+ # For an explanation of the Cosine similarity of two strings read
29
+ # {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}.
17
30
  #
18
31
  # @param str1 [String] first string
19
32
  # @param str2 [String] second string
20
- # @return [Float] cosine distance of the two arguments.
33
+ # @return [Float] cosine similarity of the two arguments.
21
34
  # - +1.0+ if the strings are identical
22
35
  # - +0.0+ if the strings are completely different
23
36
  # - +0.0+ if one of the strings is empty
@@ -36,12 +49,66 @@ class String
36
49
  dot_product / magnitude
37
50
  end
38
51
 
52
+ # Calculate the Levenshtein similarity for two strings.
53
+ #
54
+ # This is basically the reciprocal of the levenshtein_distance, i.e.
55
+ # 1 / levenshtein_distance(str1, str2)
56
+ #
57
+ # @param str1 [String] first string
58
+ # @param str2 [String] second string
59
+ # @return [Float] levenshtein similarity of the two arguments.
60
+ # - +1.0+ if the strings are identical
61
+ # - +0.0+ if one of the strings is empty
62
+ # @see #levenshtein_distance
63
+ def levenshtein(str1, str2)
64
+ return 1.0 if str1.eql?(str2)
65
+ return 0.0 if str1.empty? || str2.empty?
66
+ 1.0 / levenshtein_distance(str1, str2)
67
+ end
68
+
69
+ # Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance
70
+ # Levenshtein distance} of two strings.
71
+ #
72
+ # @param str1 [String] first string
73
+ # @param str2 [String] second string
74
+ # @return [Integer] edit distance between the two strings
75
+ # - +0+ if the strings are identical
76
+ def levenshtein_distance(str1, str2)
77
+ # base cases
78
+ return 0 if str1.eql?(str2)
79
+ return str2.length if str1.empty?
80
+ return str1.length if str2.empty?
81
+
82
+ # Initialize cost-matrix rows
83
+ previous = (0..str2.length).to_a
84
+ current = []
85
+
86
+ (0...str1.length).each do |i|
87
+ # first element is always the edit distance from an empty string.
88
+ current[0] = i + 1
89
+ (0...str2.length).each do |j|
90
+ current[j+1] = [
91
+ # insertion
92
+ current[j] + 1,
93
+ # deletion
94
+ previous[j+1] + 1,
95
+ # substitution or no operation
96
+ previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1)
97
+ ].min
98
+ end
99
+ previous = current.dup
100
+ end
101
+
102
+ current[str2.length]
103
+ end
104
+
39
105
  private
40
106
 
41
107
  # create a vector from +str+
42
108
  #
43
109
  # @example
44
- # vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
110
+ # v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1}
111
+ # v1["x"] # => 0
45
112
  def vector(str)
46
113
  v = Hash.new(0)
47
114
  str.each_char { |c| v[c] += 1 }
@@ -51,7 +118,7 @@ class String
51
118
  # calculate the dot product of +vector1+ and +vector2+
52
119
  def dot(vector1, vector2)
53
120
  product = 0
54
- vector1.each do |k,v|
121
+ vector1.each do |k, v|
55
122
  product += v * vector2[k]
56
123
  end
57
124
  product
@@ -60,7 +127,7 @@ class String
60
127
  # calculate the magnitude for +vector+
61
128
  def mag(vector)
62
129
  # calculate the sum of squares
63
- sq = vector.inject(0) { |s,n| s + n**2 }
130
+ sq = vector.inject(0) { |s, n| s + n**2 }
64
131
  Math.sqrt(sq)
65
132
  end
66
133
  end
@@ -1,5 +1,6 @@
1
1
  class String
2
2
  module Similarity
3
- VERSION = '1.0.1'
3
+ # Gem version
4
+ VERSION = '1.1.0'
4
5
  end
5
6
  end
@@ -11,14 +11,18 @@ Gem::Specification.new do |spec|
11
11
 
12
12
  spec.summary = %q{Various methods for calculating string similarities.}
13
13
  spec.description = <<-EOT
14
+ == Description
15
+
14
16
  This gem provides some methods for calculating similarities of two strings.
15
17
 
16
- Currently implemented:
18
+ === Currently implemented
19
+
17
20
  - Cosine similarity
21
+ - Levenshtein distance/similarity
22
+
23
+ === Planned
18
24
 
19
- Planned:
20
25
  - Hamming similarity
21
- - Levenshtein similarity
22
26
  EOT
23
27
  spec.homepage = 'https://github.com/mhutter/string-similarity'
24
28
  spec.license = 'MIT'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manuel Hutter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-04 00:00:00.000000000 Z
11
+ date: 2015-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,14 +67,18 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  description: |
70
+ == Description
71
+
70
72
  This gem provides some methods for calculating similarities of two strings.
71
73
 
72
- Currently implemented:
74
+ === Currently implemented
75
+
73
76
  - Cosine similarity
77
+ - Levenshtein distance/similarity
78
+
79
+ === Planned
74
80
 
75
- Planned:
76
81
  - Hamming similarity
77
- - Levenshtein similarity
78
82
  email:
79
83
  - manuel@hutter.io
80
84
  executables: []