edits 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a437de45c8830ea622a9b57b84b3e92c54a58178
4
- data.tar.gz: 36eff87bb3ba0e832f2cd908cb0bafe358e05a19
3
+ metadata.gz: a6cd424759a87d827084b94756bebd46a2cce133
4
+ data.tar.gz: 703ee761905db5db81c9462727c7812b4519ff8b
5
5
  SHA512:
6
- metadata.gz: bf9474115aacaff6cb0411c85cf8f81409d6cfaff793ef656821fc1fc33ae0d5315aa6af98c8dbe17a68aa38514b8adef134824e2ea6fd8c1434aea372e37b6c
7
- data.tar.gz: 71199e4c9e67f4b78b90e21368dfd1162cd660fa38b01e40ea412e1a37aa29529d57ee78f1e5584f5a6319a22b2134165b638335b0e75b163650aaadaa9d67ae
6
+ metadata.gz: ccee510e44ace7a1c88362dcd8a15fda256d652d9e65e78def3a28b95c3c30fded7ff2e1205b302f56d38f4365fe3f4a678dfec9b5bca2d441b08b00401bd390
7
+ data.tar.gz: 4b56c06fba560443d7aa2c057d959d271a013216d4e45867c688dda07bad18acce8e8730e6513384c3f03aa0b0985b9333c49e942d4e2bb0ee35b2239ea3abc1
@@ -1,5 +1,14 @@
1
1
  sudo: false
2
2
  language: ruby
3
+ cache: bundler
3
4
  rvm:
4
5
  - 2.4.0
5
6
  before_install: gem install bundler -v 1.15.4
7
+ deploy:
8
+ provider: rubygems
9
+ api_key:
10
+ secure: NKUWS12yMpsMtPeKL+4X56xZ02p5y4R+HYQmgvxFNajOS42aTMXncx4eTnuHoNdKbh+x5yci+lbPr+j5t9qWo5GzJsxjqnFW4lO2V5O1ONBzcAB/g6BPDMQsWbX34eBVafCEidNoOY1HhLlKerWzlsGRq+P4q+3WKLJihrl99pyv+EQykqS11/YVCsmFjrPVa/aGk84njbnI/kIDOY5HV2gVBazARixRO5y2AUg3hRUf2+Tu/X1ke80YBD9LazW2kRUzz0Rs+1vc653JtYn6MeK/bEkGWvN2Qs/k6Q2nNr6ni4v39Y07yylu1EnYh1H/0OT/H2hehxsjMQCQhDlVKbN7NTmOV026aWXA2HdSxmVhxQCIKRGW3Nm81kBj1/edXLpJActnLeex2iCMcXJk8yAAzF0q+vSHLld0w9Jx95kIJB4tnaonWJxcaWaX58HWFbdOuYKvQhXqcflI4KmNH8xXm/O0FIM8VEJRg9dojZ5S8Us3fZpBFZxVJ3H3Fcb406AmoIqcHOsqJ1GvBM8EdWkwuaH9GsUWf8pydgKFgStYUaKk8DDmJonT748emG4yw+78uMGWPoFxf+Mc8jazRxIaRiQbjVFUcLjaRkRFlL8UT9BB9k7c1egjvKhUo/pqV7KIEsAJrrh6zZIkz4h9AwxWpZjVj9+z63Kh6NC1lUI=
11
+ gem: edits
12
+ on:
13
+ tags: true
14
+ repo: tcrouch/edits
data/README.md CHANGED
@@ -1,8 +1,13 @@
1
1
  # Edits
2
2
 
3
+ [![Build Status](https://travis-ci.org/tcrouch/edits.svg?branch=master)](https://travis-ci.org/tcrouch/edits)
4
+ [![Code Climate](https://codeclimate.com/github/tcrouch/edits/badges/gpa.svg)](https://codeclimate.com/github/tcrouch/edits)
5
+ [![Inline docs](http://inch-ci.org/github/tcrouch/edits.svg?branch=master)](http://inch-ci.org/github/tcrouch/edits)
6
+ [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](http://rubydoc.info/github/tcrouch/edits)
7
+
3
8
  A collection of edit distance algorithms in Ruby.
4
9
 
5
- Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro and Jaro-Winkler similarity.
10
+ Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro & Jaro-Winkler similarity.
6
11
 
7
12
  ## Installation
8
13
 
@@ -37,14 +42,15 @@ Edits::Levenshtein.distance "acer", "earn"
37
42
  # Max distance
38
43
  Edits::Levenshtein.distance_with_max "iota", "atom", 2
39
44
  # => 2
40
- Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer]
41
- # => "atlas"
45
+ Edits::Levenshtein.most_similar "atom", %w[tree rota toes racer]
46
+ # => "toes"
42
47
  ```
43
48
 
44
49
  ### Restricted Edit (Optimal Alignment)
45
50
 
46
- Edit distance, accounting for deletion, addition, substitution and swapped
47
- characters.
51
+ Edit distance, accounting for deletion, addition, substitution and
52
+ transposition (two adjacent characters are swapped). This variant is
53
+ restricted by the condition that no sub-string is edited more than once.
48
54
 
49
55
  ```ruby
50
56
  Edits::RestrictedEdit.distance "raked", "bakers"
@@ -53,12 +59,18 @@ Edits::RestrictedEdit.distance "iota", "atom"
53
59
  # => 3
54
60
  Edits::RestrictedEdit.distance "acer", "earn"
55
61
  # => 4
62
+
63
+ # Max distance
64
+ Edits::RestrictedEdit.distance_with_max "iota", "atom", 2
65
+ # => 2
66
+ Edits::RestrictedEdit.most_similar "atom", %w[tree rota toes racer]
67
+ # => "rota"
56
68
  ```
57
69
 
58
70
  ### Damerau-Levenshtein
59
71
 
60
72
  Edit distance, accounting for deletions, additions, substitution and
61
- transposition.
73
+ transposition (two adjacent characters are swapped).
62
74
 
63
75
  ```ruby
64
76
  Edits::DamerauLevenshtein.distance "raked", "bakers"
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "edits/version"
4
4
 
5
+ require "edits/compare"
5
6
  require "edits/damerau_levenshtein"
6
7
  require "edits/hamming"
7
8
  require "edits/jaro"
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Edits
4
+ # Comparison helpers
5
+ module Compare
6
+ # Given a prototype string and an array of strings, determines which
7
+ # string is most similar to the prototype.
8
+ #
9
+ # `most_similar("foo", strings)` is functionally equivalent to
10
+ # `strings.min_by { |s| distance("foo", s) }`, leveraging
11
+ # {.distance_with_max}.
12
+ #
13
+ # @example
14
+ # most_similar("atom", %w[tram atlas rota racer])
15
+ # # => "tram"
16
+ # @param prototype [String]
17
+ # @param strings [<String>]
18
+ # @return [String, nil] most similar string, or nil for empty array
19
+ def most_similar(prototype, strings)
20
+ return nil if strings.empty?
21
+ min_s = strings[0]
22
+ min_d = distance(prototype, min_s)
23
+
24
+ strings[1..-1].each do |s|
25
+ return min_s if min_d.zero?
26
+ d = distance_with_max(prototype, s, min_d)
27
+ if d < min_d
28
+ min_d = d
29
+ min_s = s
30
+ end
31
+ end
32
+
33
+ min_s
34
+ end
35
+ end
36
+ end
@@ -1,13 +1,13 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
- # Implemention of the Damerau/Levenshtein distance algorithm.
4
+ # Implements the Damerau/Levenshtein distance algorithm.
5
5
  #
6
6
  # Determines distance between two strings by counting edits, identifying:
7
7
  # * Insertion
8
8
  # * Deletion
9
9
  # * Substitution
10
- # * Transposition
10
+ # * Adjacent transposition
11
11
  module DamerauLevenshtein
12
12
  # Calculate the Damerau/Levenshtein distance of two sequences.
13
13
  #
@@ -18,13 +18,9 @@ module Edits
18
18
  # @param seq2 [String, Array]
19
19
  # @return [Integer]
20
20
  def self.distance(seq1, seq2)
21
- if seq1.length > seq2.length
22
- temp = seq1
23
- seq1 = seq2
24
- seq2 = temp
25
- end
21
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
26
22
 
27
- # array of Integer codepoints outperforms String
23
+ # array of codepoints outperforms String
28
24
  seq1 = seq1.codepoints if seq1.is_a? String
29
25
  seq2 = seq2.codepoints if seq2.is_a? String
30
26
 
@@ -34,7 +30,7 @@ module Edits
34
30
  return rows if cols.zero?
35
31
 
36
32
  # 'infinite' edit distance for padding cost matrix.
37
- # Can be any value greater than max[rows, cols]
33
+ # Can be any value > max[rows, cols]
38
34
  inf = rows + cols
39
35
 
40
36
  # Initialize first two rows of cost matrix.
@@ -71,14 +67,14 @@ module Edits
71
67
 
72
68
  # TODO: do insertion/deletion need to be considered when
73
69
  # seq1_item == seq2_item ?
74
- deletion = matrix[row][col + 1] + 1
75
- insertion = matrix[row + 1][col] + 1
76
- substitution = matrix[row][col] + sub_cost
77
-
78
- # step cost is min of operation costs
79
- cost = substitution < insertion ? substitution : insertion
80
- cost = deletion if deletion < cost
81
- cost = transposition if transposition < cost
70
+ #
71
+ # substitution, deletion, insertion, transposition
72
+ cost = [
73
+ matrix[row][col] + sub_cost,
74
+ matrix[row][col + 1] + 1,
75
+ matrix[row + 1][col] + 1,
76
+ transposition
77
+ ].min
82
78
 
83
79
  matrix[row + 1][col + 1] = cost
84
80
 
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
+ # Implements Hamming distance algorithm
5
+ #
4
6
  # @see https://en.wikipedia.org/wiki/Hamming_distance
5
7
  module Hamming
6
8
  # Calculate the Hamming distance between two sequences.
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
+ # Implements Jaro similarity algorithm.
5
+ #
4
6
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
5
7
  module Jaro
6
8
  # Calculate Jaro similarity
@@ -14,7 +16,7 @@ module Edits
14
16
  # # => 0.9023569023569024
15
17
  # @param seq1 [String, Array]
16
18
  # @param seq2 [String, Array]
17
- # @return [Float]
19
+ # @return [Float] similarity, between 0.0 (none) and 1.0 (identical)
18
20
  def self.similarity(seq1, seq2)
19
21
  return 1.0 if seq1 == seq2
20
22
  return 0.0 if seq1.empty? || seq2.empty?
@@ -26,16 +28,18 @@ module Edits
26
28
  return 0.0 if m.zero?
27
29
 
28
30
  m = m.to_f
29
- (1.0 / 3) * ((m / seq1.length) + (m / seq2.length) + ((m - t) / m))
31
+ ((m / seq1.length) + (m / seq2.length) + ((m - t) / m)) / 3
30
32
  end
31
33
 
32
34
  # Calculate Jaro distance
33
35
  #
36
+ # `Dj = 1 - Sj`
37
+ #
34
38
  # @example
35
39
  # Edits::Jaro.distance("information", "informant")
36
40
  # # => 0.09764309764309764
37
41
  # @param (see #distance)
38
- # @return [Float]
42
+ # @return [Float] distance, between 0.0 (identical) and 1.0 (distant)
39
43
  def self.distance(str1, str2)
40
44
  1.0 - similarity(str1, str2)
41
45
  end
@@ -45,11 +49,7 @@ module Edits
45
49
  # @param (see #distance)
46
50
  # @return [(Integer, Integer)] matches and transpositions
47
51
  def self.jaro_matches(seq1, seq2)
48
- if seq1.length > seq2.length
49
- temp = seq1
50
- seq1 = seq2
51
- seq2 = temp
52
- end
52
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
53
53
 
54
54
  # search range: (max(|A|, |B|) / 2) - 1
55
55
  range = (seq2.length / 2) - 1
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
+ # Implements Jaro-Winkler similarity algorithm.
5
+ #
4
6
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
5
7
  module JaroWinkler
6
8
  # Prefix scaling factor for jaro-winkler metric. Default is 0.1
@@ -13,9 +15,9 @@ module Edits
13
15
 
14
16
  # Calculate Jaro-Winkler similarity of given strings
15
17
  #
16
- # Adds weight to Jaro distance according to the length of a common prefix
18
+ # Adds weight to Jaro similarity according to the length of a common prefix
17
19
  # of up to 4 letters, where one exists. The additional weighting is only
18
- # applied when the original distance passes a threshold.
20
+ # applied when the original similarity passes a threshold.
19
21
  #
20
22
  # `Sw = Sj + (l * p * (1 - Sj))`
21
23
  #
@@ -29,7 +31,7 @@ module Edits
29
31
  # @param seq2 [String, Array]
30
32
  # @param threshold [Float] threshold for applying Winkler prefix weighting
31
33
  # @param weight [Float] weighting for common prefix, should not exceed 0.25
32
- # @return [Float]
34
+ # @return [Float] similarity, between 0.0 (none) and 1.0 (identical)
33
35
  def self.similarity(
34
36
  seq1, seq2,
35
37
  threshold: WINKLER_THRESHOLD,
@@ -60,7 +62,7 @@ module Edits
60
62
  # Edits::JaroWinkler.distance("information", "informant")
61
63
  # # => 0.05858585858585863
62
64
  # @param (see .similarity)
63
- # @return [Float]
65
+ # @return [Float] distance, between 0.0 (identical) and 1.0 (distant)
64
66
  def self.distance(
65
67
  seq1, seq2,
66
68
  threshold: WINKLER_THRESHOLD,
@@ -1,13 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
- # Implementation of Levenshtein distance algorithm.
4
+ # Implements Levenshtein distance algorithm.
5
5
  #
6
6
  # Determines distance between two strings by counting edits, identifying:
7
- # - Insertion
8
- # - Deletion
9
- # - Substitution
7
+ # * Insertion
8
+ # * Deletion
9
+ # * Substitution
10
10
  module Levenshtein
11
+ extend Compare
12
+
11
13
  # Calculate the Levenshtein (edit) distance of two sequences.
12
14
  #
13
15
  # @note A true distance metric, satisfies triangle inequality.
@@ -18,13 +20,9 @@ module Edits
18
20
  # @param seq2 [String, Array]
19
21
  # @return [Integer]
20
22
  def self.distance(seq1, seq2)
21
- if seq1.length > seq2.length
22
- temp = seq1
23
- seq1 = seq2
24
- seq2 = temp
25
- end
23
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
26
24
 
27
- # array of Integer codepoints outperforms String
25
+ # array of codepoints outperforms String
28
26
  seq1 = seq1.codepoints if seq1.is_a? String
29
27
  seq2 = seq2.codepoints if seq2.is_a? String
30
28
 
@@ -41,24 +39,25 @@ module Edits
41
39
  last_row = 0.upto(cols).to_a
42
40
 
43
41
  rows.times do |row|
44
- last_col = row + 1
45
-
42
+ prev_col_cost = row + 1
46
43
  seq1_item = seq1[row]
47
44
 
48
45
  cols.times do |col|
49
- deletion = last_row[col + 1] + 1
50
- insertion = last_col + 1
51
- substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)
52
-
46
+ # | Xs | Xd |
47
+ # | Xi | ? |
53
48
  # step cost is min of operation costs
54
- cost = deletion < insertion ? deletion : insertion
55
- cost = substitution if substitution < cost
49
+ # substitution, deletion, insertion
50
+ cost = [
51
+ last_row[col] + (seq1_item == seq2[col] ? 0 : 1),
52
+ last_row[col + 1] + 1,
53
+ prev_col_cost + 1
54
+ ].min
56
55
 
57
56
  # overwrite previous row as we progress
58
- last_row[col] = last_col
59
- last_col = cost
57
+ last_row[col] = prev_col_cost
58
+ prev_col_cost = cost
60
59
  end
61
- last_row[cols] = last_col
60
+ last_row[cols] = prev_col_cost
62
61
  end
63
62
 
64
63
  last_row[cols]
@@ -77,85 +76,56 @@ module Edits
77
76
  # @param max [Integer] maximum distance
78
77
  # @return [Integer]
79
78
  def self.distance_with_max(seq1, seq2, max)
80
- if seq1.length > seq2.length
81
- temp = seq1
82
- seq1 = seq2
83
- seq2 = temp
84
- end
79
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
85
80
 
86
81
  rows = seq1.length
87
82
  cols = seq2.length
88
- return cols if rows.zero?
89
- return rows if cols.zero?
90
- return max if (rows - cols).abs >= max
83
+ return cols > max ? max : cols if rows.zero?
84
+ return rows > max ? max : rows if cols.zero?
85
+ return max if (cols - rows) >= max
91
86
 
87
+ # array of codepoints outperforms String
92
88
  seq1 = seq1.codepoints if seq1.is_a? String
93
89
  seq2 = seq2.codepoints if seq2.is_a? String
94
90
 
91
+ # 'infinite' edit distance for padding cost matrix.
92
+ # Can be any value > max[rows, cols]
93
+ inf = cols + 1
94
+
95
+ # retain previous row of cost matrix
95
96
  last_row = 0.upto(cols).to_a
96
97
 
97
98
  rows.times do |row|
98
- last_col_cost = row + 1
99
- seq1_item = seq1[row]
100
-
99
+ # Ukkonen cut-off
101
100
  min_col = row > max ? row - max : 0
102
101
  max_col = row + max
103
102
  max_col = cols - 1 if max_col > cols - 1
103
+
104
+ prev_col_cost = min_col.zero? ? row + 1 : inf
105
+ seq1_item = seq1[row]
104
106
  diagonal = cols - rows + row
105
107
 
106
- cols.times do |col|
108
+ min_col.upto(max_col) do |col|
107
109
  return max if diagonal == col && last_row[col] >= max
108
- col_cost =
109
- if col < min_col || col > max_col
110
- max + 1
111
- else
112
- # step cost is min of operation costs
113
- deletion = last_row[col + 1] + 1
114
- insertion = last_col_cost + 1
115
- substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)
116
-
117
- cost = deletion < insertion ? deletion : insertion
118
- substitution < cost ? substitution : cost
119
- end
120
-
121
- last_row[col] = last_col_cost
122
- last_col_cost = col_cost
123
- end
124
-
125
- last_row[cols] = last_col_cost
126
- end
127
110
 
128
- last_row[cols] > max ? max : last_row[cols]
129
- end
111
+ # | Xs | Xd |
112
+ # | Xi | ? |
113
+ # substitution, deletion, insertion
114
+ cost = [
115
+ last_row[col] + (seq1_item == seq2[col] ? 0 : 1),
116
+ last_row[col + 1] + 1,
117
+ prev_col_cost + 1
118
+ ].min
130
119
 
131
- # Given a prototype string and an array of strings, determines which
132
- # string is most similar to the prototype.
133
- #
134
- # `Levenshtein.most_similar("foo", strings)` is functionally equivalent to
135
- # `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging
136
- # {.distance_with_max}.
137
- #
138
- # @example
139
- # Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer])
140
- # # => "atlas"
141
- # @param prototype [String]
142
- # @param strings [<String>]
143
- # @return [String, nil] most similar string, or nil for empty array
144
- def self.most_similar(prototype, strings)
145
- return nil if strings.empty?
146
- min_s = strings[0]
147
- min_d = distance(prototype, min_s)
148
-
149
- strings[1..-1].each do |s|
150
- return min_s if min_d.zero?
151
- d = distance_with_max(prototype, s, min_d)
152
- if d < min_d
153
- min_d = d
154
- min_s = s
120
+ # overwrite previous row as we progress
121
+ last_row[col] = prev_col_cost
122
+ prev_col_cost = cost
155
123
  end
124
+
125
+ last_row[cols] = prev_col_cost
156
126
  end
157
127
 
158
- min_s
128
+ last_row[cols] > max ? max : last_row[cols]
159
129
  end
160
130
  end
161
131
  end
@@ -8,8 +8,13 @@ module Edits
8
8
  # * Insertion
9
9
  # * Deletion
10
10
  # * Substitution
11
- # * Swapped items
11
+ # * Adjacent transposition
12
+ #
13
+ # This variant is restricted by the condition that no sub-string is edited
14
+ # more than once.
12
15
  module RestrictedEdit
16
+ extend Compare
17
+
13
18
  # Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
14
19
  # of two sequences.
15
20
  #
@@ -21,13 +26,9 @@ module Edits
21
26
  # @param seq2 [String, Array]
22
27
  # @return [Integer]
23
28
  def self.distance(seq1, seq2)
24
- if seq1.length > seq2.length
25
- temp = seq1
26
- seq1 = seq2
27
- seq2 = temp
28
- end
29
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
29
30
 
30
- # array of Integer codepoints outperforms String
31
+ # array of codepoints outperforms String
31
32
  seq1 = seq1.codepoints if seq1.is_a? String
32
33
  seq2 = seq2.codepoints if seq2.is_a? String
33
34
 
@@ -36,9 +37,10 @@ module Edits
36
37
  return cols if rows.zero?
37
38
  return rows if cols.zero?
38
39
 
39
- # previous two rows of cost matrix are retained
40
+ # retain previous two rows of cost matrix
40
41
  lastlast_row = []
41
42
  last_row = []
43
+
42
44
  # Initialize first row of cost matrix.
43
45
  # The full initial state where cols=3, rows=2 would be:
44
46
  # [[0, 1, 2, 3],
@@ -47,29 +49,29 @@ module Edits
47
49
  curr_row = 0.upto(cols).to_a
48
50
 
49
51
  rows.times do |row|
50
- lastlast_row = last_row
51
- last_row = curr_row
52
+ # rotate row arrays
53
+ curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row
52
54
 
53
- # generate next row of cost matrix
54
- curr_row = Array.new(cols + 1, 0)
55
55
  curr_row[0] = row + 1
56
-
57
- curr_item = seq1[row]
56
+ seq1_item = seq1[row]
58
57
 
59
58
  cols.times do |col|
60
- sub_cost = curr_item == seq2[col] ? 0 : 1
61
- is_swap = sub_cost == 1 &&
59
+ sub_cost = seq1_item == seq2[col] ? 0 : 1
60
+ is_swap = sub_cost.positive? &&
62
61
  row.positive? && col.positive? &&
63
- curr_item == seq2[col - 1] &&
62
+ seq1_item == seq2[col - 1] &&
64
63
  seq1[row - 1] == seq2[col]
65
64
 
66
- deletion = last_row[col + 1] + 1
67
- insertion = curr_row[col] + 1
68
- substitution = last_row[col] + sub_cost
69
-
65
+ # | Xt | | |
66
+ # | | Xs | Xd |
67
+ # | | Xi | ? |
70
68
  # step cost is min of operation costs
71
- cost = deletion < insertion ? deletion : insertion
72
- cost = substitution if substitution < cost
69
+ # substitution, deletion, insertion, transposition
70
+ cost = [
71
+ last_row[col] + sub_cost,
72
+ last_row[col + 1] + 1,
73
+ curr_row[col] + 1
74
+ ].min
73
75
 
74
76
  if is_swap
75
77
  swap = lastlast_row[col - 1] + 1
@@ -82,5 +84,84 @@ module Edits
82
84
 
83
85
  curr_row[cols]
84
86
  end
87
+
88
+ # Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
89
+ # of two sequences, bounded by a maximum value.
90
+ #
91
+ # @example
92
+ # Edits::RestrictedEdit.distance("cloud", "crayon")
93
+ # # => 5
94
+ # Edits::RestrictedEdit.distance_with_max("cloud", "crayon", 2)
95
+ # # => 2
96
+ # @param seq1 [String, Array]
97
+ # @param seq2 [String, Array]
98
+ # @param max [Integer] maximum distance
99
+ # @return [Integer]
100
+ def self.distance_with_max(seq1, seq2, max)
101
+ seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
102
+
103
+ rows = seq1.length
104
+ cols = seq2.length
105
+ return cols > max ? max : cols if rows.zero?
106
+ return rows > max ? max : rows if cols.zero?
107
+ return max if (cols - rows) >= max
108
+
109
+ # array of codepoints outperforms String
110
+ seq1 = seq1.codepoints if seq1.is_a? String
111
+ seq2 = seq2.codepoints if seq2.is_a? String
112
+
113
+ # 'infinite' edit distance for padding cost matrix.
114
+ # Can be any value > max[rows, cols]
115
+ inf = cols + 1
116
+
117
+ # retain previous two rows of cost matrix,
118
+ # padded with "inf" as matrix is not fully evaluated
119
+ lastlast_row = Array.new(inf, inf)
120
+ last_row = Array.new(inf, inf)
121
+ curr_row = 0.upto(cols).to_a
122
+
123
+ rows.times do |row|
124
+ # rotate row arrays
125
+ curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row
126
+
127
+ # Ukkonen cut-off
128
+ min_col = row > max ? row - max : 0
129
+ max_col = row + max
130
+ max_col = cols - 1 if max_col > cols - 1
131
+
132
+ curr_row[min_col] = min_col.zero? ? row + 1 : inf
133
+ seq1_item = seq1[row]
134
+ diagonal = cols - rows + row
135
+
136
+ min_col.upto(max_col) do |col|
137
+ return max if diagonal == col && last_row[col] >= max
138
+
139
+ sub_cost = seq1_item == seq2[col] ? 0 : 1
140
+ is_swap = sub_cost.positive? &&
141
+ row.positive? && col.positive? &&
142
+ seq1_item == seq2[col - 1] &&
143
+ seq1[row - 1] == seq2[col]
144
+
145
+ # | Xt | | |
146
+ # | | Xs | Xd |
147
+ # | | Xi | ? |
148
+ # substitution, deletion, insertion, transposition
149
+ cost = [
150
+ last_row[col] + sub_cost,
151
+ last_row[col + 1] + 1,
152
+ curr_row[col] + 1
153
+ ].min
154
+
155
+ if is_swap
156
+ swap = lastlast_row[col - 1] + 1
157
+ cost = swap if swap < cost
158
+ end
159
+
160
+ curr_row[col + 1] = cost
161
+ end
162
+ end
163
+
164
+ curr_row[cols] > max ? max : curr_row[cols]
165
+ end
85
166
  end
86
167
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Edits
4
- VERSION = "0.1.0"
4
+ # Current gem version
5
+ VERSION = "0.2.0"
5
6
  end
@@ -5,7 +5,7 @@ require "benchmark/ips"
5
5
  require "edits"
6
6
 
7
7
  namespace :benchmark do
8
- desc "distance vs. distance_with_max (x100)"
8
+ desc "levenshtein distance vs. distance_with_max (x100)"
9
9
  task :lev_max do
10
10
  words = File.read("/usr/share/dict/words")
11
11
  .split(/\n/).compact.shuffle(random: Random.new(1))
@@ -64,6 +64,65 @@ namespace :benchmark do
64
64
  end
65
65
  end
66
66
 
67
+ desc "restricted distance vs. distance_with_max (x100)"
68
+ task :restricted_max do
69
+ words = File.read("/usr/share/dict/words")
70
+ .split(/\n/).compact.shuffle(random: Random.new(1))
71
+ .take(101)
72
+
73
+ Benchmark.ips do |x|
74
+ x.report("distance") do
75
+ words.each_cons(2) do |a, b|
76
+ Edits::RestrictedEdit.distance a, b
77
+ end
78
+ end
79
+
80
+ x.report("with max 1") do
81
+ words.each_cons(2) do |a, b|
82
+ Edits::RestrictedEdit.distance_with_max a, b, 1
83
+ end
84
+ end
85
+
86
+ x.report("with max 2") do
87
+ words.each_cons(2) do |a, b|
88
+ Edits::RestrictedEdit.distance_with_max a, b, 2
89
+ end
90
+ end
91
+
92
+ x.report("with max 3") do
93
+ words.each_cons(2) do |a, b|
94
+ Edits::RestrictedEdit.distance_with_max a, b, 3
95
+ end
96
+ end
97
+
98
+ x.report("with max 4") do
99
+ words.each_cons(2) do |a, b|
100
+ Edits::RestrictedEdit.distance_with_max a, b, 4
101
+ end
102
+ end
103
+
104
+ x.report("with max 6") do
105
+ words.each_cons(2) do |a, b|
106
+ Edits::RestrictedEdit.distance_with_max a, b, 6
107
+ end
108
+ end
109
+
110
+ x.report("with max 8") do
111
+ words.each_cons(2) do |a, b|
112
+ Edits::RestrictedEdit.distance_with_max a, b, 8
113
+ end
114
+ end
115
+
116
+ x.report("with max 50") do
117
+ words.each_cons(2) do |a, b|
118
+ Edits::RestrictedEdit.distance_with_max a, b, 100
119
+ end
120
+ end
121
+
122
+ x.compare!
123
+ end
124
+ end
125
+
67
126
  desc "most_similar vs. min_by (100 words)"
68
127
  task :lev_similar do
69
128
  words = File.read("/usr/share/dict/words")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: edits
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tom Crouch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-22 00:00:00.000000000 Z
11
+ date: 2017-10-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,6 +114,7 @@ files:
114
114
  - bin/setup
115
115
  - edits.gemspec
116
116
  - lib/edits.rb
117
+ - lib/edits/compare.rb
117
118
  - lib/edits/damerau_levenshtein.rb
118
119
  - lib/edits/hamming.rb
119
120
  - lib/edits/jaro.rb
@@ -143,7 +144,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
143
144
  version: '0'
144
145
  requirements: []
145
146
  rubyforge_project:
146
- rubygems_version: 2.6.8
147
+ rubygems_version: 2.6.13
147
148
  signing_key:
148
149
  specification_version: 4
149
150
  summary: A collection of edit distance algorithms.