edits 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +9 -0
- data/README.md +18 -6
- data/lib/edits.rb +1 -0
- data/lib/edits/compare.rb +36 -0
- data/lib/edits/damerau_levenshtein.rb +13 -17
- data/lib/edits/hamming.rb +2 -0
- data/lib/edits/jaro.rb +8 -8
- data/lib/edits/jaro_winkler.rb +6 -4
- data/lib/edits/levenshtein.rb +49 -79
- data/lib/edits/restricted_edit.rb +104 -23
- data/lib/edits/version.rb +2 -1
- data/tasks/benchmark/levenshtein.rake +60 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6cd424759a87d827084b94756bebd46a2cce133
|
4
|
+
data.tar.gz: 703ee761905db5db81c9462727c7812b4519ff8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccee510e44ace7a1c88362dcd8a15fda256d652d9e65e78def3a28b95c3c30fded7ff2e1205b302f56d38f4365fe3f4a678dfec9b5bca2d441b08b00401bd390
|
7
|
+
data.tar.gz: 4b56c06fba560443d7aa2c057d959d271a013216d4e45867c688dda07bad18acce8e8730e6513384c3f03aa0b0985b9333c49e942d4e2bb0ee35b2239ea3abc1
|
data/.travis.yml
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
sudo: false
|
2
2
|
language: ruby
|
3
|
+
cache: bundler
|
3
4
|
rvm:
|
4
5
|
- 2.4.0
|
5
6
|
before_install: gem install bundler -v 1.15.4
|
7
|
+
deploy:
|
8
|
+
provider: rubygems
|
9
|
+
api_key:
|
10
|
+
secure: NKUWS12yMpsMtPeKL+4X56xZ02p5y4R+HYQmgvxFNajOS42aTMXncx4eTnuHoNdKbh+x5yci+lbPr+j5t9qWo5GzJsxjqnFW4lO2V5O1ONBzcAB/g6BPDMQsWbX34eBVafCEidNoOY1HhLlKerWzlsGRq+P4q+3WKLJihrl99pyv+EQykqS11/YVCsmFjrPVa/aGk84njbnI/kIDOY5HV2gVBazARixRO5y2AUg3hRUf2+Tu/X1ke80YBD9LazW2kRUzz0Rs+1vc653JtYn6MeK/bEkGWvN2Qs/k6Q2nNr6ni4v39Y07yylu1EnYh1H/0OT/H2hehxsjMQCQhDlVKbN7NTmOV026aWXA2HdSxmVhxQCIKRGW3Nm81kBj1/edXLpJActnLeex2iCMcXJk8yAAzF0q+vSHLld0w9Jx95kIJB4tnaonWJxcaWaX58HWFbdOuYKvQhXqcflI4KmNH8xXm/O0FIM8VEJRg9dojZ5S8Us3fZpBFZxVJ3H3Fcb406AmoIqcHOsqJ1GvBM8EdWkwuaH9GsUWf8pydgKFgStYUaKk8DDmJonT748emG4yw+78uMGWPoFxf+Mc8jazRxIaRiQbjVFUcLjaRkRFlL8UT9BB9k7c1egjvKhUo/pqV7KIEsAJrrh6zZIkz4h9AwxWpZjVj9+z63Kh6NC1lUI=
|
11
|
+
gem: edits
|
12
|
+
on:
|
13
|
+
tags: true
|
14
|
+
repo: tcrouch/edits
|
data/README.md
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
# Edits
|
2
2
|
|
3
|
+
[Build Status](https://travis-ci.org/tcrouch/edits)
|
4
|
+
[Code Climate](https://codeclimate.com/github/tcrouch/edits)
|
5
|
+
[Inline docs](http://inch-ci.org/github/tcrouch/edits)
|
6
|
+
[Yard Docs](http://rubydoc.info/github/tcrouch/edits)
|
7
|
+
|
3
8
|
A collection of edit distance algorithms in Ruby.
|
4
9
|
|
5
|
-
Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro
|
10
|
+
Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro & Jaro-Winkler similarity.
|
6
11
|
|
7
12
|
## Installation
|
8
13
|
|
@@ -37,14 +42,15 @@ Edits::Levenshtein.distance "acer", "earn"
|
|
37
42
|
# Max distance
|
38
43
|
Edits::Levenshtein.distance_with_max "iota", "atom", 2
|
39
44
|
# => 2
|
40
|
-
Edits::Levenshtein.most_similar "atom", %w[
|
41
|
-
# => "
|
45
|
+
Edits::Levenshtein.most_similar "atom", %w[tree rota toes racer]
|
46
|
+
# => "toes"
|
42
47
|
```
|
43
48
|
|
44
49
|
### Restricted Edit (Optimal Alignment)
|
45
50
|
|
46
|
-
Edit distance, accounting for deletion, addition, substitution and
|
47
|
-
characters.
|
51
|
+
Edit distance, accounting for deletion, addition, substitution and
|
52
|
+
transposition (two adjacent characters are swapped). This variant is
|
53
|
+
restricted by the condition that no sub-string is edited more than once.
|
48
54
|
|
49
55
|
```ruby
|
50
56
|
Edits::RestrictedEdit.distance "raked", "bakers"
|
@@ -53,12 +59,18 @@ Edits::RestrictedEdit.distance "iota", "atom"
|
|
53
59
|
# => 3
|
54
60
|
Edits::RestrictedEdit.distance "acer", "earn"
|
55
61
|
# => 4
|
62
|
+
|
63
|
+
# Max distance
|
64
|
+
Edits::RestrictedEdit.distance_with_max "iota", "atom", 2
|
65
|
+
# => 2
|
66
|
+
Edits::RestrictedEdit.most_similar "atom", %w[tree rota toes racer]
|
67
|
+
# => "rota"
|
56
68
|
```
|
57
69
|
|
58
70
|
### Damerau-Levenshtein
|
59
71
|
|
60
72
|
Edit distance, accounting for deletions, additions, substitution and
|
61
|
-
transposition.
|
73
|
+
transposition (two adjacent characters are swapped).
|
62
74
|
|
63
75
|
```ruby
|
64
76
|
Edits::DamerauLevenshtein.distance "raked", "bakers"
|
data/lib/edits.rb
CHANGED
data/lib/edits/compare.rb
CHANGED
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Edits
|
4
|
+
# Comparison helpers
|
5
|
+
module Compare
|
6
|
+
# Given a prototype string and an array of strings, determines which
|
7
|
+
# string is most similar to the prototype.
|
8
|
+
#
|
9
|
+
# `most_similar("foo", strings)` is functionally equivalent to
|
10
|
+
# `strings.min_by { |s| distance("foo", s) }`, leveraging
|
11
|
+
# {.distance_with_max}.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# most_similar("atom", %w[tram atlas rota racer])
|
15
|
+
# # => "tram"
|
16
|
+
# @param prototype [String]
|
17
|
+
# @param strings [<String>]
|
18
|
+
# @return [String, nil] most similar string, or nil for empty array
|
19
|
+
def most_similar(prototype, strings)
|
20
|
+
return nil if strings.empty?
|
21
|
+
min_s = strings[0]
|
22
|
+
min_d = distance(prototype, min_s)
|
23
|
+
|
24
|
+
strings[1..-1].each do |s|
|
25
|
+
return min_s if min_d.zero?
|
26
|
+
d = distance_with_max(prototype, s, min_d)
|
27
|
+
if d < min_d
|
28
|
+
min_d = d
|
29
|
+
min_s = s
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
min_s
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/lib/edits/damerau_levenshtein.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Edits
|
4
|
-
#
|
4
|
+
# Implements the Damerau/Levenshtein distance algorithm.
|
5
5
|
#
|
6
6
|
# Determines distance between two strings by counting edits, identifying:
|
7
7
|
# * Insertion
|
8
8
|
# * Deletion
|
9
9
|
# * Substitution
|
10
|
-
# *
|
10
|
+
# * Adjacent transposition
|
11
11
|
module DamerauLevenshtein
|
12
12
|
# Calculate the Damerau/Levenshtein distance of two sequences.
|
13
13
|
#
|
@@ -18,13 +18,9 @@ module Edits
|
|
18
18
|
# @param seq2 [String, Array]
|
19
19
|
# @return [Integer]
|
20
20
|
def self.distance(seq1, seq2)
|
21
|
-
if seq1.length > seq2.length
|
22
|
-
temp = seq1
|
23
|
-
seq1 = seq2
|
24
|
-
seq2 = temp
|
25
|
-
end
|
21
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
26
22
|
|
27
|
-
# array of
|
23
|
+
# array of codepoints outperforms String
|
28
24
|
seq1 = seq1.codepoints if seq1.is_a? String
|
29
25
|
seq2 = seq2.codepoints if seq2.is_a? String
|
30
26
|
|
@@ -34,7 +30,7 @@ module Edits
|
|
34
30
|
return rows if cols.zero?
|
35
31
|
|
36
32
|
# 'infinite' edit distance for padding cost matrix.
|
37
|
-
# Can be any value
|
33
|
+
# Can be any value > max[rows, cols]
|
38
34
|
inf = rows + cols
|
39
35
|
|
40
36
|
# Initialize first two rows of cost matrix.
|
@@ -71,14 +67,14 @@ module Edits
|
|
71
67
|
|
72
68
|
# TODO: do insertion/deletion need to be considered when
|
73
69
|
# seq1_item == seq2_item ?
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
70
|
+
#
|
71
|
+
# substitution, deletion, insertion, transposition
|
72
|
+
cost = [
|
73
|
+
matrix[row][col] + sub_cost,
|
74
|
+
matrix[row][col + 1] + 1,
|
75
|
+
matrix[row + 1][col] + 1,
|
76
|
+
transposition
|
77
|
+
].min
|
82
78
|
|
83
79
|
matrix[row + 1][col + 1] = cost
|
84
80
|
|
data/lib/edits/hamming.rb
CHANGED
data/lib/edits/jaro.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Edits
|
4
|
+
# Implements Jaro similarity algorithm.
|
5
|
+
#
|
4
6
|
# @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
|
5
7
|
module Jaro
|
6
8
|
# Calculate Jaro similarity
|
@@ -14,7 +16,7 @@ module Edits
|
|
14
16
|
# # => 0.9023569023569024
|
15
17
|
# @param seq1 [String, Array]
|
16
18
|
# @param seq2 [String, Array]
|
17
|
-
# @return [Float]
|
19
|
+
# @return [Float] similarity, between 0.0 (none) and 1.0 (identical)
|
18
20
|
def self.similarity(seq1, seq2)
|
19
21
|
return 1.0 if seq1 == seq2
|
20
22
|
return 0.0 if seq1.empty? || seq2.empty?
|
@@ -26,16 +28,18 @@ module Edits
|
|
26
28
|
return 0.0 if m.zero?
|
27
29
|
|
28
30
|
m = m.to_f
|
29
|
-
(
|
31
|
+
((m / seq1.length) + (m / seq2.length) + ((m - t) / m)) / 3
|
30
32
|
end
|
31
33
|
|
32
34
|
# Calculate Jaro distance
|
33
35
|
#
|
36
|
+
# `Dj = 1 - Sj`
|
37
|
+
#
|
34
38
|
# @example
|
35
39
|
# Edits::Jaro.distance("information", "informant")
|
36
40
|
# # => 0.09764309764309764
|
37
41
|
# @param (see #distance)
|
38
|
-
# @return [Float]
|
42
|
+
# @return [Float] distance, between 0.0 (identical) and 1.0 (distant)
|
39
43
|
def self.distance(str1, str2)
|
40
44
|
1.0 - similarity(str1, str2)
|
41
45
|
end
|
@@ -45,11 +49,7 @@ module Edits
|
|
45
49
|
# @param (see #distance)
|
46
50
|
# @return [(Integer, Integer)] matches and transpositions
|
47
51
|
def self.jaro_matches(seq1, seq2)
|
48
|
-
if seq1.length > seq2.length
|
49
|
-
temp = seq1
|
50
|
-
seq1 = seq2
|
51
|
-
seq2 = temp
|
52
|
-
end
|
52
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
53
53
|
|
54
54
|
# search range: (max(|A|, |B|) / 2) - 1
|
55
55
|
range = (seq2.length / 2) - 1
|
data/lib/edits/jaro_winkler.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Edits
|
4
|
+
# Implements Jaro-Winkler similarity algorithm.
|
5
|
+
#
|
4
6
|
# @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
|
5
7
|
module JaroWinkler
|
6
8
|
# Prefix scaling factor for jaro-winkler metric. Default is 0.1
|
@@ -13,9 +15,9 @@ module Edits
|
|
13
15
|
|
14
16
|
# Calculate Jaro-Winkler similarity of given strings
|
15
17
|
#
|
16
|
-
# Adds weight to Jaro
|
18
|
+
# Adds weight to Jaro similarity according to the length of a common prefix
|
17
19
|
# of up to 4 letters, where one exists. The additional weighting is only
|
18
|
-
# applied when the original
|
20
|
+
# applied when the original similarity passes a threshold.
|
19
21
|
#
|
20
22
|
# `Sw = Sj + (l * p * (1 - Sj))`
|
21
23
|
#
|
@@ -29,7 +31,7 @@ module Edits
|
|
29
31
|
# @param seq2 [String, Array]
|
30
32
|
# @param threshold [Float] threshold for applying Winkler prefix weighting
|
31
33
|
# @param weight [Float] weighting for common prefix, should not exceed 0.25
|
32
|
-
# @return [Float]
|
34
|
+
# @return [Float] similarity, between 0.0 (none) and 1.0 (identical)
|
33
35
|
def self.similarity(
|
34
36
|
seq1, seq2,
|
35
37
|
threshold: WINKLER_THRESHOLD,
|
@@ -60,7 +62,7 @@ module Edits
|
|
60
62
|
# Edits::JaroWinkler.distance("information", "informant")
|
61
63
|
# # => 0.05858585858585863
|
62
64
|
# @param (see #distance)
|
63
|
-
# @return [Float]
|
65
|
+
# @return [Float] distance, between 0.0 (identical) and 1.0 (distant)
|
64
66
|
def self.distance(
|
65
67
|
seq1, seq2,
|
66
68
|
threshold: WINKLER_THRESHOLD,
|
data/lib/edits/levenshtein.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Edits
|
4
|
-
#
|
4
|
+
# Implements Levenshtein distance algorithm.
|
5
5
|
#
|
6
6
|
# Determines distance between two strings by counting edits, identifying:
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
7
|
+
# * Insertion
|
8
|
+
# * Deletion
|
9
|
+
# * Substitution
|
10
10
|
module Levenshtein
|
11
|
+
extend Compare
|
12
|
+
|
11
13
|
# Calculate the Levenshtein (edit) distance of two sequences.
|
12
14
|
#
|
13
15
|
# @note A true distance metric, satisfies triangle inequality.
|
@@ -18,13 +20,9 @@ module Edits
|
|
18
20
|
# @param seq2 [String, Array]
|
19
21
|
# @return [Integer]
|
20
22
|
def self.distance(seq1, seq2)
|
21
|
-
if seq1.length > seq2.length
|
22
|
-
temp = seq1
|
23
|
-
seq1 = seq2
|
24
|
-
seq2 = temp
|
25
|
-
end
|
23
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
26
24
|
|
27
|
-
# array of
|
25
|
+
# array of codepoints outperforms String
|
28
26
|
seq1 = seq1.codepoints if seq1.is_a? String
|
29
27
|
seq2 = seq2.codepoints if seq2.is_a? String
|
30
28
|
|
@@ -41,24 +39,25 @@ module Edits
|
|
41
39
|
last_row = 0.upto(cols).to_a
|
42
40
|
|
43
41
|
rows.times do |row|
|
44
|
-
|
45
|
-
|
42
|
+
prev_col_cost = row + 1
|
46
43
|
seq1_item = seq1[row]
|
47
44
|
|
48
45
|
cols.times do |col|
|
49
|
-
|
50
|
-
|
51
|
-
substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)
|
52
|
-
|
46
|
+
# | Xs | Xd |
|
47
|
+
# | Xi | ? |
|
53
48
|
# step cost is min of operation costs
|
54
|
-
|
55
|
-
cost =
|
49
|
+
# substitution, deletion, insertion
|
50
|
+
cost = [
|
51
|
+
last_row[col] + (seq1_item == seq2[col] ? 0 : 1),
|
52
|
+
last_row[col + 1] + 1,
|
53
|
+
prev_col_cost + 1
|
54
|
+
].min
|
56
55
|
|
57
56
|
# overwrite previous row as we progress
|
58
|
-
last_row[col] =
|
59
|
-
|
57
|
+
last_row[col] = prev_col_cost
|
58
|
+
prev_col_cost = cost
|
60
59
|
end
|
61
|
-
last_row[cols] =
|
60
|
+
last_row[cols] = prev_col_cost
|
62
61
|
end
|
63
62
|
|
64
63
|
last_row[cols]
|
@@ -77,85 +76,56 @@ module Edits
|
|
77
76
|
# @param max [Integer] maximum distance
|
78
77
|
# @return [Integer]
|
79
78
|
def self.distance_with_max(seq1, seq2, max)
|
80
|
-
if seq1.length > seq2.length
|
81
|
-
temp = seq1
|
82
|
-
seq1 = seq2
|
83
|
-
seq2 = temp
|
84
|
-
end
|
79
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
85
80
|
|
86
81
|
rows = seq1.length
|
87
82
|
cols = seq2.length
|
88
|
-
return cols if rows.zero?
|
89
|
-
return rows if cols.zero?
|
90
|
-
return max if (
|
83
|
+
return cols > max ? max : cols if rows.zero?
|
84
|
+
return rows > max ? max : rows if cols.zero?
|
85
|
+
return max if (cols - rows) >= max
|
91
86
|
|
87
|
+
# array of codepoints outperforms String
|
92
88
|
seq1 = seq1.codepoints if seq1.is_a? String
|
93
89
|
seq2 = seq2.codepoints if seq2.is_a? String
|
94
90
|
|
91
|
+
# 'infinite' edit distance for padding cost matrix.
|
92
|
+
# Can be any value > max[rows, cols]
|
93
|
+
inf = cols + 1
|
94
|
+
|
95
|
+
# retain previous row of cost matrix
|
95
96
|
last_row = 0.upto(cols).to_a
|
96
97
|
|
97
98
|
rows.times do |row|
|
98
|
-
|
99
|
-
seq1_item = seq1[row]
|
100
|
-
|
99
|
+
# Ukkonen cut-off
|
101
100
|
min_col = row > max ? row - max : 0
|
102
101
|
max_col = row + max
|
103
102
|
max_col = cols - 1 if max_col > cols - 1
|
103
|
+
|
104
|
+
prev_col_cost = min_col.zero? ? row + 1 : inf
|
105
|
+
seq1_item = seq1[row]
|
104
106
|
diagonal = cols - rows + row
|
105
107
|
|
106
|
-
|
108
|
+
min_col.upto(max_col) do |col|
|
107
109
|
return max if diagonal == col && last_row[col] >= max
|
108
|
-
col_cost =
|
109
|
-
if col < min_col || col > max_col
|
110
|
-
max + 1
|
111
|
-
else
|
112
|
-
# step cost is min of operation costs
|
113
|
-
deletion = last_row[col + 1] + 1
|
114
|
-
insertion = last_col_cost + 1
|
115
|
-
substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)
|
116
|
-
|
117
|
-
cost = deletion < insertion ? deletion : insertion
|
118
|
-
substitution < cost ? substitution : cost
|
119
|
-
end
|
120
|
-
|
121
|
-
last_row[col] = last_col_cost
|
122
|
-
last_col_cost = col_cost
|
123
|
-
end
|
124
|
-
|
125
|
-
last_row[cols] = last_col_cost
|
126
|
-
end
|
127
110
|
|
128
|
-
|
129
|
-
|
111
|
+
# | Xs | Xd |
|
112
|
+
# | Xi | ? |
|
113
|
+
# substitution, deletion, insertion
|
114
|
+
cost = [
|
115
|
+
last_row[col] + (seq1_item == seq2[col] ? 0 : 1),
|
116
|
+
last_row[col + 1] + 1,
|
117
|
+
prev_col_cost + 1
|
118
|
+
].min
|
130
119
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
# `Levenshtein.most_similar("foo", strings)` is functionally equivalent to
|
135
|
-
# `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging
|
136
|
-
# {.distance_with_max}.
|
137
|
-
#
|
138
|
-
# @example
|
139
|
-
# Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer])
|
140
|
-
# # => "atlas"
|
141
|
-
# @param prototype [String]
|
142
|
-
# @param strings [<String>]
|
143
|
-
# @return [String, nil] most similar string, or nil for empty array
|
144
|
-
def self.most_similar(prototype, strings)
|
145
|
-
return nil if strings.empty?
|
146
|
-
min_s = strings[0]
|
147
|
-
min_d = distance(prototype, min_s)
|
148
|
-
|
149
|
-
strings[1..-1].each do |s|
|
150
|
-
return min_s if min_d.zero?
|
151
|
-
d = distance_with_max(prototype, s, min_d)
|
152
|
-
if d < min_d
|
153
|
-
min_d = d
|
154
|
-
min_s = s
|
120
|
+
# overwrite previous row as we progress
|
121
|
+
last_row[col] = prev_col_cost
|
122
|
+
prev_col_cost = cost
|
155
123
|
end
|
124
|
+
|
125
|
+
last_row[cols] = prev_col_cost
|
156
126
|
end
|
157
127
|
|
158
|
-
|
128
|
+
last_row[cols] > max ? max : last_row[cols]
|
159
129
|
end
|
160
130
|
end
|
161
131
|
end
|
data/lib/edits/restricted_edit.rb
CHANGED
@@ -8,8 +8,13 @@ module Edits
|
|
8
8
|
# * Insertion
|
9
9
|
# * Deletion
|
10
10
|
# * Substitution
|
11
|
-
# *
|
11
|
+
# * Adjacent transposition
|
12
|
+
#
|
13
|
+
# This variant is restricted by the condition that no sub-string is edited
|
14
|
+
# more than once.
|
12
15
|
module RestrictedEdit
|
16
|
+
extend Compare
|
17
|
+
|
13
18
|
# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
|
14
19
|
# of two sequences.
|
15
20
|
#
|
@@ -21,13 +26,9 @@ module Edits
|
|
21
26
|
# @param seq2 [String, Array]
|
22
27
|
# @return [Integer]
|
23
28
|
def self.distance(seq1, seq2)
|
24
|
-
if seq1.length > seq2.length
|
25
|
-
temp = seq1
|
26
|
-
seq1 = seq2
|
27
|
-
seq2 = temp
|
28
|
-
end
|
29
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
29
30
|
|
30
|
-
# array of
|
31
|
+
# array of codepoints outperforms String
|
31
32
|
seq1 = seq1.codepoints if seq1.is_a? String
|
32
33
|
seq2 = seq2.codepoints if seq2.is_a? String
|
33
34
|
|
@@ -36,9 +37,10 @@ module Edits
|
|
36
37
|
return cols if rows.zero?
|
37
38
|
return rows if cols.zero?
|
38
39
|
|
39
|
-
# previous two rows of cost matrix
|
40
|
+
# retain previous two rows of cost matrix
|
40
41
|
lastlast_row = []
|
41
42
|
last_row = []
|
43
|
+
|
42
44
|
# Initialize first row of cost matrix.
|
43
45
|
# The full initial state where cols=3, rows=2 would be:
|
44
46
|
# [[0, 1, 2, 3],
|
@@ -47,29 +49,29 @@ module Edits
|
|
47
49
|
curr_row = 0.upto(cols).to_a
|
48
50
|
|
49
51
|
rows.times do |row|
|
50
|
-
|
51
|
-
last_row = curr_row
|
52
|
+
# rotate row arrays
|
53
|
+
curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row
|
52
54
|
|
53
|
-
# generate next row of cost matrix
|
54
|
-
curr_row = Array.new(cols + 1, 0)
|
55
55
|
curr_row[0] = row + 1
|
56
|
-
|
57
|
-
curr_item = seq1[row]
|
56
|
+
seq1_item = seq1[row]
|
58
57
|
|
59
58
|
cols.times do |col|
|
60
|
-
sub_cost =
|
61
|
-
is_swap = sub_cost
|
59
|
+
sub_cost = seq1_item == seq2[col] ? 0 : 1
|
60
|
+
is_swap = sub_cost.positive? &&
|
62
61
|
row.positive? && col.positive? &&
|
63
|
-
|
62
|
+
seq1_item == seq2[col - 1] &&
|
64
63
|
seq1[row - 1] == seq2[col]
|
65
64
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
65
|
+
# | Xt | | |
|
66
|
+
# | | Xs | Xd |
|
67
|
+
# | | Xi | ? |
|
70
68
|
# step cost is min of operation costs
|
71
|
-
|
72
|
-
cost =
|
69
|
+
# substitution, deletion, insertion, transposition
|
70
|
+
cost = [
|
71
|
+
last_row[col] + sub_cost,
|
72
|
+
last_row[col + 1] + 1,
|
73
|
+
curr_row[col] + 1
|
74
|
+
].min
|
73
75
|
|
74
76
|
if is_swap
|
75
77
|
swap = lastlast_row[col - 1] + 1
|
@@ -82,5 +84,84 @@ module Edits
|
|
82
84
|
|
83
85
|
curr_row[cols]
|
84
86
|
end
|
87
|
+
|
88
|
+
# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
|
89
|
+
# of two sequences, bounded by a maximum value.
|
90
|
+
#
|
91
|
+
# @example
|
92
|
+
# Edits::RestrictedEdit.distance("cloud", "crayon")
|
93
|
+
# # => 5
|
94
|
+
# Edits::RestrictedEdit.distance_with_max("cloud", "crayon", 2)
|
95
|
+
# # => 2
|
96
|
+
# @param seq1 [String, Array]
|
97
|
+
# @param seq2 [String, Array]
|
98
|
+
# @param max [Integer] maximum distance
|
99
|
+
# @return [Integer]
|
100
|
+
def self.distance_with_max(seq1, seq2, max)
|
101
|
+
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
102
|
+
|
103
|
+
rows = seq1.length
|
104
|
+
cols = seq2.length
|
105
|
+
return cols > max ? max : cols if rows.zero?
|
106
|
+
return rows > max ? max : rows if cols.zero?
|
107
|
+
return max if (cols - rows) >= max
|
108
|
+
|
109
|
+
# array of codepoints outperforms String
|
110
|
+
seq1 = seq1.codepoints if seq1.is_a? String
|
111
|
+
seq2 = seq2.codepoints if seq2.is_a? String
|
112
|
+
|
113
|
+
# 'infinite' edit distance for padding cost matrix.
|
114
|
+
# Can be any value > max[rows, cols]
|
115
|
+
inf = cols + 1
|
116
|
+
|
117
|
+
# retain previous two rows of cost matrix,
|
118
|
+
# padded with "inf" as matrix is not fully evaluated
|
119
|
+
lastlast_row = Array.new(inf, inf)
|
120
|
+
last_row = Array.new(inf, inf)
|
121
|
+
curr_row = 0.upto(cols).to_a
|
122
|
+
|
123
|
+
rows.times do |row|
|
124
|
+
# rotate row arrays
|
125
|
+
curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row
|
126
|
+
|
127
|
+
# Ukkonen cut-off
|
128
|
+
min_col = row > max ? row - max : 0
|
129
|
+
max_col = row + max
|
130
|
+
max_col = cols - 1 if max_col > cols - 1
|
131
|
+
|
132
|
+
curr_row[min_col] = min_col.zero? ? row + 1 : inf
|
133
|
+
seq1_item = seq1[row]
|
134
|
+
diagonal = cols - rows + row
|
135
|
+
|
136
|
+
min_col.upto(max_col) do |col|
|
137
|
+
return max if diagonal == col && last_row[col] >= max
|
138
|
+
|
139
|
+
sub_cost = seq1_item == seq2[col] ? 0 : 1
|
140
|
+
is_swap = sub_cost.positive? &&
|
141
|
+
row.positive? && col.positive? &&
|
142
|
+
seq1_item == seq2[col - 1] &&
|
143
|
+
seq1[row - 1] == seq2[col]
|
144
|
+
|
145
|
+
# | Xt | | |
|
146
|
+
# | | Xs | Xd |
|
147
|
+
# | | Xi | ? |
|
148
|
+
# substitution, deletion, insertion, transposition
|
149
|
+
cost = [
|
150
|
+
last_row[col] + sub_cost,
|
151
|
+
last_row[col + 1] + 1,
|
152
|
+
curr_row[col] + 1
|
153
|
+
].min
|
154
|
+
|
155
|
+
if is_swap
|
156
|
+
swap = lastlast_row[col - 1] + 1
|
157
|
+
cost = swap if swap < cost
|
158
|
+
end
|
159
|
+
|
160
|
+
curr_row[col + 1] = cost
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
curr_row[cols] > max ? max : curr_row[cols]
|
165
|
+
end
|
85
166
|
end
|
86
167
|
end
|
data/lib/edits/version.rb
CHANGED
data/tasks/benchmark/levenshtein.rake
CHANGED
@@ -5,7 +5,7 @@ require "benchmark/ips"
|
|
5
5
|
require "edits"
|
6
6
|
|
7
7
|
namespace :benchmark do
|
8
|
-
desc "distance vs. distance_with_max (x100)"
|
8
|
+
desc "levenshtein distance vs. distance_with_max (x100)"
|
9
9
|
task :lev_max do
|
10
10
|
words = File.read("/usr/share/dict/words")
|
11
11
|
.split(/\n/).compact.shuffle(random: Random.new(1))
|
@@ -64,6 +64,65 @@ namespace :benchmark do
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
|
+
desc "restricted distance vs. distance_with_max (x100)"
|
68
|
+
task :restricted_max do
|
69
|
+
words = File.read("/usr/share/dict/words")
|
70
|
+
.split(/\n/).compact.shuffle(random: Random.new(1))
|
71
|
+
.take(101)
|
72
|
+
|
73
|
+
Benchmark.ips do |x|
|
74
|
+
x.report("distance") do
|
75
|
+
words.each_cons(2) do |a, b|
|
76
|
+
Edits::RestrictedEdit.distance a, b
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
x.report("with max 1") do
|
81
|
+
words.each_cons(2) do |a, b|
|
82
|
+
Edits::RestrictedEdit.distance_with_max a, b, 1
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
x.report("with max 2") do
|
87
|
+
words.each_cons(2) do |a, b|
|
88
|
+
Edits::RestrictedEdit.distance_with_max a, b, 2
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
x.report("with max 3") do
|
93
|
+
words.each_cons(2) do |a, b|
|
94
|
+
Edits::RestrictedEdit.distance_with_max a, b, 3
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
x.report("with max 4") do
|
99
|
+
words.each_cons(2) do |a, b|
|
100
|
+
Edits::RestrictedEdit.distance_with_max a, b, 4
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
x.report("with max 6") do
|
105
|
+
words.each_cons(2) do |a, b|
|
106
|
+
Edits::RestrictedEdit.distance_with_max a, b, 6
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
x.report("with max 8") do
|
111
|
+
words.each_cons(2) do |a, b|
|
112
|
+
Edits::RestrictedEdit.distance_with_max a, b, 8
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
x.report("with max 50") do
|
117
|
+
words.each_cons(2) do |a, b|
|
118
|
+
Edits::RestrictedEdit.distance_with_max a, b, 100
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
x.compare!
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
67
126
|
desc "most_similar vs. min_by (100 words)"
|
68
127
|
task :lev_similar do
|
69
128
|
words = File.read("/usr/share/dict/words")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: edits
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Crouch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -114,6 +114,7 @@ files:
|
|
114
114
|
- bin/setup
|
115
115
|
- edits.gemspec
|
116
116
|
- lib/edits.rb
|
117
|
+
- lib/edits/compare.rb
|
117
118
|
- lib/edits/damerau_levenshtein.rb
|
118
119
|
- lib/edits/hamming.rb
|
119
120
|
- lib/edits/jaro.rb
|
@@ -143,7 +144,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
143
144
|
version: '0'
|
144
145
|
requirements: []
|
145
146
|
rubyforge_project:
|
146
|
-
rubygems_version: 2.6.
|
147
|
+
rubygems_version: 2.6.13
|
147
148
|
signing_key:
|
148
149
|
specification_version: 4
|
149
150
|
summary: A collection of edit distance algorithms.
|