edits 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/edits/damerau_levenshtein.rb +39 -43
- data/lib/edits/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d51ead5c11a725d7c2efcdd8030098c67e20c765ae1c1d4a17843bb4f993f781
|
4
|
+
data.tar.gz: dcec6dfc9b48f29050989edf108e9d930b49c66d73dbd29800466adf307b68eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7b82962d4b3fbbf84fa73d79052f449a7016529f5dd3b82960c1a9898df066f566ea3b768eec218500c65cdd55b57b34eb729f142a6fc2a0eced43415e35ef4
|
7
|
+
data.tar.gz: 0a0c8e0f1593847a74e25c585c2c4265b4790a641b6ce5d9cebb5c9fda0e4d971313e03efb314e937c29eeef4b168459eb1e8c6f75fd22348635110f8467b07f
|
@@ -21,70 +21,66 @@ module Edits
|
|
21
21
|
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
|
22
22
|
|
23
23
|
# array of codepoints outperforms String
|
24
|
-
|
25
|
-
|
24
|
+
if seq1.is_a?(String) && seq2.is_a?(String)
|
25
|
+
seq1 = seq1.codepoints
|
26
|
+
seq2 = seq2.codepoints
|
27
|
+
end
|
26
28
|
|
27
29
|
rows = seq1.length
|
28
30
|
cols = seq2.length
|
29
31
|
return cols if rows == 0
|
30
32
|
return rows if cols == 0
|
31
33
|
|
32
|
-
# 'infinite' edit distance
|
33
|
-
#
|
34
|
-
inf =
|
35
|
-
|
36
|
-
# Initialize first two rows of cost matrix.
|
37
|
-
# The full initial state where cols=3, rows=2 (inf=5) would be:
|
38
|
-
# [[5, 5, 5, 5, 5],
|
39
|
-
# [5, 0, 1, 2, 3],
|
40
|
-
# [5, 1, 0, 0, 0],
|
41
|
-
# [5, 2, 0, 0, 0]]
|
42
|
-
matrix = [Array.new(cols + 2, inf)]
|
43
|
-
matrix << 0.upto(cols).to_a.unshift(inf)
|
34
|
+
# 'infinite' edit distance to pad cost matrix.
|
35
|
+
# Any value > max[rows, cols]
|
36
|
+
inf = cols + 1
|
44
37
|
|
45
38
|
# element => last row seen
|
46
|
-
|
39
|
+
row_history = Hash.new(0)
|
47
40
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
new_row[0] = inf
|
52
|
-
new_row[1] = row
|
53
|
-
matrix << new_row
|
41
|
+
# initialize alphabet-keyed cost matrix
|
42
|
+
matrix = {}
|
43
|
+
curr_row = 0.upto(cols).to_a
|
54
44
|
|
55
|
-
|
56
|
-
seq1_item = seq1[row
|
45
|
+
rows.times do |row|
|
46
|
+
seq1_item = seq1[row]
|
47
|
+
match_col = 0
|
57
48
|
|
58
|
-
|
59
|
-
|
60
|
-
|
49
|
+
# rotate row arrays & generate next
|
50
|
+
matrix[seq1_item] = last_row = curr_row
|
51
|
+
curr_row = Array.new(cols + 1, inf)
|
52
|
+
curr_row[0] = row + 1
|
61
53
|
|
54
|
+
cols.times do |col|
|
55
|
+
seq2_item = seq2[col]
|
62
56
|
sub_cost = seq1_item == seq2_item ? 0 : 1
|
63
57
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
# TODO: do insertion/deletion need to be considered when
|
69
|
-
# seq1_item == seq2_item ?
|
70
|
-
#
|
71
|
-
# substitution, deletion, insertion, transposition
|
58
|
+
# | Xs | Xd |
|
59
|
+
# | Xi | ? |
|
60
|
+
# substitution, deletion, insertion
|
72
61
|
cost = [
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
transposition
|
62
|
+
last_row[col] + sub_cost,
|
63
|
+
last_row[col + 1] + 1,
|
64
|
+
curr_row[col] + 1
|
77
65
|
].min
|
78
66
|
|
79
|
-
|
80
|
-
|
81
|
-
|
67
|
+
# transposition cost
|
68
|
+
# skip missed matrix lookup (inf cost)
|
69
|
+
if sub_cost > 0 && row > 0 && (m = matrix[seq2_item])
|
70
|
+
transpose = 1 + m[match_col] \
|
71
|
+
+ (row - row_history[seq2_item] - 1) \
|
72
|
+
+ (col - match_col - 1)
|
73
|
+
cost = transpose if transpose < cost
|
74
|
+
end
|
75
|
+
|
76
|
+
match_col = col if sub_cost == 0
|
77
|
+
curr_row[col + 1] = cost
|
82
78
|
end
|
83
79
|
|
84
|
-
|
80
|
+
row_history[seq1_item] = row
|
85
81
|
end
|
86
82
|
|
87
|
-
|
83
|
+
curr_row[cols]
|
88
84
|
end
|
89
85
|
end
|
90
86
|
end
|
data/lib/edits/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: edits
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Crouch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|
@@ -174,8 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
174
174
|
- !ruby/object:Gem::Version
|
175
175
|
version: '0'
|
176
176
|
requirements: []
|
177
|
-
|
178
|
-
rubygems_version: 2.7.7
|
177
|
+
rubygems_version: 3.0.8
|
179
178
|
signing_key:
|
180
179
|
specification_version: 4
|
181
180
|
summary: A collection of edit distance algorithms.
|