edits 0.3.0 → 0.4.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c829d27a322f8c9b3ad50437a3b66d2548ccfc40d435ebaee4c362b22802fb8
4
- data.tar.gz: dd351b715125154bd7f9b37b98fd1f75aea504163fe550795c0615af1fea9b65
3
+ metadata.gz: d51ead5c11a725d7c2efcdd8030098c67e20c765ae1c1d4a17843bb4f993f781
4
+ data.tar.gz: dcec6dfc9b48f29050989edf108e9d930b49c66d73dbd29800466adf307b68eb
5
5
  SHA512:
6
- metadata.gz: 8d5b3cd301621b913768f63c50f4876a340f620c22c2fbe68d1659bd1566141f6a737264dfb3da15f0b7a7e1a567a1036dcd978cd9c2001210f843a1e5da8f93
7
- data.tar.gz: 821c4cd4acf9e2b9e25c21ce2f61a4f489838c4252fd6c4e0fd323fe9b5f3baaa1948e3ab87b0c1448830e39c9ff2362aa457e4fe9ce902994545dd8c986137e
6
+ metadata.gz: b7b82962d4b3fbbf84fa73d79052f449a7016529f5dd3b82960c1a9898df066f566ea3b768eec218500c65cdd55b57b34eb729f142a6fc2a0eced43415e35ef4
7
+ data.tar.gz: 0a0c8e0f1593847a74e25c585c2c4265b4790a641b6ce5d9cebb5c9fda0e4d971313e03efb314e937c29eeef4b168459eb1e8c6f75fd22348635110f8467b07f
@@ -21,70 +21,66 @@ module Edits
21
21
  seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
22
22
 
23
23
  # array of codepoints outperforms String
24
- seq1 = seq1.codepoints if seq1.is_a? String
25
- seq2 = seq2.codepoints if seq2.is_a? String
24
+ if seq1.is_a?(String) && seq2.is_a?(String)
25
+ seq1 = seq1.codepoints
26
+ seq2 = seq2.codepoints
27
+ end
26
28
 
27
29
  rows = seq1.length
28
30
  cols = seq2.length
29
31
  return cols if rows == 0
30
32
  return rows if cols == 0
31
33
 
32
- # 'infinite' edit distance for padding cost matrix.
33
- # Can be any value > max[rows, cols]
34
- inf = rows + cols
35
-
36
- # Initialize first two rows of cost matrix.
37
- # The full initial state where cols=3, rows=2 (inf=5) would be:
38
- # [[5, 5, 5, 5, 5],
39
- # [5, 0, 1, 2, 3],
40
- # [5, 1, 0, 0, 0],
41
- # [5, 2, 0, 0, 0]]
42
- matrix = [Array.new(cols + 2, inf)]
43
- matrix << 0.upto(cols).to_a.unshift(inf)
34
+ # 'infinite' edit distance to pad cost matrix.
35
+ # Any value > max[rows, cols]
36
+ inf = cols + 1
44
37
 
45
38
  # element => last row seen
46
- item_history = Hash.new(0)
39
+ row_history = Hash.new(0)
47
40
 
48
- 1.upto(rows) do |row|
49
- # generate next row of cost matrix
50
- new_row = Array.new(cols + 2, 0)
51
- new_row[0] = inf
52
- new_row[1] = row
53
- matrix << new_row
41
+ # initialize alphabet-keyed cost matrix
42
+ matrix = {}
43
+ curr_row = 0.upto(cols).to_a
54
44
 
55
- last_match_col = 0
56
- seq1_item = seq1[row - 1]
45
+ rows.times do |row|
46
+ seq1_item = seq1[row]
47
+ match_col = 0
57
48
 
58
- 1.upto(cols) do |col|
59
- seq2_item = seq2[col - 1]
60
- last_match_row = item_history[seq2_item]
49
+ # rotate row arrays & generate next
50
+ matrix[seq1_item] = last_row = curr_row
51
+ curr_row = Array.new(cols + 1, inf)
52
+ curr_row[0] = row + 1
61
53
 
54
+ cols.times do |col|
55
+ seq2_item = seq2[col]
62
56
  sub_cost = seq1_item == seq2_item ? 0 : 1
63
57
 
64
- transposition = 1 + matrix[last_match_row][last_match_col]
65
- transposition += row - last_match_row - 1
66
- transposition += col - last_match_col - 1
67
-
68
- # TODO: do insertion/deletion need to be considered when
69
- # seq1_item == seq2_item ?
70
- #
71
- # substitution, deletion, insertion, transposition
58
+ # | Xs | Xd |
59
+ # | Xi | ? |
60
+ # substitution, deletion, insertion
72
61
  cost = [
73
- matrix[row][col] + sub_cost,
74
- matrix[row][col + 1] + 1,
75
- matrix[row + 1][col] + 1,
76
- transposition
62
+ last_row[col] + sub_cost,
63
+ last_row[col + 1] + 1,
64
+ curr_row[col] + 1
77
65
  ].min
78
66
 
79
- matrix[row + 1][col + 1] = cost
80
-
81
- last_match_col = col if sub_cost == 0
67
+ # transposition cost
68
+ # skip missed matrix lookup (inf cost)
69
+ if sub_cost > 0 && row > 0 && (m = matrix[seq2_item])
70
+ transpose = 1 + m[match_col] \
71
+ + (row - row_history[seq2_item] - 1) \
72
+ + (col - match_col - 1)
73
+ cost = transpose if transpose < cost
74
+ end
75
+
76
+ match_col = col if sub_cost == 0
77
+ curr_row[col + 1] = cost
82
78
  end
83
79
 
84
- item_history[seq1_item] = row
80
+ row_history[seq1_item] = row
85
81
  end
86
82
 
87
- matrix[rows + 1][cols + 1]
83
+ curr_row[cols]
88
84
  end
89
85
  end
90
86
  end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Edits
4
4
  # Current gem version
5
- VERSION = "0.3.0"
5
+ VERSION = "0.4.0"
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: edits
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tom Crouch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-05-21 00:00:00.000000000 Z
11
+ date: 2020-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips
@@ -174,8 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
174
174
  - !ruby/object:Gem::Version
175
175
  version: '0'
176
176
  requirements: []
177
- rubyforge_project:
178
- rubygems_version: 2.7.7
177
+ rubygems_version: 3.0.8
179
178
  signing_key:
180
179
  specification_version: 4
181
180
  summary: A collection of edit distance algorithms.