edits 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c829d27a322f8c9b3ad50437a3b66d2548ccfc40d435ebaee4c362b22802fb8
4
- data.tar.gz: dd351b715125154bd7f9b37b98fd1f75aea504163fe550795c0615af1fea9b65
3
+ metadata.gz: d51ead5c11a725d7c2efcdd8030098c67e20c765ae1c1d4a17843bb4f993f781
4
+ data.tar.gz: dcec6dfc9b48f29050989edf108e9d930b49c66d73dbd29800466adf307b68eb
5
5
  SHA512:
6
- metadata.gz: 8d5b3cd301621b913768f63c50f4876a340f620c22c2fbe68d1659bd1566141f6a737264dfb3da15f0b7a7e1a567a1036dcd978cd9c2001210f843a1e5da8f93
7
- data.tar.gz: 821c4cd4acf9e2b9e25c21ce2f61a4f489838c4252fd6c4e0fd323fe9b5f3baaa1948e3ab87b0c1448830e39c9ff2362aa457e4fe9ce902994545dd8c986137e
6
+ metadata.gz: b7b82962d4b3fbbf84fa73d79052f449a7016529f5dd3b82960c1a9898df066f566ea3b768eec218500c65cdd55b57b34eb729f142a6fc2a0eced43415e35ef4
7
+ data.tar.gz: 0a0c8e0f1593847a74e25c585c2c4265b4790a641b6ce5d9cebb5c9fda0e4d971313e03efb314e937c29eeef4b168459eb1e8c6f75fd22348635110f8467b07f
@@ -21,70 +21,66 @@ module Edits
21
21
  seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
22
22
 
23
23
  # array of codepoints outperforms String
24
- seq1 = seq1.codepoints if seq1.is_a? String
25
- seq2 = seq2.codepoints if seq2.is_a? String
24
+ if seq1.is_a?(String) && seq2.is_a?(String)
25
+ seq1 = seq1.codepoints
26
+ seq2 = seq2.codepoints
27
+ end
26
28
 
27
29
  rows = seq1.length
28
30
  cols = seq2.length
29
31
  return cols if rows == 0
30
32
  return rows if cols == 0
31
33
 
32
- # 'infinite' edit distance for padding cost matrix.
33
- # Can be any value > max[rows, cols]
34
- inf = rows + cols
35
-
36
- # Initialize first two rows of cost matrix.
37
- # The full initial state where cols=3, rows=2 (inf=5) would be:
38
- # [[5, 5, 5, 5, 5],
39
- # [5, 0, 1, 2, 3],
40
- # [5, 1, 0, 0, 0],
41
- # [5, 2, 0, 0, 0]]
42
- matrix = [Array.new(cols + 2, inf)]
43
- matrix << 0.upto(cols).to_a.unshift(inf)
34
+ # 'infinite' edit distance to pad cost matrix.
35
+ # Any value > max[rows, cols]
36
+ inf = cols + 1
44
37
 
45
38
  # element => last row seen
46
- item_history = Hash.new(0)
39
+ row_history = Hash.new(0)
47
40
 
48
- 1.upto(rows) do |row|
49
- # generate next row of cost matrix
50
- new_row = Array.new(cols + 2, 0)
51
- new_row[0] = inf
52
- new_row[1] = row
53
- matrix << new_row
41
+ # initialize alphabet-keyed cost matrix
42
+ matrix = {}
43
+ curr_row = 0.upto(cols).to_a
54
44
 
55
- last_match_col = 0
56
- seq1_item = seq1[row - 1]
45
+ rows.times do |row|
46
+ seq1_item = seq1[row]
47
+ match_col = 0
57
48
 
58
- 1.upto(cols) do |col|
59
- seq2_item = seq2[col - 1]
60
- last_match_row = item_history[seq2_item]
49
+ # rotate row arrays & generate next
50
+ matrix[seq1_item] = last_row = curr_row
51
+ curr_row = Array.new(cols + 1, inf)
52
+ curr_row[0] = row + 1
61
53
 
54
+ cols.times do |col|
55
+ seq2_item = seq2[col]
62
56
  sub_cost = seq1_item == seq2_item ? 0 : 1
63
57
 
64
- transposition = 1 + matrix[last_match_row][last_match_col]
65
- transposition += row - last_match_row - 1
66
- transposition += col - last_match_col - 1
67
-
68
- # TODO: do insertion/deletion need to be considered when
69
- # seq1_item == seq2_item ?
70
- #
71
- # substitution, deletion, insertion, transposition
58
+ # | Xs | Xd |
59
+ # | Xi | ? |
60
+ # substitution, deletion, insertion
72
61
  cost = [
73
- matrix[row][col] + sub_cost,
74
- matrix[row][col + 1] + 1,
75
- matrix[row + 1][col] + 1,
76
- transposition
62
+ last_row[col] + sub_cost,
63
+ last_row[col + 1] + 1,
64
+ curr_row[col] + 1
77
65
  ].min
78
66
 
79
- matrix[row + 1][col + 1] = cost
80
-
81
- last_match_col = col if sub_cost == 0
67
+ # transposition cost
68
+ # skip missed matrix lookup (inf cost)
69
+ if sub_cost > 0 && row > 0 && (m = matrix[seq2_item])
70
+ transpose = 1 + m[match_col] \
71
+ + (row - row_history[seq2_item] - 1) \
72
+ + (col - match_col - 1)
73
+ cost = transpose if transpose < cost
74
+ end
75
+
76
+ match_col = col if sub_cost == 0
77
+ curr_row[col + 1] = cost
82
78
  end
83
79
 
84
- item_history[seq1_item] = row
80
+ row_history[seq1_item] = row
85
81
  end
86
82
 
87
- matrix[rows + 1][cols + 1]
83
+ curr_row[cols]
88
84
  end
89
85
  end
90
86
  end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Edits
4
4
  # Current gem version
5
- VERSION = "0.3.0"
5
+ VERSION = "0.4.0"
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: edits
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tom Crouch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-05-21 00:00:00.000000000 Z
11
+ date: 2020-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips
@@ -174,8 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
174
174
  - !ruby/object:Gem::Version
175
175
  version: '0'
176
176
  requirements: []
177
- rubyforge_project:
178
- rubygems_version: 2.7.7
177
+ rubygems_version: 3.0.8
179
178
  signing_key:
180
179
  specification_version: 4
181
180
  summary: A collection of edit distance algorithms.