string_metric 0.1.1 → 0.1.2
This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/string_metric/levenshtein/experiment.rb +1 -5
- data/lib/string_metric/levenshtein/iterative_with_full_matrix.rb +21 -12
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb +15 -15
- data/lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb +14 -14
- data/lib/string_metric/levenshtein/recursive.rb +9 -5
- data/lib/string_metric/version.rb +1 -1
- data/spec/fixtures/levenshtein.csv +4 -1
- data/spec/support/levenshtein.rb +41 -4
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 08ed461c4edc7dfd5e32fb34e93b13b4442c5167
|
4
|
+
data.tar.gz: 69d6cd7273238442f184d6a63fec949ffcc6f6db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5d7e8664514c5f31075755c3b09b2a7c4a0cd963601aefd26f1b928426dea2051a12b8abd039dddb0d8986fe25c6ce860d8c4a867757a4bf5b5a1883c6ace1e
|
7
|
+
data.tar.gz: d9b631aad62da262d7a9afec9c7c1295e8b238cbfe4ef432354e0a2c42d9bf3b777758cf00abe2b14b702069eeb335df505ba9db37faa165335d748f4d748f2e
|
@@ -4,10 +4,6 @@ module StringMetric
|
|
4
4
|
module Levenshtein
|
5
5
|
class Experiment
|
6
6
|
def self.distance(from, to, options = {})
|
7
|
-
return 0 if from == to
|
8
|
-
return to.size if from.size.zero?
|
9
|
-
return from.size if to.size.zero?
|
10
|
-
|
11
7
|
m = from.length
|
12
8
|
n = to.length
|
13
9
|
|
@@ -25,4 +21,4 @@ module StringMetric
|
|
25
21
|
end
|
26
22
|
end
|
27
23
|
end
|
28
|
-
end
|
24
|
+
end
|
@@ -4,24 +4,32 @@ module StringMetric
|
|
4
4
|
module Levenshtein
|
5
5
|
class IterativeWithFullMatrix
|
6
6
|
def self.distance(from, to, options = {})
|
7
|
-
return 0 if from == to
|
8
|
-
return to.size if from.size.zero?
|
9
|
-
return from.size if to.size.zero?
|
10
|
-
|
11
7
|
max_distance = options[:max_distance]
|
12
8
|
insertion_cost = options.fetch(:insertion_cost, 1)
|
13
9
|
deletion_cost = options.fetch(:deletion_cost, 1)
|
14
10
|
substitution_cost = options.fetch(:substitution_cost, 1)
|
15
11
|
|
16
|
-
|
17
|
-
|
12
|
+
m = from.length
|
13
|
+
n = to.length
|
14
|
+
|
15
|
+
if max_distance && (m - n).abs >= max_distance
|
16
|
+
return max_distance
|
17
|
+
end
|
18
|
+
|
19
|
+
return 0 if from == to
|
20
|
+
return n if m.zero?
|
21
|
+
return m if n.zero?
|
22
|
+
|
23
|
+
d = (0..n).map do |i|
|
24
|
+
[0] * (m + 1)
|
18
25
|
end
|
19
26
|
|
20
|
-
(1..
|
21
|
-
(1..
|
27
|
+
(1..m).each { |j| d[0][j] = j }
|
28
|
+
(1..n).each { |i| d[i][0] = i }
|
22
29
|
|
23
|
-
|
24
|
-
|
30
|
+
to_column = 0
|
31
|
+
(1..m).each do |j|
|
32
|
+
(1..n).each do |i|
|
25
33
|
if from[j-1] == to[i-1]
|
26
34
|
d[i][j] = d[i -1][j-1]
|
27
35
|
else
|
@@ -30,12 +38,13 @@ module StringMetric
|
|
30
38
|
d[i-1][j-1] + substitution_cost # substitution
|
31
39
|
].min
|
32
40
|
end
|
41
|
+
to_column = i
|
33
42
|
end
|
34
43
|
|
35
|
-
break if max_distance and d[
|
44
|
+
break if max_distance and d[to_column].min > max_distance
|
36
45
|
end
|
37
46
|
|
38
|
-
x = d[
|
47
|
+
x = d[n][m]
|
39
48
|
if max_distance && x > max_distance
|
40
49
|
max_distance
|
41
50
|
else
|
@@ -4,10 +4,6 @@ module StringMetric
|
|
4
4
|
module Levenshtein
|
5
5
|
class IterativeWithTwoMatrixRows
|
6
6
|
def self.distance(from, to, options = {})
|
7
|
-
return 0 if from == to
|
8
|
-
return to.size if from.size.zero?
|
9
|
-
return from.size if to.size.zero?
|
10
|
-
|
11
7
|
max_distance = options[:max_distance]
|
12
8
|
insertion_cost = options.fetch(:insertion_cost, 1)
|
13
9
|
deletion_cost = options.fetch(:deletion_cost, 1)
|
@@ -16,13 +12,19 @@ module StringMetric
|
|
16
12
|
m = from.length
|
17
13
|
n = to.length
|
18
14
|
|
15
|
+
if max_distance && (n - m).abs >= max_distance
|
16
|
+
return max_distance
|
17
|
+
end
|
18
|
+
|
19
|
+
return 0 if from == to
|
20
|
+
return n if m.zero?
|
21
|
+
return m if n.zero?
|
22
|
+
|
19
23
|
v0 = (0..m).to_a
|
20
|
-
v1 = []
|
21
24
|
x = 0
|
22
25
|
|
23
26
|
n.times do |i|
|
24
|
-
|
25
|
-
|
27
|
+
current = x = i + 1
|
26
28
|
sub_cell = v0[0]
|
27
29
|
|
28
30
|
m.times do |j|
|
@@ -30,20 +32,18 @@ module StringMetric
|
|
30
32
|
|
31
33
|
ins_cell = v0[j+1]
|
32
34
|
|
33
|
-
x = [
|
35
|
+
x = [current + deletion_cost, # deletion
|
34
36
|
ins_cell + insertion_cost, # insertion
|
35
37
|
sub_cell + cost # substitution
|
36
38
|
].min
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
-
|
40
|
+
v0[j] = current
|
41
|
+
current = x
|
41
42
|
sub_cell = ins_cell
|
42
43
|
end
|
43
44
|
|
44
|
-
|
45
|
-
|
46
|
-
v0 = v1.dup
|
45
|
+
v0[m] = x
|
46
|
+
break if max_distance && v0.min > max_distance
|
47
47
|
end
|
48
48
|
|
49
49
|
if max_distance && x > max_distance
|
@@ -54,4 +54,4 @@ module StringMetric
|
|
54
54
|
end
|
55
55
|
end
|
56
56
|
end
|
57
|
-
end
|
57
|
+
end
|
@@ -4,10 +4,6 @@ module StringMetric
|
|
4
4
|
module Levenshtein
|
5
5
|
class IterativeWithTwoMatrixRowsOptimized
|
6
6
|
def self.distance(from, to, options = {})
|
7
|
-
return 0 if from == to
|
8
|
-
return to.size if from.size.zero?
|
9
|
-
return from.size if to.size.zero?
|
10
|
-
|
11
7
|
max_distance = options[:max_distance]
|
12
8
|
insertion_cost = options[:insertion_cost] || 1
|
13
9
|
deletion_cost = options[:deletion_cost] || 1
|
@@ -16,36 +12,40 @@ module StringMetric
|
|
16
12
|
m = from.length
|
17
13
|
n = to.length
|
18
14
|
|
15
|
+
if max_distance && (n - m).abs >= max_distance
|
16
|
+
return max_distance
|
17
|
+
end
|
18
|
+
|
19
|
+
return 0 if from == to
|
20
|
+
return n if m.zero?
|
21
|
+
return m if n.zero?
|
22
|
+
|
19
23
|
from = from.codepoints.to_a
|
20
24
|
to = to.codepoints.to_a
|
21
25
|
|
22
26
|
v0 = (0..m).to_a
|
23
|
-
v1 = []
|
24
27
|
x = 0
|
25
28
|
|
26
29
|
n.times do |i|
|
27
|
-
|
28
|
-
|
30
|
+
current = x = i + 1
|
29
31
|
sub_cell = v0[0]
|
30
32
|
|
31
33
|
m.times do |j|
|
32
34
|
cost = (from[j] == to[i]) ? 0 : substitution_cost
|
33
|
-
|
34
35
|
ins_cell = v0[j + 1]
|
35
36
|
|
36
|
-
x = [
|
37
|
+
x = [current + deletion_cost, # deletion
|
37
38
|
ins_cell + insertion_cost, # insertion
|
38
39
|
sub_cell + cost # substitution
|
39
40
|
].sort![0]
|
40
41
|
|
41
|
-
|
42
|
-
|
42
|
+
v0[j] = current
|
43
|
+
current = x
|
43
44
|
sub_cell = ins_cell
|
44
45
|
end
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
v0 = v1.dup
|
47
|
+
v0[m] = x
|
48
|
+
break if max_distance && v0.sort[0] > max_distance
|
49
49
|
end
|
50
50
|
|
51
51
|
if max_distance && x > max_distance
|
@@ -4,15 +4,19 @@ module StringMetric
|
|
4
4
|
module Levenshtein
|
5
5
|
class Recursive
|
6
6
|
def self.distance(from, to, options = {})
|
7
|
-
return 0 if from == to
|
8
|
-
return to.size if from.size.zero?
|
9
|
-
return from.size if to.size.zero?
|
10
|
-
|
11
7
|
max_distance = options[:max_distance]
|
12
8
|
insertion_cost = options.fetch(:insertion_cost, 1)
|
13
9
|
deletion_cost = options.fetch(:deletion_cost, 1)
|
14
10
|
substitution_cost = options.fetch(:substitution_cost, 1)
|
15
11
|
|
12
|
+
if max_distance && (from.size - to.size).abs >= max_distance
|
13
|
+
return max_distance
|
14
|
+
end
|
15
|
+
|
16
|
+
return 0 if from == to
|
17
|
+
return to.size if from.size.zero?
|
18
|
+
return from.size if to.size.zero?
|
19
|
+
|
16
20
|
if from.chars.to_a.last == to.chars.to_a.last
|
17
21
|
cost = 0
|
18
22
|
else
|
@@ -35,4 +39,4 @@ module StringMetric
|
|
35
39
|
end
|
36
40
|
end
|
37
41
|
end
|
38
|
-
end
|
42
|
+
end
|
data/spec/support/levenshtein.rb
CHANGED
@@ -24,11 +24,48 @@ shared_examples "Levenshtein Distance" do |options|
|
|
24
24
|
|
25
25
|
context "when max_distance is passed as option" do
|
26
26
|
context "and normal distance is greater than max_distance" do
|
27
|
-
let(:max_distance) { 2 }
|
28
|
-
|
29
27
|
it "is trimmed to max_distance" do
|
30
|
-
expect(described_class.distance("
|
31
|
-
|
28
|
+
expect(described_class.distance("gumbo", "gambol", max_distance: 1)).to eq 1
|
29
|
+
expect(described_class.distance("test", "tasf", max_distance: 1)).to eq 1
|
30
|
+
expect(described_class.distance("kitten", "sitting", max_distance: 2)).to eq 2
|
31
|
+
expect(described_class.distance("kitten", "kittenss", max_distance: 1)).to eq 1
|
32
|
+
expect(described_class.distance("kittenss", "kitten", max_distance: 1)).to eq 1
|
33
|
+
expect(described_class.distance("sitting", "kitten", max_distance: 2)).to eq 2
|
34
|
+
expect(described_class.distance("gambol", "gumbo", max_distance: 1)).to eq 1
|
35
|
+
expect(described_class.distance("kitten", "", max_distance: 2)).to eq 2
|
36
|
+
expect(described_class.distance("", "kitten", max_distance: 3)).to eq 3
|
37
|
+
end
|
38
|
+
end
|
39
|
+
context "and normal distance is less than max_distance" do
|
40
|
+
it "is calculated distance" do
|
41
|
+
expect(described_class.distance("", "t", max_distance: 2)).to eq 1
|
42
|
+
expect(described_class.distance("t", "", max_distance: 3)).to eq 1
|
43
|
+
expect(described_class.distance("test", "test", max_distance: 1)).to eq 0
|
44
|
+
expect(described_class.distance("test", "tent", max_distance: 2)).to eq 1
|
45
|
+
expect(described_class.distance("gumbo", "gambol", max_distance: 3)).to eq 2
|
46
|
+
expect(described_class.distance("kitten", "sitting", max_distance: 4)).to eq 3
|
47
|
+
expect(described_class.distance("kitten", "kittenss", max_distance: 4)).to eq 2
|
48
|
+
expect(described_class.distance("kittenss", "kitten", max_distance: 4)).to eq 2
|
49
|
+
expect(described_class.distance("sitting", "kitten", max_distance: 4)).to eq 3
|
50
|
+
expect(described_class.distance("gambol", "gumbo", max_distance: 3)).to eq 2
|
51
|
+
expect(described_class.distance("", "cat", max_distance: 4)).to eq 3
|
52
|
+
expect(described_class.distance("cat", "", max_distance: 5)).to eq 3
|
53
|
+
expect(described_class.distance("", "", max_distance: 2)).to eq 0
|
54
|
+
end
|
55
|
+
end
|
56
|
+
context "and normal distance is same as max_distance" do
|
57
|
+
it "is calculated distance" do
|
58
|
+
expect(described_class.distance("test", "test", max_distance: 0)).to eq 0
|
59
|
+
expect(described_class.distance("test", "tent", max_distance: 1)).to eq 1
|
60
|
+
expect(described_class.distance("gumbo", "gambol", max_distance: 2)).to eq 2
|
61
|
+
expect(described_class.distance("kitten", "sitting", max_distance: 3)).to eq 3
|
62
|
+
expect(described_class.distance("kitten", "kittenss", max_distance: 2)).to eq 2
|
63
|
+
expect(described_class.distance("kittenss", "kitten", max_distance: 2)).to eq 2
|
64
|
+
expect(described_class.distance("sitting", "kitten", max_distance: 3)).to eq 3
|
65
|
+
expect(described_class.distance("gambol", "gumbo", max_distance: 2)).to eq 2
|
66
|
+
expect(described_class.distance("", "cat", max_distance: 3)).to eq 3
|
67
|
+
expect(described_class.distance("cat", "", max_distance: 3)).to eq 3
|
68
|
+
expect(described_class.distance("", "", max_distance: 0)).to eq 0
|
32
69
|
end
|
33
70
|
end
|
34
71
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_metric
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Giorgos Tsiftsis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-04-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -133,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
133
|
version: '0'
|
134
134
|
requirements: []
|
135
135
|
rubyforge_project:
|
136
|
-
rubygems_version: 2.
|
136
|
+
rubygems_version: 2.4.6
|
137
137
|
signing_key:
|
138
138
|
specification_version: 4
|
139
139
|
summary: A simple library with String Metric algorithms
|
@@ -147,3 +147,4 @@ test_files:
|
|
147
147
|
- spec/lib/levenshtein_spec.rb
|
148
148
|
- spec/spec_helper.rb
|
149
149
|
- spec/support/levenshtein.rb
|
150
|
+
has_rdoc:
|