phonetics 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/phonetics/levenshtein.rb +119 -0
- data/phonetics.gemspec +4 -3
- metadata +16 -2
- data/lib/phonetic_levenshtein.rb +0 -103
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9f2be3c1987040125914d8a028218b495e6cd8c4c004820b4f9ffbcc9995eb8
|
4
|
+
data.tar.gz: 9b787ce78e582bfff9f515ab42b1f2d43de2b4224cb5d1a31831ddc8d1b5a672
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cd0298205c7e28785cdee86b95233ff9547317f4aa1bdee33f9fc3ca93fae636a6f1caf2b7a25061c02cf553aca9142d059f2eae19cb20ffb22b34ac3c2679e
|
7
|
+
data.tar.gz: e25d349bb3b6430f0b5001902ec9613974d60d880ccf85af4abb3ba191c73e7fb1edb3fd8c1b8af9cec39173d707a51c71d00a375de88490d7fd01092e402795
|
data/Gemfile.lock
CHANGED
@@ -6,7 +6,16 @@ PATH
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
+
byebug (11.0.1)
|
10
|
+
coderay (1.1.2)
|
9
11
|
diff-lcs (1.3)
|
12
|
+
method_source (0.9.2)
|
13
|
+
pry (0.12.2)
|
14
|
+
coderay (~> 1.1.0)
|
15
|
+
method_source (~> 0.9.0)
|
16
|
+
pry-byebug (3.7.0)
|
17
|
+
byebug (~> 11.0)
|
18
|
+
pry (~> 0.10)
|
10
19
|
rake (12.3.2)
|
11
20
|
rspec (3.8.0)
|
12
21
|
rspec-core (~> 3.8.0)
|
@@ -28,6 +37,7 @@ PLATFORMS
|
|
28
37
|
DEPENDENCIES
|
29
38
|
bundler (~> 1.16)
|
30
39
|
phonetics!
|
40
|
+
pry-byebug
|
31
41
|
rake
|
32
42
|
rspec (~> 3.0)
|
33
43
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.1.0
|
data/lib/phonetics/levenshtein.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require_relative '../phonetics'
|
2
|
+
|
3
|
+
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
4
|
+
# count used instead of a binary edit distance calculation
|
5
|
+
#
|
6
|
+
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
7
|
+
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
8
|
+
# The implementation is modified based on "Using Phonologically Weighted
|
9
|
+
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
10
|
+
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
11
|
+
# Aumont, 2016
|
12
|
+
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
13
|
+
module Phonetics
  # Phonetically weighted Levenshtein distance between two IPA strings.
  #
  # A dynamic-programming matrix is filled in which an insertion or a deletion
  # costs 1.0 and a substitution costs whatever Phonetics.distance_map reports
  # for the two phonemes involved (1.0 when the pair is not in the map).
  #
  # Matrix layout: rows follow the second string, columns follow the first.
  # Cell (i, j) describes the prefixes str2[0, i] and str1[0, j], so the
  # characters that belong to cell (i, j) are str2[i - 1] and str1[j - 1].
  #
  # NOTE(review): Phonetics.distance_map is defined outside this file; it is
  # used here as a Hash of Hashes keyed by single-character strings — confirm.
  class Levenshtein
    # Builds and fills the edit-distance matrix for the two strings.
    #
    # @param ipa_str1 [String] first transcription (matrix columns)
    # @param ipa_str2 [String] second transcription (matrix rows)
    def initialize(ipa_str1, ipa_str2)
      @str1 = ipa_str1
      @str2 = ipa_str2
      @len1 = ipa_str1.size
      @len2 = ipa_str2.size
      prepare_matrix
      set_edit_distances(ipa_str1, ipa_str2)
    end

    # The weighted distance between the two strings.
    #
    # Reads the bottom-right matrix cell directly. That is the same value the
    # last element of #walk carries, but it also works when both strings are
    # empty (where #walk yields an empty list).
    #
    # @return [Numeric] 0 for identical or two empty strings
    def distance
      @matrix[@len2][@len1]
    end

    # Convenience wrapper around new(str1, str2).distance.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @return [Numeric]
    def self.distance(str1, str2)
      new(str1, str2).distance
    end

    private

    # Traces a path from the bottom-right cell back to the origin.
    #
    # @return [Array<Hash>] one { distance:, type: } hash per visited cell,
    #   ordered from the start of the strings to the end
    def walk
      path = []
      cell = [@len2, @len1]
      until cell == [0, 0]
        cell, step = char_data(cell)
        path.unshift(step)
      end
      path
    end

    # Fills every interior cell of the matrix, row by row.
    def set_edit_distances(str1, str2)
      (1..@len2).each do |i|
        (1..@len1).each do |j|
          if str2[i - 1] == str1[j - 1]
            no_change(i, j)
          else
            @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
          end
        end
      end
    end

    # Describes one cell of the backtrace.
    #
    # @param cell [Array(Integer, Integer)] [row, column]
    # @return [Array] [previous_cell, { distance:, type: }]
    def char_data(cell)
      row, col = cell
      (value, _rank), type, previous = find_previous(cell)
      here = @matrix[row][col]
      [previous, { distance: here, type: value == here ? :same : type }]
    end

    # Lists the three possible predecessors of +cell+ and picks one.
    # Each candidate is [[value, tie_break_rank], edit_type, predecessor_cell].
    def find_previous(cell)
      row, col = cell
      select_cell(
        [
          [[ins(row, col), 1], :ins, [row, col - 1]],
          [[del(row, col), 2], :del, [row - 1, col]],
          [[subst(row, col), 0], :subst, [row - 1, col - 1]],
        ]
      )
    end

    # Drops candidates that fall outside the matrix and returns the one with
    # the smallest [value, rank] pair.
    def select_cell(candidates)
      candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.min_by(&:first)
    end

    # TODO: Score the edit distance lower if sonorant sounds are found in sequence.

    # Value of the cell directly above (i, j).
    def del(i, j)
      @matrix[i - 1][j]
    end

    # Value of the cell directly to the left of (i, j).
    def ins(i, j)
      @matrix[i][j - 1]
    end

    # This is where we implement the modifications to Damerau-Levenshtein according to
    # https://hal.archives-ouvertes.fr/hal-01474904/document
    #
    # Cost of reaching (i, j) by substituting str1[j - 1] with str2[i - 1].
    # The "- 1" matters: the matrix is offset by one against the strings, so
    # indexing with j / i would look up the *next* phoneme (or nil at the end).
    def subst(i, j)
      map = Phonetics.distance_map[@str1[j - 1]]
      score = map[@str2[i - 1]] if map
      score ||= 1.0
      @matrix[i - 1][j - 1] + score
    end

    # Equal characters: carry the diagonal value over unchanged.
    def no_change(i, j)
      @matrix[i][j] = @matrix[i - 1][j - 1]
    end

    # Seeds the matrix: row 0 is 0..len1, column 0 is 0..len2, and every
    # interior cell starts as nil until set_edit_distances fills it.
    def prepare_matrix
      @matrix = [(0..@len1).to_a]
      (1..@len2).each { |i| @matrix << [i, *Array.new(@len1)] }
    end
  end
end
|
data/phonetics.gemspec
CHANGED
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
|
|
16
16
|
end
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
|
-
spec.add_development_dependency "bundler", "~> 1.16"
|
20
|
-
spec.add_development_dependency "rake"
|
21
|
-
spec.add_development_dependency "rspec", "~> 3.0"
|
19
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
20
|
+
spec.add_development_dependency 'rake'
|
21
|
+
spec.add_development_dependency "pry-byebug"
|
22
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
22
23
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry-byebug
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -69,8 +83,8 @@ files:
|
|
69
83
|
- README.md
|
70
84
|
- Rakefile
|
71
85
|
- VERSION
|
72
|
-
- lib/phonetic_levenshtein.rb
|
73
86
|
- lib/phonetics.rb
|
87
|
+
- lib/phonetics/levenshtein.rb
|
74
88
|
- lib/phonetics/version.rb
|
75
89
|
- phonetics.gemspec
|
76
90
|
homepage: https://github.com/JackDanger/phonetics
|
data/lib/phonetic_levenshtein.rb
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
require_relative './phonetics'
|
2
|
-
|
3
|
-
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
4
|
-
# count used instead of a binary edit distance calculation
|
5
|
-
#
|
6
|
-
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
7
|
-
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
8
|
-
# The implementation is modified based on "Using Phonologically Weighted
|
9
|
-
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
10
|
-
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
11
|
-
# Aumont, 2016
|
12
|
-
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
13
|
-
# Phonetically weighted Levenshtein distance between two IPA strings.
#
# A dynamic-programming matrix is filled in which an insertion or a deletion
# costs 1.0 and a substitution costs whatever Phonetics.distance_map reports
# for the two phonemes involved (1.0 when the pair is not in the map).
#
# Matrix layout: rows follow the second string, columns follow the first.
# Cell (i, j) describes the prefixes str2[0, i] and str1[0, j], so the
# characters that belong to cell (i, j) are str2[i - 1] and str1[j - 1].
#
# NOTE(review): Phonetics.distance_map is defined outside this file; it is
# used here as a Hash of Hashes keyed by single-character strings — confirm.
class PhoneticLevenshtein
  # Builds and fills the edit-distance matrix for the two strings.
  #
  # @param ipa_str1 [String] first transcription (matrix columns)
  # @param ipa_str2 [String] second transcription (matrix rows)
  def initialize(ipa_str1, ipa_str2)
    @str1 = ipa_str1
    @str2 = ipa_str2
    @len1 = ipa_str1.size
    @len2 = ipa_str2.size
    prepare_matrix
    set_edit_distances(ipa_str1, ipa_str2)
  end

  # The weighted distance between the two strings.
  #
  # Reads the bottom-right matrix cell directly. That is the same value the
  # last element of #walk carries, but it also works when both strings are
  # empty (where #walk yields an empty list).
  #
  # @return [Numeric] 0 for identical or two empty strings
  def distance
    @matrix[@len2][@len1]
  end

  # Convenience wrapper around new(str1, str2).distance.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [Numeric]
  def self.distance(str1, str2)
    new(str1, str2).distance
  end

  private

  # Traces a path from the bottom-right cell back to the origin.
  #
  # @return [Array<Hash>] one { distance:, type: } hash per visited cell,
  #   ordered from the start of the strings to the end
  def walk
    path = []
    cell = [@len2, @len1]
    until cell == [0, 0]
      cell, step = char_data(cell)
      path.unshift(step)
    end
    path
  end

  # Fills every interior cell of the matrix, row by row.
  def set_edit_distances(str1, str2)
    (1..@len2).each do |i|
      (1..@len1).each do |j|
        if str2[i - 1] == str1[j - 1]
          no_change(i, j)
        else
          @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
        end
      end
    end
  end

  # Describes one cell of the backtrace.
  #
  # @param cell [Array(Integer, Integer)] [row, column]
  # @return [Array] [previous_cell, { distance:, type: }]
  def char_data(cell)
    row, col = cell
    (value, _rank), type, previous = find_previous(cell)
    here = @matrix[row][col]
    [previous, { distance: here, type: value == here ? :same : type }]
  end

  # Lists the three possible predecessors of +cell+ and picks one.
  # Each candidate is [[value, tie_break_rank], edit_type, predecessor_cell].
  def find_previous(cell)
    row, col = cell
    select_cell(
      [
        [[ins(row, col), 1], :ins, [row, col - 1]],
        [[del(row, col), 2], :del, [row - 1, col]],
        [[subst(row, col), 0], :subst, [row - 1, col - 1]],
      ]
    )
  end

  # Drops candidates that fall outside the matrix and returns the one with
  # the smallest [value, rank] pair.
  def select_cell(candidates)
    candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.min_by(&:first)
  end

  # TODO: Score the edit distance lower if sonorant sounds are found in sequence.

  # Value of the cell directly above (i, j).
  def del(i, j)
    @matrix[i - 1][j]
  end

  # Value of the cell directly to the left of (i, j).
  def ins(i, j)
    @matrix[i][j - 1]
  end

  # This is where we implement the modifications to Damerau-Levenshtein according to
  # https://hal.archives-ouvertes.fr/hal-01474904/document
  #
  # Cost of reaching (i, j) by substituting str1[j - 1] with str2[i - 1].
  # The "- 1" matters: the matrix is offset by one against the strings, so
  # indexing with j / i would look up the *next* phoneme (or nil at the end).
  def subst(i, j)
    map = Phonetics.distance_map[@str1[j - 1]]
    score = map[@str2[i - 1]] if map
    score ||= 1.0
    @matrix[i - 1][j - 1] + score
  end

  # Equal characters: carry the diagonal value over unchanged.
  def no_change(i, j)
    @matrix[i][j] = @matrix[i - 1][j - 1]
  end

  # Seeds the matrix: row 0 is 0..len1, column 0 is 0..len2, and every
  # interior cell starts as nil until set_edit_distances fills it.
  def prepare_matrix
    @matrix = [(0..@len1).to_a]
    (1..@len2).each { |i| @matrix << [i, *Array.new(@len1)] }
  end
end
|