phonetics 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/phonetics/levenshtein.rb +119 -0
- data/phonetics.gemspec +4 -3
- metadata +16 -2
- data/lib/phonetic_levenshtein.rb +0 -103
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9f2be3c1987040125914d8a028218b495e6cd8c4c004820b4f9ffbcc9995eb8
|
4
|
+
data.tar.gz: 9b787ce78e582bfff9f515ab42b1f2d43de2b4224cb5d1a31831ddc8d1b5a672
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cd0298205c7e28785cdee86b95233ff9547317f4aa1bdee33f9fc3ca93fae636a6f1caf2b7a25061c02cf553aca9142d059f2eae19cb20ffb22b34ac3c2679e
|
7
|
+
data.tar.gz: e25d349bb3b6430f0b5001902ec9613974d60d880ccf85af4abb3ba191c73e7fb1edb3fd8c1b8af9cec39173d707a51c71d00a375de88490d7fd01092e402795
|
data/Gemfile.lock
CHANGED
@@ -6,7 +6,16 @@ PATH
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
+
byebug (11.0.1)
|
10
|
+
coderay (1.1.2)
|
9
11
|
diff-lcs (1.3)
|
12
|
+
method_source (0.9.2)
|
13
|
+
pry (0.12.2)
|
14
|
+
coderay (~> 1.1.0)
|
15
|
+
method_source (~> 0.9.0)
|
16
|
+
pry-byebug (3.7.0)
|
17
|
+
byebug (~> 11.0)
|
18
|
+
pry (~> 0.10)
|
10
19
|
rake (12.3.2)
|
11
20
|
rspec (3.8.0)
|
12
21
|
rspec-core (~> 3.8.0)
|
@@ -28,6 +37,7 @@ PLATFORMS
|
|
28
37
|
DEPENDENCIES
|
29
38
|
bundler (~> 1.16)
|
30
39
|
phonetics!
|
40
|
+
pry-byebug
|
31
41
|
rake
|
32
42
|
rspec (~> 3.0)
|
33
43
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.1.0
|
data/lib/phonetics/levenshtein.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require_relative '../phonetics'
|
2
|
+
|
3
|
+
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
4
|
+
# count used instead of a binary edit distance calculation
|
5
|
+
#
|
6
|
+
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
7
|
+
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
8
|
+
# The implementation is modified based on "Using Phonologically Weighted
|
9
|
+
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
10
|
+
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
11
|
+
# Aumont, 2016
|
12
|
+
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
13
|
+
module Phonetics
|
14
|
+
class Levenshtein
|
15
|
+
def initialize(ipa_str1, ipa_str2)
|
16
|
+
@str1 = ipa_str1
|
17
|
+
@str2 = ipa_str2
|
18
|
+
@len1 = ipa_str1.size
|
19
|
+
@len2 = ipa_str2.size
|
20
|
+
prepare_matrix
|
21
|
+
set_edit_distances(ipa_str1, ipa_str2)
|
22
|
+
end
|
23
|
+
|
24
|
+
def distance
|
25
|
+
walk.last[:distance]
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.distance(str1, str2)
|
29
|
+
new(str1, str2).distance
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def walk
|
35
|
+
res = []
|
36
|
+
cell = [@len2, @len1]
|
37
|
+
while cell != [0, 0]
|
38
|
+
cell, char = char_data(cell)
|
39
|
+
res.unshift char
|
40
|
+
end
|
41
|
+
res
|
42
|
+
end
|
43
|
+
|
44
|
+
def set_edit_distances(str1, str2)
|
45
|
+
(1..@len2).each do |i|
|
46
|
+
(1..@len1).each do |j|
|
47
|
+
no_change(i, j) && next if str2[i - 1] == str1[j - 1]
|
48
|
+
@matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def char_data(cell)
|
54
|
+
char = { distance: @matrix[cell[0]][cell[1]] }
|
55
|
+
val = find_previous(cell)
|
56
|
+
previous_value = val[0][0]
|
57
|
+
char[:type] = previous_value == char[:distance] ? :same : val[1]
|
58
|
+
cell = val.pop
|
59
|
+
[cell, char]
|
60
|
+
end
|
61
|
+
|
62
|
+
def find_previous(cell)
|
63
|
+
candidates = [
|
64
|
+
[
|
65
|
+
[ins(*cell), 1],
|
66
|
+
:ins,
|
67
|
+
[cell[0], cell[1] - 1],
|
68
|
+
],
|
69
|
+
[
|
70
|
+
[del(*cell), 2],
|
71
|
+
:del,
|
72
|
+
[cell[0] - 1, cell[1]],
|
73
|
+
],
|
74
|
+
[
|
75
|
+
[subst(*cell), 0],
|
76
|
+
:subst,
|
77
|
+
[cell[0] - 1, cell[1] - 1],
|
78
|
+
],
|
79
|
+
]
|
80
|
+
select_cell(candidates)
|
81
|
+
end
|
82
|
+
|
83
|
+
def select_cell(candidates)
|
84
|
+
candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
|
85
|
+
sort_by(&:first).first
|
86
|
+
end
|
87
|
+
|
88
|
+
# TODO: Score the edit distance lower if sonorant sounds are found in sequence.
|
89
|
+
def del(i, j)
|
90
|
+
@matrix[i - 1][j]
|
91
|
+
end
|
92
|
+
|
93
|
+
def ins(i, j)
|
94
|
+
@matrix[i][j - 1]
|
95
|
+
end
|
96
|
+
|
97
|
+
# This is where we implement the modifications to Damerau-Levenshtein according to
|
98
|
+
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
99
|
+
def subst(i, j)
|
100
|
+
map = Phonetics.distance_map[@str1[j]]
|
101
|
+
score = map[@str2[i]] if map
|
102
|
+
score ||= 1.0
|
103
|
+
@matrix[i - 1][j - 1] + score
|
104
|
+
end
|
105
|
+
|
106
|
+
def no_change(i, j)
|
107
|
+
@matrix[i][j] = @matrix[i - 1][j - 1]
|
108
|
+
end
|
109
|
+
|
110
|
+
def prepare_matrix
|
111
|
+
@matrix = []
|
112
|
+
@matrix << (0..@len1).to_a
|
113
|
+
@len2.times do |i|
|
114
|
+
ary = [i + 1] + (1..@len1).map { nil }
|
115
|
+
@matrix << ary
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
data/phonetics.gemspec
CHANGED
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
|
|
16
16
|
end
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
|
-
spec.add_development_dependency
|
20
|
-
spec.add_development_dependency
|
21
|
-
spec.add_development_dependency "
|
19
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
20
|
+
spec.add_development_dependency 'rake'
|
21
|
+
spec.add_development_dependency "pry-byebug"
|
22
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
22
23
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry-byebug
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -69,8 +83,8 @@ files:
|
|
69
83
|
- README.md
|
70
84
|
- Rakefile
|
71
85
|
- VERSION
|
72
|
-
- lib/phonetic_levenshtein.rb
|
73
86
|
- lib/phonetics.rb
|
87
|
+
- lib/phonetics/levenshtein.rb
|
74
88
|
- lib/phonetics/version.rb
|
75
89
|
- phonetics.gemspec
|
76
90
|
homepage: https://github.com/JackDanger/phonetics
|
data/lib/phonetic_levenshtein.rb
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
require_relative './phonetics'
|
2
|
-
|
3
|
-
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
4
|
-
# count used instead of a binary edit distance calculation
|
5
|
-
#
|
6
|
-
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
7
|
-
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
8
|
-
# The implementation is modified based on "Using Phonologically Weighted
|
9
|
-
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
10
|
-
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
11
|
-
# Aumont, 2016
|
12
|
-
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
13
|
-
class PhoneticLevenshtein
|
14
|
-
def initialize(ipa_str1, ipa_str2)
|
15
|
-
@str1 = ipa_str1
|
16
|
-
@str2 = ipa_str2
|
17
|
-
@len1 = ipa_str1.size
|
18
|
-
@len2 = ipa_str2.size
|
19
|
-
prepare_matrix
|
20
|
-
set_edit_distances(ipa_str1, ipa_str2)
|
21
|
-
end
|
22
|
-
|
23
|
-
def distance
|
24
|
-
walk.last[:distance]
|
25
|
-
end
|
26
|
-
|
27
|
-
def self.distance(str1, str2)
|
28
|
-
new(str1, str2).distance
|
29
|
-
end
|
30
|
-
|
31
|
-
private
|
32
|
-
|
33
|
-
def walk
|
34
|
-
res = []
|
35
|
-
cell = [@len2, @len1]
|
36
|
-
while cell != [0, 0]
|
37
|
-
cell, char = char_data(cell)
|
38
|
-
res.unshift char
|
39
|
-
end
|
40
|
-
res
|
41
|
-
end
|
42
|
-
|
43
|
-
def set_edit_distances(str1, str2)
|
44
|
-
(1..@len2).each do |i|
|
45
|
-
(1..@len1).each do |j|
|
46
|
-
no_change(i, j) && next if str2[i - 1] == str1[j - 1]
|
47
|
-
@matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def char_data(cell)
|
53
|
-
char = { distance: @matrix[cell[0]][cell[1]] }
|
54
|
-
val = find_previous(cell)
|
55
|
-
previous_value = val[0][0]
|
56
|
-
char[:type] = previous_value == char[:distance] ? :same : val[1]
|
57
|
-
cell = val.pop
|
58
|
-
[cell, char]
|
59
|
-
end
|
60
|
-
|
61
|
-
def find_previous(cell)
|
62
|
-
candidates = [[[ins(*cell), 1], :ins, [cell[0], cell[1] - 1]],
|
63
|
-
[[del(*cell), 2], :del, [cell[0] - 1, cell[1]]],
|
64
|
-
[[subst(*cell), 0], :subst, [cell[0] - 1, cell[1] - 1]]]
|
65
|
-
select_cell(candidates)
|
66
|
-
end
|
67
|
-
|
68
|
-
def select_cell(candidates)
|
69
|
-
candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
|
70
|
-
sort_by(&:first).first
|
71
|
-
end
|
72
|
-
|
73
|
-
# TODO: Score the edit distance lower if sonorant sounds are found in sequence.
|
74
|
-
def del(i, j)
|
75
|
-
@matrix[i - 1][j]
|
76
|
-
end
|
77
|
-
|
78
|
-
def ins(i, j)
|
79
|
-
@matrix[i][j - 1]
|
80
|
-
end
|
81
|
-
|
82
|
-
# This is where we implement the modifications to Damerau-Levenshtein according to
|
83
|
-
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
84
|
-
def subst(i, j)
|
85
|
-
map = Phonetics.distance_map[@str1[j]]
|
86
|
-
score = map[@str2[i]] if map
|
87
|
-
score ||= 1.0
|
88
|
-
@matrix[i - 1][j - 1] + score
|
89
|
-
end
|
90
|
-
|
91
|
-
def no_change(i, j)
|
92
|
-
@matrix[i][j] = @matrix[i - 1][j - 1]
|
93
|
-
end
|
94
|
-
|
95
|
-
def prepare_matrix
|
96
|
-
@matrix = []
|
97
|
-
@matrix << (0..@len1).to_a
|
98
|
-
@len2.times do |i|
|
99
|
-
ary = [i + 1] + (1..@len1).map { nil }
|
100
|
-
@matrix << ary
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|