phonetics 2.0.1 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +0 -2
- data/.rubocop.yml +5 -5
- data/README.md +2 -1
- data/VERSION +1 -1
- data/bin/gempush-if-changed +9 -2
- data/lib/common_ipa_transcriptions.json +905980 -0
- data/lib/phonetics.rb +2 -237
- data/lib/phonetics/distances.rb +239 -0
- data/lib/phonetics/levenshtein.rb +0 -6
- data/lib/phonetics/transcriptions.rb +151 -0
- metadata +5 -2
data/lib/phonetics.rb
CHANGED
@@ -1,239 +1,4 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'delegate'
|
4
|
-
|
5
|
-
module Phonetics
|
6
|
-
extend self
|
7
|
-
|
8
|
-
# This subclass of the stdlib's String allows us to iterate over each phoneme
|
9
|
-
# in a string without monkeypatching
|
10
|
-
#
|
11
|
-
# Usage:
|
12
|
-
# Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
|
13
|
-
# => ["w", "ə", "t", "ɛ", "v", "ɝ"]
|
14
|
-
class String < SimpleDelegator
|
15
|
-
# Group all phonemes by how many characters they have. Use this to walk
|
16
|
-
# through a string finding phonemes (looking for longest ones first)
|
17
|
-
def self.phonemes_by_length
|
18
|
-
@phonemes_by_length ||= Phonetics.phonemes.each_with_object(
|
19
|
-
# This relies on the implicit stable key ordering of Hash objects in Ruby
|
20
|
-
# 2+ to keep the keys in descending order.
|
21
|
-
4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new
|
22
|
-
) do |str, acc|
|
23
|
-
acc[str.chars.size] << str
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def each_phoneme
|
28
|
-
idx = 0
|
29
|
-
Enumerator.new do |y|
|
30
|
-
while idx < chars.length
|
31
|
-
found = false
|
32
|
-
self.class.phonemes_by_length.each do |size, phonemes|
|
33
|
-
next unless idx + size <= chars.length
|
34
|
-
|
35
|
-
candidate = chars[idx..idx + size - 1].join
|
36
|
-
next unless phonemes.include?(candidate)
|
37
|
-
|
38
|
-
y.yield candidate
|
39
|
-
idx += size
|
40
|
-
found = true
|
41
|
-
break
|
42
|
-
end
|
43
|
-
idx += 1 unless found
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
module Vowels
|
50
|
-
extend self
|
51
|
-
|
52
|
-
FormantFrequencies = {
|
53
|
-
# https://en.wikipedia.org/wiki/Formant#Phonetics
|
54
|
-
'i' => { F1: 240, F2: 2400, rounded: false },
|
55
|
-
'y' => { F1: 235, F2: 2100, rounded: false },
|
56
|
-
'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing From other vowels
|
57
|
-
'e' => { F1: 390, F2: 2300, rounded: false },
|
58
|
-
'ø' => { F1: 370, F2: 1900, rounded: true },
|
59
|
-
'ɛ' => { F1: 610, F2: 1900, rounded: false },
|
60
|
-
'œ' => { F1: 585, F2: 1710, rounded: true },
|
61
|
-
'a' => { F1: 850, F2: 1610, rounded: false },
|
62
|
-
'ɶ' => { F1: 820, F2: 1530, rounded: true },
|
63
|
-
'ɑ' => { F1: 750, F2: 940, rounded: false },
|
64
|
-
'ɒ' => { F1: 700, F2: 760, rounded: true },
|
65
|
-
|
66
|
-
'ʌ' => { F1: 600, F2: 1170, rounded: false },
|
67
|
-
# copying 'ʌ' for other mid-vowel formants
|
68
|
-
'ə' => { F1: 600, F2: 1170, rounded: false },
|
69
|
-
'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },
|
70
|
-
|
71
|
-
'ɔ' => { F1: 500, F2: 700, rounded: true },
|
72
|
-
'ɤ' => { F1: 460, F2: 1310, rounded: false },
|
73
|
-
'o' => { F1: 360, F2: 640, rounded: true },
|
74
|
-
'ɯ' => { F1: 300, F2: 1390, rounded: false },
|
75
|
-
'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing From other vowels
|
76
|
-
'u' => { F1: 350, F2: 650, rounded: true }, # Guessing From other vowels
|
77
|
-
'ʊ' => { F1: 350, F2: 650, rounded: true },
|
78
|
-
# Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
|
79
|
-
}.freeze
|
80
|
-
|
81
|
-
def phonemes
|
82
|
-
@phonemes ||= FormantFrequencies.keys
|
83
|
-
end
|
84
|
-
|
85
|
-
# Given two vowels, calculate the (pythagorean) distance between them using
|
86
|
-
# their F1 and F2 frequencies as x/y coordinates.
|
87
|
-
# The return value is scaled to a value between 0 and 1
|
88
|
-
# TODO: account for rhoticity (F3)
|
89
|
-
def distance(phoneme1, phoneme2)
|
90
|
-
formants1 = FormantFrequencies.fetch(phoneme1)
|
91
|
-
formants2 = FormantFrequencies.fetch(phoneme2)
|
92
|
-
|
93
|
-
@minmax_f1 ||= FormantFrequencies.values.minmax { |a, b| a[:F1] <=> b[:F1] }.map { |h| h[:F1] }
|
94
|
-
@minmax_f2 ||= FormantFrequencies.values.minmax { |a, b| a[:F2] <=> b[:F2] }.map { |h| h[:F2] }
|
95
|
-
|
96
|
-
# Get an x and y value for each input phoneme scaled between 0.0 and 1.0
|
97
|
-
# We'll use the scaled f1 as the 'x' and the scaled f2 as the 'y'
|
98
|
-
scaled_phoneme1_f1 = (formants1[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
|
99
|
-
scaled_phoneme1_f2 = (formants1[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
|
100
|
-
scaled_phoneme2_f1 = (formants2[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
|
101
|
-
scaled_phoneme2_f2 = (formants2[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
|
102
|
-
|
103
|
-
f1_distance = (scaled_phoneme1_f1 - scaled_phoneme2_f1).abs
|
104
|
-
f2_distance = (scaled_phoneme1_f2 - scaled_phoneme2_f2).abs
|
105
|
-
|
106
|
-
# When we have four values we can use the pythagorean theorem on them
|
107
|
-
# (order doesn't matter)
|
108
|
-
Math.sqrt((f1_distance**2) + (f2_distance**2))
|
109
|
-
end
|
110
|
-
end
|
111
|
-
|
112
|
-
module Consonants
|
113
|
-
extend self
|
114
|
-
|
115
|
-
# This chart (columns 2 through the end, anyway) is a direct port of
|
116
|
-
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
117
|
-
# We store the consonant table in this format to make updating it easier.
|
118
|
-
#
|
119
|
-
# rubocop:disable Layout/TrailingWhitespace
|
120
|
-
ChartData = %( | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
|
121
|
-
Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
|
122
|
-
Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
|
123
|
-
Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
|
124
|
-
Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
|
125
|
-
Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
|
126
|
-
Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
|
127
|
-
Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
|
128
|
-
Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
|
129
|
-
Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
|
130
|
-
Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
|
131
|
-
)
|
132
|
-
# rubocop:enable Layout/TrailingWhitespace
|
133
|
-
|
134
|
-
# Parse the ChartData into a lookup table where we can retrieve attributes
|
135
|
-
# for each phoneme
|
136
|
-
def features
|
137
|
-
@features ||= begin
|
138
|
-
header, *manners = ChartData.lines
|
139
|
-
|
140
|
-
_, *positions = header.chomp.split(' | ')
|
141
|
-
positions.map(&:strip!)
|
142
|
-
|
143
|
-
# Remove any trailing blank lines
|
144
|
-
manners.pop while manners.last.to_s.strip.empty?
|
145
|
-
|
146
|
-
position_indexes = Hash[*positions.each_with_index.to_a.flatten]
|
147
|
-
|
148
|
-
@position_count = positions.size
|
149
|
-
|
150
|
-
manners.each_with_object({}) do |row, phonemes|
|
151
|
-
manner, *columns = row.chomp.split(' | ')
|
152
|
-
manner.strip!
|
153
|
-
positions.zip(columns).each do |position, phoneme_text|
|
154
|
-
data = {
|
155
|
-
position: position,
|
156
|
-
position_index: position_indexes[position],
|
157
|
-
manner: manner,
|
158
|
-
}
|
159
|
-
# If there is a character in the first byte then this articulation
|
160
|
-
# has a voiceless phoneme. The symbol may use additional characters
|
161
|
-
# as part of the phoneme symbol.
|
162
|
-
unless phoneme_text[0] == ' '
|
163
|
-
# Take the first non-blank character string
|
164
|
-
symbol = phoneme_text.chars.take_while { |char| char != ' ' }.join
|
165
|
-
phoneme_text = phoneme_text[symbol.chars.size..-1]
|
166
|
-
|
167
|
-
phonemes[symbol] = data.merge(voiced: false)
|
168
|
-
end
|
169
|
-
# If there's a character anywhere left in the string then this
|
170
|
-
# articulation has a voiced phoneme
|
171
|
-
unless phoneme_text.strip.empty?
|
172
|
-
symbol = phoneme_text.strip
|
173
|
-
phonemes[symbol] = data.merge(voiced: true)
|
174
|
-
end
|
175
|
-
end
|
176
|
-
end
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
def phonemes
|
181
|
-
@phonemes ||= features.keys
|
182
|
-
end
|
183
|
-
|
184
|
-
# Given two consonants, calculate their difference by summing the
|
185
|
-
# following:
|
186
|
-
# * 0.1 if they are not voiced the same
|
187
|
-
# * 0.3 if they are different manners
|
188
|
-
# * Up to 0.6 if they are the maximum position difference
|
189
|
-
def distance(phoneme1, phoneme2)
|
190
|
-
features1 = features[phoneme1]
|
191
|
-
features2 = features[phoneme2]
|
192
|
-
|
193
|
-
penalty = 0
|
194
|
-
penalty += 0.1 if features1[:voiced] != features2[:voiced]
|
195
|
-
|
196
|
-
penalty += 0.3 if features1[:manner] != features2[:manner]
|
197
|
-
|
198
|
-
# Use up to the remaining 0.6 for penalizing differences in position
|
199
|
-
penalty += 0.6 * ((features1[:position_index] - features2[:position_index]).abs / @position_count.to_f)
|
200
|
-
penalty
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
def phonemes
|
205
|
-
Vowels.phonemes + Consonants.phonemes
|
206
|
-
end
|
207
|
-
|
208
|
-
Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
|
209
|
-
Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
|
210
|
-
)
|
211
|
-
|
212
|
-
def distance(phoneme1, phoneme2)
|
213
|
-
return 0 if phoneme1 == phoneme2
|
214
|
-
|
215
|
-
distance_map.fetch(phoneme1).fetch(phoneme2)
|
216
|
-
end
|
217
|
-
|
218
|
-
def distance_map
|
219
|
-
@distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
220
|
-
p1, p2 = *pair
|
221
|
-
score = _distance(p1, p2)
|
222
|
-
scores[p1][p2] = score
|
223
|
-
scores[p2][p1] = score
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
private
|
228
|
-
|
229
|
-
def _distance(phoneme1, phoneme2)
|
230
|
-
types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
|
231
|
-
if types == %i[consonant vowel]
|
232
|
-
1.0
|
233
|
-
elsif types == %i[vowel vowel]
|
234
|
-
Vowels.distance(phoneme1, phoneme2)
|
235
|
-
elsif types == %i[consonant consonant]
|
236
|
-
Consonants.distance(phoneme1, phoneme2)
|
237
|
-
end
|
238
|
-
end
|
239
|
-
end
|
3
|
+
require 'phonetics/distances'
|
4
|
+
require 'phonetics/transcriptions'
|
@@ -0,0 +1,239 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'delegate'
|
4
|
+
|
5
|
+
module Phonetics
|
6
|
+
extend self
|
7
|
+
|
8
|
+
# This subclass of the stdlib's String allows us to iterate over each phoneme
|
9
|
+
# in a string without monkeypatching
|
10
|
+
#
|
11
|
+
# Usage:
|
12
|
+
# Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
|
13
|
+
# => ["w", "ə", "t", "ɛ", "v", "ɝ"]
|
14
|
+
class String < SimpleDelegator
|
15
|
+
# Group all phonemes by how many characters they have. Use this to walk
|
16
|
+
# through a string finding phonemes (looking for longest ones first)
|
17
|
+
def self.phonemes_by_length
|
18
|
+
@phonemes_by_length ||= Phonetics.phonemes.each_with_object(
|
19
|
+
# This relies on the implicit stable key ordering of Hash objects in Ruby
|
20
|
+
# 2+ to keep the keys in descending order.
|
21
|
+
4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new
|
22
|
+
) do |str, acc|
|
23
|
+
acc[str.chars.size] << str
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def each_phoneme
|
28
|
+
idx = 0
|
29
|
+
Enumerator.new do |y|
|
30
|
+
while idx < chars.length
|
31
|
+
found = false
|
32
|
+
self.class.phonemes_by_length.each do |size, phonemes|
|
33
|
+
next unless idx + size <= chars.length
|
34
|
+
|
35
|
+
candidate = chars[idx..idx + size - 1].join
|
36
|
+
next unless phonemes.include?(candidate)
|
37
|
+
|
38
|
+
y.yield candidate
|
39
|
+
idx += size
|
40
|
+
found = true
|
41
|
+
break
|
42
|
+
end
|
43
|
+
idx += 1 unless found
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
module Vowels
|
50
|
+
extend self
|
51
|
+
|
52
|
+
FormantFrequencies = {
|
53
|
+
# https://en.wikipedia.org/wiki/Formant#Phonetics
|
54
|
+
'i' => { F1: 240, F2: 2400, rounded: false },
|
55
|
+
'y' => { F1: 235, F2: 2100, rounded: false },
|
56
|
+
'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing From other vowels
|
57
|
+
'e' => { F1: 390, F2: 2300, rounded: false },
|
58
|
+
'ø' => { F1: 370, F2: 1900, rounded: true },
|
59
|
+
'ɛ' => { F1: 610, F2: 1900, rounded: false },
|
60
|
+
'œ' => { F1: 585, F2: 1710, rounded: true },
|
61
|
+
'a' => { F1: 850, F2: 1610, rounded: false },
|
62
|
+
'ɶ' => { F1: 820, F2: 1530, rounded: true },
|
63
|
+
'ɑ' => { F1: 750, F2: 940, rounded: false },
|
64
|
+
'ɒ' => { F1: 700, F2: 760, rounded: true },
|
65
|
+
|
66
|
+
'ʌ' => { F1: 600, F2: 1170, rounded: false },
|
67
|
+
# copying 'ʌ' for other mid-vowel formants
|
68
|
+
'ə' => { F1: 600, F2: 1170, rounded: false },
|
69
|
+
'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },
|
70
|
+
|
71
|
+
'ɔ' => { F1: 500, F2: 700, rounded: true },
|
72
|
+
'ɤ' => { F1: 460, F2: 1310, rounded: false },
|
73
|
+
'o' => { F1: 360, F2: 640, rounded: true },
|
74
|
+
'ɯ' => { F1: 300, F2: 1390, rounded: false },
|
75
|
+
'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing From other vowels
|
76
|
+
'u' => { F1: 350, F2: 650, rounded: true }, # Guessing From other vowels
|
77
|
+
'ʊ' => { F1: 350, F2: 650, rounded: true },
|
78
|
+
# Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
|
79
|
+
}.freeze
|
80
|
+
|
81
|
+
def phonemes
|
82
|
+
@phonemes ||= FormantFrequencies.keys
|
83
|
+
end
|
84
|
+
|
85
|
+
# Given two vowels, calculate the (pythagorean) distance between them using
|
86
|
+
# their F1 and F2 frequencies as x/y coordinates.
|
87
|
+
# The return value is scaled to a value between 0 and 1
|
88
|
+
# TODO: account for rhoticity (F3)
|
89
|
+
def distance(phoneme1, phoneme2)
|
90
|
+
formants1 = FormantFrequencies.fetch(phoneme1)
|
91
|
+
formants2 = FormantFrequencies.fetch(phoneme2)
|
92
|
+
|
93
|
+
@minmax_f1 ||= FormantFrequencies.values.minmax { |a, b| a[:F1] <=> b[:F1] }.map { |h| h[:F1] }
|
94
|
+
@minmax_f2 ||= FormantFrequencies.values.minmax { |a, b| a[:F2] <=> b[:F2] }.map { |h| h[:F2] }
|
95
|
+
|
96
|
+
# Get an x and y value for each input phoneme scaled between 0.0 and 1.0
|
97
|
+
# We'll use the scaled f1 as the 'x' and the scaled f2 as the 'y'
|
98
|
+
scaled_phoneme1_f1 = (formants1[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
|
99
|
+
scaled_phoneme1_f2 = (formants1[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
|
100
|
+
scaled_phoneme2_f1 = (formants2[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
|
101
|
+
scaled_phoneme2_f2 = (formants2[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
|
102
|
+
|
103
|
+
f1_distance = (scaled_phoneme1_f1 - scaled_phoneme2_f1).abs
|
104
|
+
f2_distance = (scaled_phoneme1_f2 - scaled_phoneme2_f2).abs
|
105
|
+
|
106
|
+
# When we have four values we can use the pythagorean theorem on them
|
107
|
+
# (order doesn't matter)
|
108
|
+
Math.sqrt((f1_distance**2) + (f2_distance**2))
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
module Consonants
|
113
|
+
extend self
|
114
|
+
|
115
|
+
# This chart (columns 2 through the end, anyway) is a direct port of
|
116
|
+
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
117
|
+
# We store the consonant table in this format to make updating it easier.
|
118
|
+
#
|
119
|
+
# rubocop:disable Layout/TrailingWhitespace
|
120
|
+
ChartData = %( | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
|
121
|
+
Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
|
122
|
+
Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
|
123
|
+
Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
|
124
|
+
Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
|
125
|
+
Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
|
126
|
+
Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
|
127
|
+
Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
|
128
|
+
Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
|
129
|
+
Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
|
130
|
+
Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
|
131
|
+
)
|
132
|
+
# rubocop:enable Layout/TrailingWhitespace
|
133
|
+
|
134
|
+
# Parse the ChartData into a lookup table where we can retrieve attributes
|
135
|
+
# for each phoneme
|
136
|
+
def features
|
137
|
+
@features ||= begin
|
138
|
+
header, *manners = ChartData.lines
|
139
|
+
|
140
|
+
_, *positions = header.chomp.split(' | ')
|
141
|
+
positions.map(&:strip!)
|
142
|
+
|
143
|
+
# Remove any trailing blank lines
|
144
|
+
manners.pop while manners.last.to_s.strip.empty?
|
145
|
+
|
146
|
+
position_indexes = Hash[*positions.each_with_index.to_a.flatten]
|
147
|
+
|
148
|
+
@position_count = positions.size
|
149
|
+
|
150
|
+
manners.each_with_object({}) do |row, phonemes|
|
151
|
+
manner, *columns = row.chomp.split(' | ')
|
152
|
+
manner.strip!
|
153
|
+
positions.zip(columns).each do |position, phoneme_text|
|
154
|
+
data = {
|
155
|
+
position: position,
|
156
|
+
position_index: position_indexes[position],
|
157
|
+
manner: manner,
|
158
|
+
}
|
159
|
+
# If there is a character in the first byte then this articulation
|
160
|
+
# has a voiceless phoneme. The symbol may use additional characters
|
161
|
+
# as part of the phoneme symbol.
|
162
|
+
unless phoneme_text[0] == ' '
|
163
|
+
# Take the first non-blank character string
|
164
|
+
symbol = phoneme_text.chars.take_while { |char| char != ' ' }.join
|
165
|
+
phoneme_text = phoneme_text[symbol.chars.size..-1]
|
166
|
+
|
167
|
+
phonemes[symbol] = data.merge(voiced: false)
|
168
|
+
end
|
169
|
+
# If there's a character anywhere left in the string then this
|
170
|
+
# articulation has a voiced phoneme
|
171
|
+
unless phoneme_text.strip.empty?
|
172
|
+
symbol = phoneme_text.strip
|
173
|
+
phonemes[symbol] = data.merge(voiced: true)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
def phonemes
|
181
|
+
@phonemes ||= features.keys
|
182
|
+
end
|
183
|
+
|
184
|
+
# Given two consonants, calculate their difference by summing the
|
185
|
+
# following:
|
186
|
+
# * 0.1 if they are not voiced the same
|
187
|
+
# * 0.3 if they are different manners
|
188
|
+
# * Up to 0.6 if they are the maximum position difference
|
189
|
+
def distance(phoneme1, phoneme2)
|
190
|
+
features1 = features[phoneme1]
|
191
|
+
features2 = features[phoneme2]
|
192
|
+
|
193
|
+
penalty = 0
|
194
|
+
penalty += 0.1 if features1[:voiced] != features2[:voiced]
|
195
|
+
|
196
|
+
penalty += 0.3 if features1[:manner] != features2[:manner]
|
197
|
+
|
198
|
+
# Use up to the remaining 0.6 for penalizing differences in position
|
199
|
+
penalty += 0.6 * ((features1[:position_index] - features2[:position_index]).abs / @position_count.to_f)
|
200
|
+
penalty
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
def phonemes
|
205
|
+
Vowels.phonemes + Consonants.phonemes
|
206
|
+
end
|
207
|
+
|
208
|
+
Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
|
209
|
+
Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
|
210
|
+
)
|
211
|
+
|
212
|
+
def distance(phoneme1, phoneme2)
|
213
|
+
return 0 if phoneme1 == phoneme2
|
214
|
+
|
215
|
+
distance_map.fetch(phoneme1).fetch(phoneme2)
|
216
|
+
end
|
217
|
+
|
218
|
+
def distance_map
|
219
|
+
@distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
220
|
+
p1, p2 = *pair
|
221
|
+
score = _distance(p1, p2)
|
222
|
+
scores[p1][p2] = score
|
223
|
+
scores[p2][p1] = score
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
private
|
228
|
+
|
229
|
+
def _distance(phoneme1, phoneme2)
|
230
|
+
types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
|
231
|
+
if types == %i[consonant vowel]
|
232
|
+
1.0
|
233
|
+
elsif types == %i[vowel vowel]
|
234
|
+
Vowels.distance(phoneme1, phoneme2)
|
235
|
+
elsif types == %i[consonant consonant]
|
236
|
+
Consonants.distance(phoneme1, phoneme2)
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|