phonetics 3.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -2
- data/Cargo.toml +27 -0
- data/Rakefile +58 -26
- data/VERSION +1 -1
- data/bin/phonetics +89 -0
- data/ext/phonetics_ruby/Cargo.toml +36 -0
- data/ext/phonetics_ruby/build.rs +24 -0
- data/ext/phonetics_ruby/extconf.rb +17 -0
- data/ext/phonetics_ruby/src/lib.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
- data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
- data/lib/phonetics.rb +77 -2
- data/phonetics.gemspec +33 -9
- metadata +45 -34
- data/.github/workflows/gempush.yml +0 -28
- data/.github/workflows/test.yml +0 -20
- data/Makefile +0 -9
- data/ext/c_levenshtein/extconf.rb +0 -10
- data/ext/c_levenshtein/levenshtein.c +0 -223
- data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
- data/ext/c_levenshtein/next_phoneme_length.h +0 -1
- data/ext/c_levenshtein/phonemes.c +0 -53
- data/ext/c_levenshtein/phonemes.h +0 -3
- data/ext/c_levenshtein/phonetic_cost.c +0 -88593
- data/ext/c_levenshtein/phonetic_cost.h +0 -1
- data/lib/phonetics/code_generator.rb +0 -228
- data/lib/phonetics/distances.rb +0 -249
- data/lib/phonetics/levenshtein.rb +0 -27
- data/lib/phonetics/ruby_levenshtein.rb +0 -162
|
@@ -1 +0,0 @@
|
|
|
1
|
-
/*
 * Phonetic cost between two phonemes, each passed as a 64-bit integer code.
 * NOTE(review): the Ruby generator removed in this same diff emits the
 * definition (0.0 for equal codes, a nested switch lookup otherwise, 1.0 when
 * nothing matches) and writes each case label as the binary form of the
 * phoneme's bytes -- confirm against the generated phonetic_cost.c.
 */
float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
|
|
@@ -1,228 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative '../phonetics'
|
|
4
|
-
require 'json'
|
|
5
|
-
|
|
6
|
-
module Phonetics
|
|
7
|
-
# Shared plumbing for the generators that print C source code: it owns the
# output stream and the helpers used to render phonemes, byte tries and
# indented lines.
class CodeGenerator
  attr_reader :writer

  # @param writer [#puts, #flush] stream that receives the generated code
  def initialize(writer = $stdout)
    @writer = writer
  end

  # Prints the C source produced by PhoneticCost and flushes the writer.
  def generate_phonetic_cost_c_code
    PhoneticCost.new(writer).generate
    writer.flush
  end

  # Prints the C source produced by NextPhonemeLength and flushes the writer.
  def generate_next_phoneme_length_c_code
    NextPhonemeLength.new(writer).generate
    writer.flush
  end

  private

  # Renders the bytes of +str+ as a single binary integer literal, eight bits
  # per byte: "a" => "0b01100001".
  def binary(str)
    "0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
  end

  # Byte trie covering every phoneme returned by Phonetics.phonemes.
  def phoneme_byte_trie
    phoneme_byte_trie_for(Phonetics.phonemes)
  end

  # Builds a nested Hash keyed by byte value. The node reached by the last
  # byte of a phoneme stores that phoneme under the :source key.
  #
  # @param phonemes [Enumerable<String>]
  # @return [Hash]
  # @raise [RuntimeError] when two phonemes share the same byte sequence
  def phoneme_byte_trie_for(phonemes)
    phonemes.each_with_object({}) do |phoneme, trie|
      bytes = phoneme.bytes
      final_index = bytes.length - 1

      bytes.each_with_index.reduce(trie) do |node, (byte, idx)|
        child = (node[byte] ||= {})

        if idx == final_index
          # A second phoneme ending on the same node would make the lookup
          # ambiguous, so refuse to build the trie at all.
          if child.key?(:source)
            raise "Duplicate byte sequence on #{phoneme.inspect} & #{child[:source].inspect} (#{bytes.inspect})"
          end

          child[:source] = phoneme
        end

        child
      end
    end
  end

  # Writes two comment lines describing +phoneme+: its bytes, then its
  # consonant features when Phonetics::Consonants knows it, otherwise its
  # entry in the vowel formant table.
  def describe(phoneme, depth = 0)
    indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
    if Phonetics::Consonants.features.key?(phoneme)
      indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
    else
      indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
    end
  end

  # "path:line" of the code that called this helper, keeping at most the last
  # four path segments. `last(4)` also copes with shorter paths, where
  # slicing with [-4..] returns nil.
  def ruby_source
    location = caller_locations.first
    "#{location.path.split('/').last(4).join('/')}:#{location.lineno}"
  end

  # Writes +line+ prefixed by one space plus one extra space per +depth+.
  def indent(depth, line)
    write " #{' ' * depth}#{line}"
  end

  # Writes one line to the output stream.
  def write(line)
    writer.puts line
  end
end
|
|
81
|
-
|
|
82
|
-
# Prints the C source of `phonetic_cost`, a lookup of the distance between
# two phonemes implemented as nested switch statements.
#
# The emitted function takes two int64_t phoneme codes. It returns 0.0 when
# the codes are equal, the stored distance when both codes are found, and 1.0
# otherwise. Each case label is the binary literal built from the phoneme's
# bytes. Sketch of the emitted code:
#
#   switch (phoneme1) {
#     case <bits of phoneme1>:
#       // Phoneme / feature comments
#       switch(phoneme2) {
#         case <bits of phoneme2>:
#           // Phoneme / feature comments
#           return (float) <distance>;
#           break;
#       }
#       break;
#   }
#   return (float) 1.0;
class PhoneticCost < CodeGenerator
  def generate
    # The heredoc stays in this method so that ruby_source reports this
    # method as the origin of the generated file.
    write(<<-HEADER.gsub(/^ {6}/, ''))

      // This is compiled from Ruby, in #{ruby_source}
      #include <stdint.h>
      #include <stdio.h>
      #include <inttypes.h>
      float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
      if (phoneme1 == phoneme2) {
      return (float) 0.0;
      }

    HEADER

    write ' switch (phoneme1) {'
    Phonetics.phonemes.each { |first| write_outer_case(first) }
    write ' }'
    write ' return (float) 1.0;'
    write '};'
    write ''
  end

  private

  # One outer `case` for +first+, wrapping an inner switch over every phoneme
  # that has a recorded distance from it.
  def write_outer_case(first)
    write " case #{binary(first)}:"
    describe(first, 2)
    write ' switch(phoneme2) {'
    Phonetics.distance_map[first].each_pair { |second, cost| write_inner_case(second, cost) }
    write ' }'
    write ' break;'
  end

  # One inner `case` returning +cost+ for +second+.
  def write_inner_case(second, cost)
    write " case #{binary(second)}:"
    describe(second, 6)
    write " return (float) #{cost};"
    write ' break;'
  end
end
|
|
141
|
-
|
|
142
|
-
# Prints the C source of `next_phoneme_length(int *string, int cursor, int
# length)`, which walks the phoneme byte trie as nested switch statements.
#
# Shape of the emitted code, for a trie node at a given depth:
#   switch(string[cursor + depth]) {
#     case <byte>:                      // one per child byte
#       - leaf child: return depth + 1
#       - otherwise:  if (max_length > depth + 1) { <switch for the child> }
#                     else { return depth + 1; }
#       break;
#     default:                          // only when a phoneme ends at this node
#       return depth;
#   }
# After the outermost switch the function returns 0.
#
# NOTE(review): the `else` branch returns depth + 1 even when no phoneme ends
# on that child node -- confirm this is what the C caller expects.
class NextPhonemeLength < CodeGenerator
  def generate
    # Kept inline so ruby_source reports this method as the origin.
    write(<<-HEADER.gsub(/^ {6}/, ''))
      // This is compiled from Ruby, in #{ruby_source}
      #include <stdio.h>
      int next_phoneme_length(int *string, int cursor, int length) {

      int max_length;
      max_length = length - cursor;

    HEADER

    next_phoneme_switch(phoneme_byte_trie, 0)

    # Nothing matched in any switch: report "no phoneme here".
    write ' return 0;'
    write '}'
  end

  private

  # Emits the switch statement for +trie+ and recurses into every child that
  # has further children of its own.
  def next_phoneme_switch(trie, depth)
    consumed = depth + 1

    indent depth, "switch(string[cursor + #{depth}]) {"
    write ''

    trie.each do |byte, node|
      # :source holds the phoneme itself rather than a child node.
      next if byte == :source || node.empty?

      indent depth, "case #{byte}:"

      # Comment lines that make the generated file easier to read.
      phoneme = node[:source]
      describe(phoneme, consumed) if phoneme

      if node.keys == [:source]
        indent depth, " return #{consumed};"
      else
        indent depth, " if (max_length > #{consumed}) {"
        next_phoneme_switch(node, consumed)
        indent depth, ' } else {'
        indent depth, " return #{consumed};"
        indent depth, ' }'
      end

      indent depth, ' break;'
    end

    if trie.key?(:source)
      indent depth, ' default:'
      indent depth, " return #{depth};"
    end
    indent depth, '}'
  end
end
|
|
228
|
-
end
|
data/lib/phonetics/distances.rb
DELETED
|
@@ -1,249 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'delegate'
|
|
4
|
-
require 'set'
|
|
5
|
-
|
|
6
|
-
module Phonetics
|
|
7
|
-
extend self
|
|
8
|
-
|
|
9
|
-
# This subclass of the stdlib's String allows us to iterate over each phoneme
|
|
10
|
-
# in a string without monkeypatching
|
|
11
|
-
#
|
|
12
|
-
# Usage:
|
|
13
|
-
# Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
|
|
14
|
-
# => ["w", "ə", "t", "ɛ", "v", "ɝ"]
|
|
15
|
-
class String < SimpleDelegator
|
|
16
|
-
# Groups every phoneme from Phonetics.phonemes by its length in characters.
# The Hash is ordered from the longest length to the shortest so that callers
# iterating it try the longest candidates first. Lengths 4 down to 1 are
# always present (possibly empty); longer phonemes get a bucket of their own
# instead of failing, and empty strings are ignored.
#
# @return [Hash{Integer => Set<String>}] memoized after the first call
def self.phonemes_by_length
  @phonemes_by_length ||= begin
    buckets = { 4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new }

    Phonetics.phonemes.each do |phoneme|
      size = phoneme.chars.size
      next if size.zero?

      (buckets[size] ||= Set.new) << phoneme
    end

    # Hashes keep insertion order, so rebuild in descending key order.
    buckets.sort_by { |size, _| -size }.to_h
  end
end
|
|
27
|
-
|
|
28
|
-
# Returns an Enumerator over the phonemes found in this string, scanning left
# to right. At each position the candidates from phonemes_by_length are tried
# in that Hash's order; a character that starts no known phoneme is skipped.
#
# The cursor is created inside the Enumerator so the same enumerator can be
# traversed more than once, and the character list is built once per
# traversal instead of once per step.
#
# @return [Enumerator<String>]
def each_phoneme
  Enumerator.new do |y|
    symbols = chars
    buckets = self.class.phonemes_by_length
    idx = 0

    while idx < symbols.length
      size, = buckets.find do |length, phonemes|
        idx + length <= symbols.length && phonemes.include?(symbols[idx, length].join)
      end

      if size
        y.yield symbols[idx, size].join
        idx += size
      else
        idx += 1
      end
    end
  end
end
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
# Vowel inventory and vowel-to-vowel distance, based on the first two formant
# frequencies of each vowel.
module Vowels
  extend self

  FormantFrequencies = {
    # https://en.wikipedia.org/wiki/Formant#Phonetics
    'i' => { F1: 240, F2: 2400, rounded: false },
    # 'y' is the close front rounded vowel, so it is flagged as rounded.
    'y' => { F1: 235, F2: 2100, rounded: true },
    'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing from other vowels
    'e' => { F1: 390, F2: 2300, rounded: false },
    'ø' => { F1: 370, F2: 1900, rounded: true },
    'ɛ' => { F1: 610, F2: 1900, rounded: false },
    'œ' => { F1: 585, F2: 1710, rounded: true },
    'a' => { F1: 850, F2: 1610, rounded: false },
    'ɶ' => { F1: 820, F2: 1530, rounded: true },
    'ɑ' => { F1: 750, F2: 940, rounded: false },
    'ɒ' => { F1: 700, F2: 760, rounded: true },

    'ʌ' => { F1: 600, F2: 1170, rounded: false },
    # copying 'ʌ' for other mid-vowel formants
    'ə' => { F1: 600, F2: 1170, rounded: false },
    'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },

    'ɔ' => { F1: 500, F2: 700, rounded: true },
    'ɤ' => { F1: 460, F2: 1310, rounded: false },
    'o' => { F1: 360, F2: 640, rounded: true },
    'ɯ' => { F1: 300, F2: 1390, rounded: false },
    'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing from other vowels
    'u' => { F1: 350, F2: 650, rounded: true }, # Guessing from other vowels
    'ʊ' => { F1: 350, F2: 650, rounded: true },
    # Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
  }.freeze

  # @return [Array<String>] every vowel symbol in the formant table
  def phonemes
    @phonemes ||= FormantFrequencies.keys
  end

  # Distance between two vowels: the Euclidean distance between their scaled
  # (F1, F2) points, halved so that vowels count as closer to each other than
  # consonants do.
  #
  # Each formant is shifted by the table minimum and divided by the table
  # maximum, so a scaled value lies between 0 and (max - min) / max rather
  # than between 0 and 1.
  # NOTE(review): kept as is because every returned distance depends on it;
  # dividing by (max - min) would be the usual min-max scaling.
  # TODO: account for rhoticity (F3)
  #
  # @raise [KeyError] when either symbol is missing from the formant table
  def distance(phoneme1, phoneme2)
    formants1 = FormantFrequencies.fetch(phoneme1)
    formants2 = FormantFrequencies.fetch(phoneme2)

    f1_gap = (scaled(formants1, :F1) - scaled(formants2, :F1)).abs
    f2_gap = (scaled(formants1, :F2) - scaled(formants2, :F2)).abs

    Math.sqrt((f1_gap**2) + (f2_gap**2)) / 2.0
  end

  private

  # [lowest, highest] value of +formant+ across the table, computed once.
  def formant_bounds(formant)
    @formant_bounds ||= {}
    @formant_bounds[formant] ||= FormantFrequencies.values.map { |entry| entry[formant] }.minmax
  end

  # Scaled coordinate of one formant, as described on #distance.
  def scaled(formants, formant)
    lowest, highest = formant_bounds(formant)
    (formants[formant] - lowest) / highest.to_f
  end
end
|
|
116
|
-
|
|
117
|
-
module Consonants
|
|
118
|
-
extend self
|
|
119
|
-
|
|
120
|
-
# This chart (columns 2 through the end, anyway) is a direct port of
|
|
121
|
-
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
|
122
|
-
# We store the consonant table in this format to make updating it easier.
|
|
123
|
-
#
|
|
124
|
-
# rubocop:disable Layout/TrailingWhitespace
|
|
125
|
-
ChartData = %( | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
|
|
126
|
-
Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
|
|
127
|
-
Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
|
|
128
|
-
Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
|
|
129
|
-
Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
|
|
130
|
-
Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
|
|
131
|
-
Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
|
|
132
|
-
Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
|
|
133
|
-
Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
|
|
134
|
-
Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
|
|
135
|
-
Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
|
|
136
|
-
)
|
|
137
|
-
# rubocop:enable Layout/TrailingWhitespace
|
|
138
|
-
|
|
139
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
140
|
-
# rubocop:disable Metrics/PerceivedComplexity
|
|
141
|
-
# Parses ChartData into a lookup table of consonant features, memoized after
# the first call. The first line names the positions (columns); every later
# line is one manner (row). Inside a cell, a symbol that starts at the first
# character is the voiceless phoneme, and whatever follows the first space is
# the voiced phoneme. Also records the number of positions in
# @position_count.
#
# Cells missing from the end of a row (String#split drops trailing empty
# fields), empty cells and blank rows are skipped.
#
# @return [Hash{String => Hash}] symbol => { position:, position_index:,
#   manner:, voiced: }
def features
  @features ||= begin
    header, *rows = ChartData.lines

    positions = header.to_s.chomp.split(' | ').drop(1).map(&:strip)

    # Remove any trailing blank lines (stopping once no rows are left).
    rows.pop while !rows.empty? && rows.last.strip.empty?

    position_indexes = positions.each_with_index.to_h
    @position_count = positions.size

    rows.each_with_object({}) do |row, table|
      manner, *cells = row.chomp.split(' | ')
      next if manner.nil?

      manner = manner.strip

      positions.zip(cells) do |position, cell|
        next if cell.nil?

        shared = {
          position: position,
          position_index: position_indexes[position],
          manner: manner,
        }

        # Text before the first space is the voiceless symbol (empty when the
        # cell starts with a space); the rest, once stripped, is the voiced one.
        voiceless, _, remainder = cell.partition(' ')
        table[voiceless] = shared.merge(voiced: false) unless voiceless.empty?

        voiced = remainder.strip
        table[voiced] = shared.merge(voiced: true) unless voiced.empty?
      end
    end
  end
end
|
|
186
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
187
|
-
# rubocop:enable Metrics/PerceivedComplexity
|
|
188
|
-
|
|
189
|
-
# Every consonant symbol in the parsed feature table, cached after first use.
def phonemes
  return @phonemes if @phonemes

  @phonemes = features.keys
end
|
|
192
|
-
|
|
193
|
-
# Given two consonants, calculate their difference by summing:
# * 0.3 if they are not voiced the same
# * 0.3 if they have different manners
# * up to 0.4 in proportion to how far apart their positions are
#
# @raise [KeyError] when either symbol is not a known consonant
def distance(phoneme1, phoneme2)
  # Calling features first also guarantees @position_count has been set.
  table = features
  features1 = table.fetch(phoneme1)
  features2 = table.fetch(phoneme2)

  penalty = 0
  penalty += 0.3 if features1[:voiced] != features2[:voiced]
  penalty += 0.3 if features1[:manner] != features2[:manner]

  # Use up to the remaining 0.4 for penalizing differences in position
  position_gap = (features1[:position_index] - features2[:position_index]).abs
  penalty + (0.4 * (position_gap / @position_count.to_f))
end
|
|
211
|
-
end
|
|
212
|
-
|
|
213
|
-
# All known phoneme symbols: the vowels first, then the consonants.
def phonemes
  [*Vowels.phonemes, *Consonants.phonemes]
end
|
|
216
|
-
|
|
217
|
-
# Maps every phoneme symbol to :consonant or :vowel. Consonants are inserted
# first; a symbol present in both lists ends up tagged :vowel.
Symbols = Consonants.phonemes.to_h { |phoneme| [phoneme, :consonant] }.merge(
  Vowels.phonemes.to_h { |phoneme| [phoneme, :vowel] }
)
|
|
220
|
-
|
|
221
|
-
# Distance between two phonemes: 0 for identical symbols, otherwise the value
# stored in distance_map. Raises KeyError when a symbol is not in the map.
def distance(phoneme1, phoneme2)
  if phoneme1 == phoneme2
    0
  else
    distance_map.fetch(phoneme1).fetch(phoneme2)
  end
end
|
|
226
|
-
|
|
227
|
-
# Nested Hash of the distance between every pair of distinct phonemes,
# readable in both directions (map[a][b] and map[b][a]), built once and
# memoized.
#
# Each unordered pair is scored a single time and stored under both
# orderings, rather than being scored once per ordering.
#
# @return [Hash{String => Hash{String => Float}}]
def distance_map
  @distance_map ||= phonemes.combination(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |(p1, p2), scores|
    score = _distance(p1, p2)
    scores[p1][p2] = score
    scores[p2][p1] = score
  end
end
|
|
235
|
-
|
|
236
|
-
private
|
|
237
|
-
|
|
238
|
-
# Uncached distance between two phonemes, chosen by their classes in Symbols:
# 1.0 for a consonant/vowel pair, otherwise the distance defined by Vowels or
# Consonants. Raises KeyError for a symbol that is not in Symbols.
def _distance(phoneme1, phoneme2)
  kinds = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort

  if kinds == %i[consonant vowel]
    1.0
  elsif kinds == %i[vowel vowel]
    Vowels.distance(phoneme1, phoneme2)
  elsif kinds == %i[consonant consonant]
    Consonants.distance(phoneme1, phoneme2)
  end
end
|
|
249
|
-
end
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative '../phonetics'
|
|
4
|
-
require_relative 'c_levenshtein'
|
|
5
|
-
|
|
6
|
-
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
|
7
|
-
# count used instead of a binary edit distance calculation
|
|
8
|
-
#
|
|
9
|
-
# This implementation was dually inspired by the damerau-levenshtein gem
|
|
10
|
-
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
|
11
|
-
# and "Using Phonologically Weighted Levenshtein Distances for the Prediction
|
|
12
|
-
# of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
|
|
13
|
-
# Farinas, Julien Pinquier, Xavier Aumont, 2016
|
|
14
|
-
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
|
15
|
-
module Phonetics
|
|
16
|
-
module Levenshtein
|
|
17
|
-
extend ::PhoneticsLevenshteinCBinding
|
|
18
|
-
|
|
19
|
-
# rubocop:disable Style/OptionalBooleanParameter
|
|
20
|
-
# Phonetic edit distance between two strings, delegated to
# internal_phonetic_distance. Returns nil when either argument is nil.
def self.distance(str1, str2, verbose = false)
  if str1.nil? || str2.nil?
    nil
  else
    internal_phonetic_distance(str1, str2, verbose)
  end
end
|
|
25
|
-
# rubocop:enable Style/OptionalBooleanParameter
|
|
26
|
-
end
|
|
27
|
-
end
|
|
@@ -1,162 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative '../phonetics'
|
|
4
|
-
|
|
5
|
-
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
|
6
|
-
# count used instead of a binary edit distance calculation
|
|
7
|
-
#
|
|
8
|
-
# This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
|
|
9
|
-
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
|
10
|
-
# The implementation is modified based on "Using Phonologically Weighted
|
|
11
|
-
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
|
12
|
-
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
|
13
|
-
# Aumont, 2016
|
|
14
|
-
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
|
15
|
-
module Phonetics
|
|
16
|
-
# Pure-Ruby phonetically weighted edit distance between two IPA strings.
#
# A matrix with one row per character of the second string and one column per
# character of the first is filled so that each cell holds the cheapest
# neighbouring cell plus the phonetic distance between the two characters the
# cell stands for. The bottom-right cell is the result.
#
# NOTE(review): input is filtered character by character, so a phoneme
# written with several code points is only kept when each character is itself
# listed in Phonetics.phonemes -- confirm this is intended.
class RubyLevenshtein
  attr_reader :str1, :str2, :len1, :len2, :matrix

  # rubocop:disable Style/OptionalBooleanParameter

  # @param ipa_str1 [String]
  # @param ipa_str2 [String]
  # @param verbose [Boolean] print the matrix while it is being filled
  def initialize(ipa_str1, ipa_str2, verbose = false)
    # Build the lookup once instead of rebuilding the phoneme list per character.
    known = Phonetics.phonemes.each_with_object({}) { |phoneme, seen| seen[phoneme] = true }

    @str1 = ipa_str1.each_char.select { |c| known.key?(c) }.join
    @str2 = ipa_str2.each_char.select { |c| known.key?(c) }.join
    @len1 = @str1.size
    @len2 = @str2.size
    @verbose = verbose
    prepare_matrix
    set_edit_distances(@str1, @str2)
  end

  # Convenience wrapper: build an instance and return its distance.
  def self.distance(str1, str2, verbose = false)
    new(str1, str2, verbose).distance
  end
  # rubocop:enable Style/OptionalBooleanParameter

  # @return [Numeric] the bottom-right matrix cell, or 0 when both strings
  #   are empty after filtering
  def distance
    path = walk
    return 0 if path.empty?

    print_matrix if @verbose
    path.last[:distance]
  end

  private

  # Walks back from the bottom-right cell until the first row or column is
  # reached. Steps are returned oldest first, so the last element describes
  # the bottom-right cell.
  def walk
    res = []
    i = len2
    j = len1
    return res if i == 0 && j == 0

    loop do
      i, j, char = char_data(i, j)
      res.unshift char
      break if i == 0 || j == 0
    end
    res
  end

  # Fills every cell outside the first row and column.
  def set_edit_distances(str1, str2)
    1.upto(len2) do |i|
      1.upto(len1) do |j|
        cheapest = [ins(i, j), del(i, j), subst(i, j)].min
        # This is where we implement the modifications to Damerau-Levenshtein
        # according to https://hal.archives-ouvertes.fr/hal-01474904/document
        phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
        matrix[i][j] = cheapest + phonetic_cost
        puts "------- #{j}/#{i} #{j + (i * (len1 + 1))}" if @verbose
        print_matrix if @verbose
      end
    end
  end

  # Describes cell (i, j) and returns the coordinates of the cell to visit next.
  def char_data(i, j)
    char = { distance: matrix[i][j] }
    operation, move = find_previous(i, j)
    char[:type] = move[:value] == char[:distance] ? :same : operation
    i, j = move[:move_to]
    [i, j, char]
  end

  # Picks the cheapest in-bounds neighbour of (i, j): left (insert), up
  # (delete) or diagonal (substitute). Out-of-bounds neighbours are dropped
  # before the matrix is read so that negative indexes never wrap around.
  def find_previous(i, j)
    [
      [:insert, [i, j - 1]],
      [:delete, [i - 1, j]],
      [:substitute, [i - 1, j - 1]]
    ].select do |_operation, (row, col)|
      # Don't send us out of bounds
      row >= 0 && col >= 0
    end.map do |operation, (row, col)|
      [operation, { value: matrix[row][col], move_to: [row, col] }]
    end.min_by do |_operation, data|
      # pick the cheapest one
      data[:value]
    end
  end

  # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
  def del(i, j)
    matrix[i - 1][j]
  end

  def ins(i, j)
    matrix[i][j - 1]
  end

  def subst(i, j)
    matrix[i - 1][j - 1]
  end

  # Starting values for the first row (str1) and first column (str2).
  #
  # Each list starts at zero; the second value is 1, or 0 when either string
  # is empty; every later value adds the phonetic distance between the two
  # adjacent phonemes of that string. A list has exactly one entry more than
  # its string has characters.
  def initial_distances(str1, str2)
    starting_distance = len1 == 0 || len2 == 0 ? 0 : 1

    [cumulative_distances(str1, starting_distance), cumulative_distances(str2, starting_distance)]
  end

  # Running total described on #initial_distances, for one string.
  def cumulative_distances(str, starting_distance)
    return [0] if str.empty?

    (1...str.length).reduce([0, starting_distance]) do |acc, i|
      acc << acc.last + Phonetics.distance(str[i - 1], str[i])
    end
  end

  # Allocates the matrix and seeds its first row and first column.
  def prepare_matrix
    str1_initial, str2_initial = initial_distances(str1, str2)

    @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) }
    # The first row holds the initial values computed from str1
    @matrix[0] = str1_initial
    # The first column holds the initial values computed from str2
    (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
  end

  # This is a helper method for developers to use when exploring this
  # algorithm. Cells that are not filled yet print as zero.
  def print_matrix
    puts " #{str1.chars.map { |c| c.ljust(9, ' ') }.join}"
    matrix.each_with_index do |row, ridx|
      print ' ' if ridx == 0
      print "#{str2[ridx - 1]} " if ridx > 0
      row.each do |cell|
        cell ||= 0.0
        print cell.to_s[0, 8].ljust(8, '0')
        print ' '
      end
      puts ''
    end
    ''
  end
end
|
|
162
|
-
end
|