phonetics 1.5.4 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +22 -11
- data/VERSION +1 -1
- data/bin/console +13 -0
- data/ext/c_levenshtein/levenshtein.c +104 -76
- data/ext/c_levenshtein/next_phoneme_length.c +1364 -0
- data/ext/c_levenshtein/next_phoneme_length.h +1 -0
- data/ext/c_levenshtein/phonemes.c +33 -0
- data/ext/c_levenshtein/phonemes.h +2 -0
- data/ext/c_levenshtein/phonetic_cost.c +134245 -42305
- data/ext/c_levenshtein/phonetic_cost.h +1 -1
- data/lib/phonetics.rb +2 -90
- data/lib/phonetics/code_generator.rb +285 -0
- data/lib/phonetics/levenshtein.rb +12 -21
- data/lib/phonetics/ruby_levenshtein.rb +5 -14
- metadata +8 -2
@@ -1 +1 @@
|
|
1
|
-
float phonetic_cost(
|
1
|
+
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
|
data/lib/phonetics.rb
CHANGED
@@ -112,22 +112,6 @@ module Phonetics
|
|
112
112
|
module Consonants
|
113
113
|
extend self
|
114
114
|
|
115
|
-
# Plosives and fricatives are less similar than trills and flaps, or
|
116
|
-
# sibilant fricatives and non-sibilant fricatives
|
117
|
-
# TODO: this is unfinished and possibly a bad idea
|
118
|
-
MannerDistances = {
|
119
|
-
'Nasal' => %w[continuant],
|
120
|
-
'Stop' => %w[],
|
121
|
-
'Sibilant fricative' => %w[continuant fricative],
|
122
|
-
'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
|
123
|
-
'Approximant' => %w[],
|
124
|
-
'Tap/Flap' => %w[],
|
125
|
-
'Trill' => %w[],
|
126
|
-
'Lateral fricative' => %w[continuant fricative],
|
127
|
-
'Lateral approximant' => %w[],
|
128
|
-
'Lateral tap/flap' => %w[],
|
129
|
-
}.freeze
|
130
|
-
|
131
115
|
# This chart (columns 2 through the end, anyway) is a direct port of
|
132
116
|
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
133
117
|
# We store the consonant table in this format to make updating it easier.
|
@@ -218,7 +202,7 @@ module Phonetics
|
|
218
202
|
end
|
219
203
|
|
220
204
|
def phonemes
|
221
|
-
|
205
|
+
Vowels.phonemes + Consonants.phonemes
|
222
206
|
end
|
223
207
|
|
224
208
|
Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
|
@@ -232,9 +216,7 @@ module Phonetics
|
|
232
216
|
end
|
233
217
|
|
234
218
|
def distance_map
|
235
|
-
@distance_map ||= (
|
236
|
-
Vowels.phonemes + Consonants.phonemes
|
237
|
-
).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
219
|
+
@distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
238
220
|
p1, p2 = *pair
|
239
221
|
score = _distance(p1, p2)
|
240
222
|
scores[p1][p2] = score
|
@@ -242,76 +224,6 @@ module Phonetics
|
|
242
224
|
end
|
243
225
|
end
|
244
226
|
|
245
|
-
# as_utf_8_long("aɰ̊ h")
|
246
|
-
# => [97, 8404, 32, 104]
|
247
|
-
def as_utf_8_long(string)
|
248
|
-
string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
|
249
|
-
end
|
250
|
-
|
251
|
-
# Encode individual multi-byte strings as a single integer.
|
252
|
-
#
|
253
|
-
# "ɰ̊".unpack('U*')
|
254
|
-
# => [624, 778]
|
255
|
-
#
|
256
|
-
# grapheme_as_utf_8_long("ɰ̊")
|
257
|
-
# => 1413 (624 + (10 * 778))
|
258
|
-
def grapheme_as_utf_8_long(grapheme)
|
259
|
-
grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
|
260
|
-
total + (10**i) * byte
|
261
|
-
end
|
262
|
-
end
|
263
|
-
|
264
|
-
# This will print a C code file with a function that implements a two-level C
|
265
|
-
# switch like the following:
|
266
|
-
#
|
267
|
-
# switch (a) {
|
268
|
-
# case 100: // 'd'
|
269
|
-
# switch (b) {
|
270
|
-
# case 618: // 'ɪ'
|
271
|
-
# return (float) 0.73827;
|
272
|
-
# break;
|
273
|
-
# }
|
274
|
-
# }
|
275
|
-
#
|
276
|
-
def generate_phonetic_cost_c_code(writer = STDOUT)
|
277
|
-
# First, flatten the bytes of the runes (unicode codepoints encoded via
|
278
|
-
# UTF-8) into single integers. We do this by adding the utf-8 values, each
|
279
|
-
# multiplied by 10 * their byte number. The specific encoding doesn't
|
280
|
-
# matter so long as it's:
|
281
|
-
# * consistent
|
282
|
-
# * has no collisions
|
283
|
-
# * produces a value that's a valid C case conditional
|
284
|
-
# * can be applied to runes of input strings later
|
285
|
-
integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
|
286
|
-
acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
|
287
|
-
acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
|
288
|
-
end)
|
289
|
-
end
|
290
|
-
|
291
|
-
# Then we print out C code full of switches
|
292
|
-
|
293
|
-
writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
|
294
|
-
float phonetic_cost(int a, int b) {
|
295
|
-
// This is compiled from Ruby, using `String#unpack("U")` on each character
|
296
|
-
// to retrieve the UTF-8 codepoint as a C long value.
|
297
|
-
if (a == b) { return 0.0; };
|
298
|
-
FUNC
|
299
|
-
writer.puts ' switch (a) {'
|
300
|
-
integer_distance_map.each do |(a, a_i), distances|
|
301
|
-
writer.puts " case #{a_i}: // #{a}"
|
302
|
-
writer.puts ' switch (b) {'
|
303
|
-
distances.each do |(b, b_i), distance|
|
304
|
-
writer.puts " case #{b_i}: // #{a}->#{b}"
|
305
|
-
writer.puts " return (float) #{distance};"
|
306
|
-
writer.puts ' break;'
|
307
|
-
end
|
308
|
-
writer.puts ' }'
|
309
|
-
end
|
310
|
-
writer.puts ' }'
|
311
|
-
writer.puts ' return 1.0;'
|
312
|
-
writer.puts '}'
|
313
|
-
end
|
314
|
-
|
315
227
|
private
|
316
228
|
|
317
229
|
def _distance(phoneme1, phoneme2)
|
@@ -0,0 +1,285 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../phonetics'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Phonetics
|
7
|
+
class CodeGenerator
|
8
|
+
attr_reader :writer
|
9
|
+
|
10
|
+
def initialize(writer = STDOUT)
|
11
|
+
@writer = writer
|
12
|
+
end
|
13
|
+
|
14
|
+
def generate_phonetic_cost_c_code
|
15
|
+
generator = PhoneticCost.new(writer)
|
16
|
+
generator.generate
|
17
|
+
writer.flush
|
18
|
+
end
|
19
|
+
|
20
|
+
def generate_next_phoneme_length_c_code
|
21
|
+
generator = NextPhonemeLength.new(writer)
|
22
|
+
generator.generate
|
23
|
+
writer.flush
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
# Turn the bytes of all phonemes into a lookup trie where a sequence of
|
29
|
+
# bytes can find a phoneme in linear time.
|
30
|
+
def phoneme_byte_trie
|
31
|
+
phoneme_byte_trie_for(Phonetics.phonemes)
|
32
|
+
end
|
33
|
+
|
34
|
+
def phoneme_byte_trie_for(phonemes)
|
35
|
+
phonemes.each_with_object({}) do |phoneme, trie|
|
36
|
+
phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
|
37
|
+
subtrie[byte] ||= {}
|
38
|
+
|
39
|
+
# If we've reached the end of the byte string
|
40
|
+
if phoneme.bytes.length - 1 == idx
|
41
|
+
# Check if this is a duplicate lookup path. If there's a collision
|
42
|
+
# then this whole approach makes no sense.
|
43
|
+
if subtrie[byte].key?(:source)
|
44
|
+
source = subtrie[byte][:source]
|
45
|
+
raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
|
46
|
+
else
|
47
|
+
subtrie[byte][:source] = phoneme
|
48
|
+
end
|
49
|
+
end
|
50
|
+
subtrie[byte]
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def ruby_source
|
56
|
+
location = caller_locations.first
|
57
|
+
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
58
|
+
end
|
59
|
+
|
60
|
+
def describe(phoneme, depth)
|
61
|
+
indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
|
62
|
+
if Phonetics::Consonants.features.key?(phoneme)
|
63
|
+
indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
|
64
|
+
else
|
65
|
+
indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def indent(depth, line)
|
70
|
+
write " #{' ' * depth}#{line}"
|
71
|
+
end
|
72
|
+
|
73
|
+
def write(line)
|
74
|
+
writer.puts line
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class PhoneticCost < CodeGenerator
|
79
|
+
# We find the phonetic distance between two phonemes using a compiled
|
80
|
+
# lookup table. This is implemented as a set of nested switch statements.
|
81
|
+
# Hard to read when compiled, but simple to generate and fast at runtime.
|
82
|
+
#
|
83
|
+
# We generate a `phonetic_cost` function that takes six arguments: two
|
84
|
+
# strings, an offset into each string, and the length of the phoneme at each offset. Each such slice should be exactly
|
85
|
+
# one valid phoneme, which is possible thanks to the (also generated)
|
86
|
+
# next_phoneme_length() function.
|
87
|
+
#
|
88
|
+
# This will print a C code file with a function that implements a multi-level C
|
89
|
+
# switch like the following:
|
90
|
+
#
|
91
|
+
# switch (phoneme1_length) {
|
92
|
+
# case 2:
|
93
|
+
# switch(string1[1]) {
|
94
|
+
# case 201: // first byte of "ɪ"
|
95
|
+
# switch(string1[3]) {
|
96
|
+
# case 170: // second and final byte of "ɪ"
|
97
|
+
# // Phoneme: "ɪ", bytes: [201, 170]
|
98
|
+
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
99
|
+
# switch(string2[6]) {
|
100
|
+
# case 105: // first and only byte of "i"
|
101
|
+
# // Phoneme: "i", bytes: [105]
|
102
|
+
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
103
|
+
# return (float) 0.14355381904337383;
|
104
|
+
# break;
|
105
|
+
#
|
106
|
+
# the distance of ("ɪ", "i") is therefore 0.14355
|
107
|
+
#
|
108
|
+
def generate
|
109
|
+
write(<<-HEADER.gsub(/^ {6}/, ''))
|
110
|
+
|
111
|
+
// This is compiled from Ruby, in #{ruby_source}
|
112
|
+
#include <stdbool.h>
|
113
|
+
#include <stdio.h>
|
114
|
+
#include "./phonemes.h"
|
115
|
+
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
|
116
|
+
|
117
|
+
HEADER
|
118
|
+
|
119
|
+
write ' switch (phoneme1_length) {'
|
120
|
+
by_byte_length.each do |length, phonemes|
|
121
|
+
write " case #{length}:"
|
122
|
+
switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
|
123
|
+
write ' break;'
|
124
|
+
end
|
125
|
+
write ' }'
|
126
|
+
write ' return (float) 1.0;'
|
127
|
+
write '};'
|
128
|
+
write ''
|
129
|
+
end
|
130
|
+
|
131
|
+
def switch_phoneme1(trie, depth = 0)
|
132
|
+
indent depth, "switch(string1[string1_offset + #{depth}]) {"
|
133
|
+
trie.each do |key, subtrie|
|
134
|
+
next if key == :source
|
135
|
+
next if subtrie.empty?
|
136
|
+
|
137
|
+
indent depth + 1, "case #{key}:"
|
138
|
+
|
139
|
+
phoneme1 = subtrie[:source]
|
140
|
+
|
141
|
+
# If this could be a match of a phoneme1 then find phoneme2
|
142
|
+
if phoneme1
|
143
|
+
# Add a comment to help understand the dataset
|
144
|
+
describe(phoneme1, depth + 2) if phoneme1
|
145
|
+
|
146
|
+
by_byte_length.each do |_, phonemes|
|
147
|
+
byte_trie = phoneme_byte_trie_for(phonemes)
|
148
|
+
next if byte_trie.empty?
|
149
|
+
|
150
|
+
switch_phoneme2(byte_trie, phoneme1, 0)
|
151
|
+
end
|
152
|
+
else
|
153
|
+
switch_phoneme1(subtrie, depth + 1)
|
154
|
+
end
|
155
|
+
|
156
|
+
indent depth + 2, 'break;'
|
157
|
+
end
|
158
|
+
indent depth, '}'
|
159
|
+
end
|
160
|
+
|
161
|
+
def switch_phoneme2(trie, previous_phoneme, depth = 0)
|
162
|
+
indent depth, "switch(string2[string2_offset + #{depth}]) {"
|
163
|
+
trie.each do |key, subtrie|
|
164
|
+
next if key == :source
|
165
|
+
next if subtrie.empty?
|
166
|
+
|
167
|
+
phoneme2 = subtrie[:source]
|
168
|
+
|
169
|
+
indent depth + 1, "case #{key}:"
|
170
|
+
|
171
|
+
if phoneme2
|
172
|
+
value = if previous_phoneme == phoneme2
|
173
|
+
0.0
|
174
|
+
else
|
175
|
+
distance(previous_phoneme, phoneme2)
|
176
|
+
end
|
177
|
+
# Add a comment to help understand the dataset
|
178
|
+
describe(phoneme2, depth + 2)
|
179
|
+
indent depth + 2, "return (float) #{value};"
|
180
|
+
else
|
181
|
+
switch_phoneme2(subtrie, previous_phoneme, depth + 1)
|
182
|
+
end
|
183
|
+
|
184
|
+
indent depth + 2, 'break;'
|
185
|
+
end
|
186
|
+
indent depth, '}'
|
187
|
+
end
|
188
|
+
|
189
|
+
def by_byte_length
|
190
|
+
Phonetics.phonemes.group_by do |phoneme|
|
191
|
+
phoneme.bytes.length
|
192
|
+
end.sort_by(&:first)
|
193
|
+
end
|
194
|
+
|
195
|
+
def distance(p1, p2)
|
196
|
+
Phonetics.distance_map[p1][p2]
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
class NextPhonemeLength < CodeGenerator
|
201
|
+
# There's no simple way to break a string of IPA characters into phonemes.
|
202
|
+
# We do it by generating a function that, given a string of IPA characters,
|
203
|
+
# the starting index in that string, and the length of the string, returns
|
204
|
+
# the length of the next phoneme, or zero if none is found.
|
205
|
+
#
|
206
|
+
# Pseudocode:
|
207
|
+
# - return 0 if length - index == 0
|
208
|
+
# - switch on first byte, matching on possible first bytes of phonemes
|
209
|
+
# within the selected case statement:
|
210
|
+
# - return 1 if length - index == 1
|
211
|
+
# - switch on second byte, matching on possible second bytes of phonemes
|
212
|
+
# within the selected case statement:
|
213
|
+
# - return 2 if length - index == 2
|
214
|
+
# ...
|
215
|
+
# - default case: return 2 iff a phoneme terminates here
|
216
|
+
# - default case: return 1 iff a phoneme terminates here
|
217
|
+
# - return 0
|
218
|
+
#
|
219
|
+
def generate
|
220
|
+
write(<<-HEADER.gsub(/^ {6}/, ''))
|
221
|
+
// This is compiled from Ruby, in #{ruby_source}
|
222
|
+
int next_phoneme_length(int *string, int cursor, int length) {
|
223
|
+
|
224
|
+
int max_length;
|
225
|
+
max_length = length - cursor;
|
226
|
+
|
227
|
+
HEADER
|
228
|
+
|
229
|
+
next_phoneme_switch(phoneme_byte_trie, 0)
|
230
|
+
|
231
|
+
# If we fell through all the cases, return 0
|
232
|
+
write ' return 0;'
|
233
|
+
write '}'
|
234
|
+
end
|
235
|
+
|
236
|
+
private
|
237
|
+
|
238
|
+
# Recursively build switch statements for the body of next_phoneme_length
|
239
|
+
def next_phoneme_switch(trie, depth)
|
240
|
+
# switch (string[cursor + depth]) {
|
241
|
+
# case N: // for N in subtrie.keys
|
242
|
+
# // if a case statement matches the current byte AND there's chance
|
243
|
+
# // that a longer string might match, recurse.
|
244
|
+
# if (max_length > depth + 1) {
|
245
|
+
# // recurse
|
246
|
+
# }
|
247
|
+
# break;
|
248
|
+
# // if there's a :source key here then a phoneme terminates at this
|
249
|
+
# // point and this depth is a valid return value.
|
250
|
+
# default:
|
251
|
+
# return depth;
|
252
|
+
# break;
|
253
|
+
# }
|
254
|
+
indent depth, "switch(string[cursor + #{depth}]) {"
|
255
|
+
write ''
|
256
|
+
trie.each do |key, subtrie|
|
257
|
+
next if key == :source
|
258
|
+
next if subtrie.empty?
|
259
|
+
|
260
|
+
indent depth, "case #{key}:"
|
261
|
+
|
262
|
+
# Add a comment to help understand the dataset
|
263
|
+
describe(subtrie[:source], depth + 1) if subtrie[:source]
|
264
|
+
|
265
|
+
if subtrie.keys == [:source]
|
266
|
+
indent depth, " return #{depth + 1};"
|
267
|
+
else
|
268
|
+
indent depth, " if (max_length > #{depth + 1}) {"
|
269
|
+
next_phoneme_switch(subtrie, depth + 1)
|
270
|
+
indent depth, ' } else {'
|
271
|
+
indent depth, " return #{depth + 1};"
|
272
|
+
indent depth, ' }'
|
273
|
+
end
|
274
|
+
|
275
|
+
indent depth, ' break;'
|
276
|
+
end
|
277
|
+
|
278
|
+
if trie.key?(:source)
|
279
|
+
indent depth, ' default:'
|
280
|
+
indent depth, " return #{depth};"
|
281
|
+
end
|
282
|
+
indent depth, '}'
|
283
|
+
end
|
284
|
+
end
|
285
|
+
end
|
@@ -1,38 +1,29 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../phonetics'
|
3
4
|
require_relative 'c_levenshtein'
|
5
|
+
|
4
6
|
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
5
7
|
# count used instead of a binary edit distance calculation
|
6
8
|
#
|
7
|
-
# This implementation
|
9
|
+
# This implementation was inspired by both the damerau-levenshtein gem
|
8
10
|
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
# Aumont, 2016
|
11
|
+
# and "Using Phonologically Weighted Levenshtein Distances for the Prediction
|
12
|
+
# of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
|
13
|
+
# Farinas, Julien Pinquier, Xavier Aumont, 2016
|
13
14
|
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
14
15
|
module Phonetics
|
15
16
|
module Levenshtein
|
16
17
|
extend ::PhoneticsLevenshteinCBinding
|
17
18
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
Phonetics.as_utf_8_long(str2),
|
23
|
-
verbose
|
24
|
-
)
|
19
|
+
def inspect_bytes(str)
|
20
|
+
puts "Rubyland str: #{str.inspect}"
|
21
|
+
puts "Rubyland bytes: #{str.bytes.inspect}"
|
22
|
+
testing_codepoints(str)
|
25
23
|
end
|
26
24
|
|
27
|
-
def self.
|
28
|
-
|
29
|
-
string.chars.each do |char|
|
30
|
-
unless Phonetics.phonemes.include?(char)
|
31
|
-
msg = "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
|
32
|
-
raise ArgumentError, msg
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
25
|
+
def self.distance(str1, str2, verbose = false)
|
26
|
+
internal_phonetic_distance(str1, str2, verbose)
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end
|