phonetics 1.5.4 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +22 -11
- data/VERSION +1 -1
- data/bin/console +13 -0
- data/ext/c_levenshtein/levenshtein.c +104 -76
- data/ext/c_levenshtein/next_phoneme_length.c +1364 -0
- data/ext/c_levenshtein/next_phoneme_length.h +1 -0
- data/ext/c_levenshtein/phonemes.c +33 -0
- data/ext/c_levenshtein/phonemes.h +2 -0
- data/ext/c_levenshtein/phonetic_cost.c +134245 -42305
- data/ext/c_levenshtein/phonetic_cost.h +1 -1
- data/lib/phonetics.rb +2 -90
- data/lib/phonetics/code_generator.rb +285 -0
- data/lib/phonetics/levenshtein.rb +12 -21
- data/lib/phonetics/ruby_levenshtein.rb +5 -14
- metadata +8 -2
@@ -1 +1 @@
|
|
1
|
-
float phonetic_cost(
|
1
|
+
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
|
data/lib/phonetics.rb
CHANGED
@@ -112,22 +112,6 @@ module Phonetics
|
|
112
112
|
module Consonants
|
113
113
|
extend self
|
114
114
|
|
115
|
-
# Plosives and fricatives are less similar than trills and flaps, or
|
116
|
-
# sibilant fricatives and non-sibilant fricatives
|
117
|
-
# TODO: this is unfinished and possibly a bad idea
|
118
|
-
MannerDistances = {
|
119
|
-
'Nasal' => %w[continuant],
|
120
|
-
'Stop' => %w[],
|
121
|
-
'Sibilant fricative' => %w[continuant fricative],
|
122
|
-
'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
|
123
|
-
'Approximant' => %w[],
|
124
|
-
'Tap/Flap' => %w[],
|
125
|
-
'Trill' => %w[],
|
126
|
-
'Lateral fricative' => %w[continuant fricative],
|
127
|
-
'Lateral approximant' => %w[],
|
128
|
-
'Lateral tap/flap' => %w[],
|
129
|
-
}.freeze
|
130
|
-
|
131
115
|
# This chart (columns 2 through the end, anyway) is a direct port of
|
132
116
|
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
133
117
|
# We store the consonant table in this format to make updating it easier.
|
@@ -218,7 +202,7 @@ module Phonetics
|
|
218
202
|
end
|
219
203
|
|
220
204
|
def phonemes
|
221
|
-
|
205
|
+
Vowels.phonemes + Consonants.phonemes
|
222
206
|
end
|
223
207
|
|
224
208
|
Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
|
@@ -232,9 +216,7 @@ module Phonetics
|
|
232
216
|
end
|
233
217
|
|
234
218
|
def distance_map
|
235
|
-
@distance_map ||= (
|
236
|
-
Vowels.phonemes + Consonants.phonemes
|
237
|
-
).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
219
|
+
@distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
|
238
220
|
p1, p2 = *pair
|
239
221
|
score = _distance(p1, p2)
|
240
222
|
scores[p1][p2] = score
|
@@ -242,76 +224,6 @@ module Phonetics
|
|
242
224
|
end
|
243
225
|
end
|
244
226
|
|
245
|
-
# as_utf_8_long("aɰ̊ h")
|
246
|
-
# => [97, 8404, 32, 104]
|
247
|
-
def as_utf_8_long(string)
|
248
|
-
string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
|
249
|
-
end
|
250
|
-
|
251
|
-
# Encode individual multi-byte strings as a single integer.
|
252
|
-
#
|
253
|
-
# "ɰ̊".unpack('U*')
|
254
|
-
# => [624, 778]
|
255
|
-
#
|
256
|
-
# grapheme_as_utf_8_long("ɰ̊")
|
257
|
-
# => 1413 (624 + (10 * 778))
|
258
|
-
def grapheme_as_utf_8_long(grapheme)
|
259
|
-
grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
|
260
|
-
total + (10**i) * byte
|
261
|
-
end
|
262
|
-
end
|
263
|
-
|
264
|
-
# This will print a C code file with a function that implements a two-level C
|
265
|
-
# switch like the following:
|
266
|
-
#
|
267
|
-
# switch (a) {
|
268
|
-
# case 100: // 'd'
|
269
|
-
# switch (b) {
|
270
|
-
# case 618: // 'ɪ'
|
271
|
-
# return (float) 0.73827;
|
272
|
-
# break;
|
273
|
-
# }
|
274
|
-
# }
|
275
|
-
#
|
276
|
-
def generate_phonetic_cost_c_code(writer = STDOUT)
|
277
|
-
# First, flatten the bytes of the runes (unicode codepoints encoded via
|
278
|
-
# UTF-8) into single integers. We do this by adding the utf-8 values, each
|
279
|
-
# multiplied by 10 * their byte number. The specific encoding doesn't
|
280
|
-
# matter so long as it's:
|
281
|
-
# * consistent
|
282
|
-
# * has no collisions
|
283
|
-
# * produces a value that's a valid C case conditional
|
284
|
-
# * can be applied to runes of input strings later
|
285
|
-
integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
|
286
|
-
acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
|
287
|
-
acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
|
288
|
-
end)
|
289
|
-
end
|
290
|
-
|
291
|
-
# Then we print out C code full of switches
|
292
|
-
|
293
|
-
writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
|
294
|
-
float phonetic_cost(int a, int b) {
|
295
|
-
// This is compiled from Ruby, using `String#unpack("U")` on each character
|
296
|
-
// to retrieve the UTF-8 codepoint as a C long value.
|
297
|
-
if (a == b) { return 0.0; };
|
298
|
-
FUNC
|
299
|
-
writer.puts ' switch (a) {'
|
300
|
-
integer_distance_map.each do |(a, a_i), distances|
|
301
|
-
writer.puts " case #{a_i}: // #{a}"
|
302
|
-
writer.puts ' switch (b) {'
|
303
|
-
distances.each do |(b, b_i), distance|
|
304
|
-
writer.puts " case #{b_i}: // #{a}->#{b}"
|
305
|
-
writer.puts " return (float) #{distance};"
|
306
|
-
writer.puts ' break;'
|
307
|
-
end
|
308
|
-
writer.puts ' }'
|
309
|
-
end
|
310
|
-
writer.puts ' }'
|
311
|
-
writer.puts ' return 1.0;'
|
312
|
-
writer.puts '}'
|
313
|
-
end
|
314
|
-
|
315
227
|
private
|
316
228
|
|
317
229
|
def _distance(phoneme1, phoneme2)
|
@@ -0,0 +1,285 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../phonetics'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Phonetics
|
7
|
+
class CodeGenerator
|
8
|
+
attr_reader :writer
|
9
|
+
|
10
|
+
def initialize(writer = STDOUT)
|
11
|
+
@writer = writer
|
12
|
+
end
|
13
|
+
|
14
|
+
def generate_phonetic_cost_c_code
|
15
|
+
generator = PhoneticCost.new(writer)
|
16
|
+
generator.generate
|
17
|
+
writer.flush
|
18
|
+
end
|
19
|
+
|
20
|
+
def generate_next_phoneme_length_c_code
|
21
|
+
generator = NextPhonemeLength.new(writer)
|
22
|
+
generator.generate
|
23
|
+
writer.flush
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
# Turn the bytes of all phonemes into a lookup trie where a sequence of
|
29
|
+
# bytes can find a phoneme in linear time.
|
30
|
+
def phoneme_byte_trie
|
31
|
+
phoneme_byte_trie_for(Phonetics.phonemes)
|
32
|
+
end
|
33
|
+
|
34
|
+
def phoneme_byte_trie_for(phonemes)
|
35
|
+
phonemes.each_with_object({}) do |phoneme, trie|
|
36
|
+
phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
|
37
|
+
subtrie[byte] ||= {}
|
38
|
+
|
39
|
+
# If we've reached the end of the byte string
|
40
|
+
if phoneme.bytes.length - 1 == idx
|
41
|
+
# Check if this is a duplicate lookup path. If there's a collision
|
42
|
+
# then this whole approach makes no sense.
|
43
|
+
if subtrie[byte].key?(:source)
|
44
|
+
source = subtrie[byte][:source]
|
45
|
+
raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
|
46
|
+
else
|
47
|
+
subtrie[byte][:source] = phoneme
|
48
|
+
end
|
49
|
+
end
|
50
|
+
subtrie[byte]
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def ruby_source
|
56
|
+
location = caller_locations.first
|
57
|
+
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
58
|
+
end
|
59
|
+
|
60
|
+
def describe(phoneme, depth)
|
61
|
+
indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
|
62
|
+
if Phonetics::Consonants.features.key?(phoneme)
|
63
|
+
indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
|
64
|
+
else
|
65
|
+
indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def indent(depth, line)
|
70
|
+
write " #{' ' * depth}#{line}"
|
71
|
+
end
|
72
|
+
|
73
|
+
def write(line)
|
74
|
+
writer.puts line
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class PhoneticCost < CodeGenerator
|
79
|
+
# We find the phonetic distance between two phonemes using a compiled
|
80
|
+
# lookup table. This is implemented as a set of nested switch statements.
|
81
|
+
# Hard to read when compiled, but simple to generate and fast at runtime.
|
82
|
+
#
|
83
|
+
# We generate a `phonetic_cost` function that takes six arguments: two
|
84
|
+
# strings, plus an offset and a phoneme length for each. The span at each offset should be exactly
|
85
|
+
# one valid phoneme, which is possible thanks to the (also generated)
|
86
|
+
# next_phoneme_length() function.
|
87
|
+
#
|
88
|
+
# This will print a C code file with a function that implements a multi-level C
|
89
|
+
# switch like the following:
|
90
|
+
#
|
91
|
+
# switch (phoneme1_length) {
|
92
|
+
# case 2:
|
93
|
+
# switch(string1[1]) {
|
94
|
+
# case 201: // first byte of "ɪ"
|
95
|
+
# switch(string1[3]) {
|
96
|
+
# case 170: // second and final byte of "ɪ"
|
97
|
+
# // Phoneme: "ɪ", bytes: [201, 170]
|
98
|
+
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
99
|
+
# switch(string2[6]) {
|
100
|
+
# case 105: // first and only byte of "i"
|
101
|
+
# // Phoneme: "i", bytes: [105]
|
102
|
+
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
103
|
+
# return (float) 0.14355381904337383;
|
104
|
+
# break;
|
105
|
+
#
|
106
|
+
# the distance of ("ɪ", "i") is therefore 0.14355
|
107
|
+
#
|
108
|
+
def generate
|
109
|
+
write(<<-HEADER.gsub(/^ {6}/, ''))
|
110
|
+
|
111
|
+
// This is compiled from Ruby, in #{ruby_source}
|
112
|
+
#include <stdbool.h>
|
113
|
+
#include <stdio.h>
|
114
|
+
#include "./phonemes.h"
|
115
|
+
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
|
116
|
+
|
117
|
+
HEADER
|
118
|
+
|
119
|
+
write ' switch (phoneme1_length) {'
|
120
|
+
by_byte_length.each do |length, phonemes|
|
121
|
+
write " case #{length}:"
|
122
|
+
switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
|
123
|
+
write ' break;'
|
124
|
+
end
|
125
|
+
write ' }'
|
126
|
+
write ' return (float) 1.0;'
|
127
|
+
write '};'
|
128
|
+
write ''
|
129
|
+
end
|
130
|
+
|
131
|
+
def switch_phoneme1(trie, depth = 0)
|
132
|
+
indent depth, "switch(string1[string1_offset + #{depth}]) {"
|
133
|
+
trie.each do |key, subtrie|
|
134
|
+
next if key == :source
|
135
|
+
next if subtrie.empty?
|
136
|
+
|
137
|
+
indent depth + 1, "case #{key}:"
|
138
|
+
|
139
|
+
phoneme1 = subtrie[:source]
|
140
|
+
|
141
|
+
# If this could be a match of a phoneme1 then find phoneme2
|
142
|
+
if phoneme1
|
143
|
+
# Add a comment to help understand the dataset
|
144
|
+
describe(phoneme1, depth + 2) if phoneme1
|
145
|
+
|
146
|
+
by_byte_length.each do |_, phonemes|
|
147
|
+
byte_trie = phoneme_byte_trie_for(phonemes)
|
148
|
+
next if byte_trie.empty?
|
149
|
+
|
150
|
+
switch_phoneme2(byte_trie, phoneme1, 0)
|
151
|
+
end
|
152
|
+
else
|
153
|
+
switch_phoneme1(subtrie, depth + 1)
|
154
|
+
end
|
155
|
+
|
156
|
+
indent depth + 2, 'break;'
|
157
|
+
end
|
158
|
+
indent depth, '}'
|
159
|
+
end
|
160
|
+
|
161
|
+
def switch_phoneme2(trie, previous_phoneme, depth = 0)
|
162
|
+
indent depth, "switch(string2[string2_offset + #{depth}]) {"
|
163
|
+
trie.each do |key, subtrie|
|
164
|
+
next if key == :source
|
165
|
+
next if subtrie.empty?
|
166
|
+
|
167
|
+
phoneme2 = subtrie[:source]
|
168
|
+
|
169
|
+
indent depth + 1, "case #{key}:"
|
170
|
+
|
171
|
+
if phoneme2
|
172
|
+
value = if previous_phoneme == phoneme2
|
173
|
+
0.0
|
174
|
+
else
|
175
|
+
distance(previous_phoneme, phoneme2)
|
176
|
+
end
|
177
|
+
# Add a comment to help understand the dataset
|
178
|
+
describe(phoneme2, depth + 2)
|
179
|
+
indent depth + 2, "return (float) #{value};"
|
180
|
+
else
|
181
|
+
switch_phoneme2(subtrie, previous_phoneme, depth + 1)
|
182
|
+
end
|
183
|
+
|
184
|
+
indent depth + 2, 'break;'
|
185
|
+
end
|
186
|
+
indent depth, '}'
|
187
|
+
end
|
188
|
+
|
189
|
+
def by_byte_length
|
190
|
+
Phonetics.phonemes.group_by do |phoneme|
|
191
|
+
phoneme.bytes.length
|
192
|
+
end.sort_by(&:first)
|
193
|
+
end
|
194
|
+
|
195
|
+
def distance(p1, p2)
|
196
|
+
Phonetics.distance_map[p1][p2]
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
class NextPhonemeLength < CodeGenerator
|
201
|
+
# There's no simple way to break a string of IPA characters into phonemes.
|
202
|
+
# We do it by generating a function that, given a string of IPA characters,
|
203
|
+
# the starting index in that string, and the length of the string, returns
|
204
|
+
# the length of the next phoneme, or zero if none is found.
|
205
|
+
#
|
206
|
+
# Pseudocode:
|
207
|
+
# - return 0 if length - index == 0
|
208
|
+
# - switch on first byte, matching on possible first bytes of phonemes
|
209
|
+
# within the selected case statement:
|
210
|
+
# - return 1 if length - index == 1
|
211
|
+
# - switch on second byte, matching on possible second bytes of phonemes
|
212
|
+
# within the selected case statement:
|
213
|
+
# - return 2 if length - index == 2
|
214
|
+
# ...
|
215
|
+
# - default case: return 2 iff a phoneme terminates here
|
216
|
+
# - default case: return 1 iff a phoneme terminates here
|
217
|
+
# - return 0
|
218
|
+
#
|
219
|
+
def generate
|
220
|
+
write(<<-HEADER.gsub(/^ {6}/, ''))
|
221
|
+
// This is compiled from Ruby, in #{ruby_source}
|
222
|
+
int next_phoneme_length(int *string, int cursor, int length) {
|
223
|
+
|
224
|
+
int max_length;
|
225
|
+
max_length = length - cursor;
|
226
|
+
|
227
|
+
HEADER
|
228
|
+
|
229
|
+
next_phoneme_switch(phoneme_byte_trie, 0)
|
230
|
+
|
231
|
+
# If we fell through all the cases, return 0
|
232
|
+
write ' return 0;'
|
233
|
+
write '}'
|
234
|
+
end
|
235
|
+
|
236
|
+
private
|
237
|
+
|
238
|
+
# Recursively build switch statements for the body of next_phoneme_length
|
239
|
+
def next_phoneme_switch(trie, depth)
|
240
|
+
# switch (string[cursor + depth]) {
|
241
|
+
# case N: // for N in subtrie.keys
|
242
|
+
# // if a case statement matches the current byte AND there's a chance
|
243
|
+
# // that a longer string might match, recurse.
|
244
|
+
# if (max_length >= depth) {
|
245
|
+
# // recurse
|
246
|
+
# }
|
247
|
+
# break;
|
248
|
+
# // if there's a :source key here then a phoneme terminates at this
|
249
|
+
# // point and this depth is a valid return value.
|
250
|
+
# default:
|
251
|
+
# return depth;
|
252
|
+
# break;
|
253
|
+
# }
|
254
|
+
indent depth, "switch(string[cursor + #{depth}]) {"
|
255
|
+
write ''
|
256
|
+
trie.each do |key, subtrie|
|
257
|
+
next if key == :source
|
258
|
+
next if subtrie.empty?
|
259
|
+
|
260
|
+
indent depth, "case #{key}:"
|
261
|
+
|
262
|
+
# Add a comment to help understand the dataset
|
263
|
+
describe(subtrie[:source], depth + 1) if subtrie[:source]
|
264
|
+
|
265
|
+
if subtrie.keys == [:source]
|
266
|
+
indent depth, " return #{depth + 1};"
|
267
|
+
else
|
268
|
+
indent depth, " if (max_length > #{depth + 1}) {"
|
269
|
+
next_phoneme_switch(subtrie, depth + 1)
|
270
|
+
indent depth, ' } else {'
|
271
|
+
indent depth, " return #{depth + 1};"
|
272
|
+
indent depth, ' }'
|
273
|
+
end
|
274
|
+
|
275
|
+
indent depth, ' break;'
|
276
|
+
end
|
277
|
+
|
278
|
+
if trie.key?(:source)
|
279
|
+
indent depth, ' default:'
|
280
|
+
indent depth, " return #{depth};"
|
281
|
+
end
|
282
|
+
indent depth, '}'
|
283
|
+
end
|
284
|
+
end
|
285
|
+
end
|
@@ -1,38 +1,29 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../phonetics'
|
3
4
|
require_relative 'c_levenshtein'
|
5
|
+
|
4
6
|
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
5
7
|
# count used instead of a binary edit distance calculation
|
6
8
|
#
|
7
|
-
# This implementation
|
9
|
+
# This implementation was dually inspired by the damerau-levenshtein gem
|
8
10
|
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
# Aumont, 2016
|
11
|
+
# and "Using Phonologically Weighted Levenshtein Distances for the Prediction
|
12
|
+
# of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
|
13
|
+
# Farinas, Julien Pinquier, Xavier Aumont, 2016
|
13
14
|
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
14
15
|
module Phonetics
|
15
16
|
module Levenshtein
|
16
17
|
extend ::PhoneticsLevenshteinCBinding
|
17
18
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
Phonetics.as_utf_8_long(str2),
|
23
|
-
verbose
|
24
|
-
)
|
19
|
+
def inspect_bytes(str)
|
20
|
+
puts "Rubyland str: #{str.inspect}"
|
21
|
+
puts "Rubyland bytes: #{str.bytes.inspect}"
|
22
|
+
testing_codepoints(str)
|
25
23
|
end
|
26
24
|
|
27
|
-
def self.
|
28
|
-
|
29
|
-
string.chars.each do |char|
|
30
|
-
unless Phonetics.phonemes.include?(char)
|
31
|
-
msg = "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
|
32
|
-
raise ArgumentError, msg
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
25
|
+
def self.distance(str1, str2, verbose = false)
|
26
|
+
internal_phonetic_distance(str1, str2, verbose)
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end
|