phonetics 1.5.4 → 1.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1 +1 @@
1
- float phonetic_cost(long, long);
1
+ float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
@@ -112,22 +112,6 @@ module Phonetics
112
112
  module Consonants
113
113
  extend self
114
114
 
115
- # Plosives and fricatives are less similar than trills and flaps, or
116
- # sibilant fricatives and non-sibilant fricatives
117
- # TODO: this is unfinished and possibly a bad idea
118
- MannerDistances = {
119
- 'Nasal' => %w[continuant],
120
- 'Stop' => %w[],
121
- 'Sibilant fricative' => %w[continuant fricative],
122
- 'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
123
- 'Approximant' => %w[],
124
- 'Tap/Flap' => %w[],
125
- 'Trill' => %w[],
126
- 'Lateral fricative' => %w[continuant fricative],
127
- 'Lateral approximant' => %w[],
128
- 'Lateral tap/flap' => %w[],
129
- }.freeze
130
-
131
115
  # This chart (columns 2 through the end, anyway) is a direct port of
132
116
  # https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
133
117
  # We store the consonant table in this format to make updating it easier.
@@ -218,7 +202,7 @@ module Phonetics
218
202
  end
219
203
 
220
204
  def phonemes
221
- Consonants.phonemes + Vowels.phonemes
205
+ Vowels.phonemes + Consonants.phonemes
222
206
  end
223
207
 
224
208
  Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
@@ -232,9 +216,7 @@ module Phonetics
232
216
  end
233
217
 
234
218
  def distance_map
235
- @distance_map ||= (
236
- Vowels.phonemes + Consonants.phonemes
237
- ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
219
+ @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
238
220
  p1, p2 = *pair
239
221
  score = _distance(p1, p2)
240
222
  scores[p1][p2] = score
@@ -242,76 +224,6 @@ module Phonetics
242
224
  end
243
225
  end
244
226
 
245
- # as_utf_8_long("aɰ̊ h")
246
- # => [97, 8404, 32, 104]
247
- def as_utf_8_long(string)
248
- string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
249
- end
250
-
251
- # Encode individual multi-byte strings as a single integer.
252
- #
253
- # "ɰ̊".unpack('U*')
254
- # => [624, 778]
255
- #
256
- # grapheme_as_utf_8_long("ɰ̊")
257
- # => 1413 (624 + (10 * 778))
258
- def grapheme_as_utf_8_long(grapheme)
259
- grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
260
- total + (10**i) * byte
261
- end
262
- end
263
-
264
- # This will print a C code file with a function that implements a two-level C
265
- # switch like the following:
266
- #
267
- # switch (a) {
268
- # case 100: // 'd'
269
- # switch (b) {
270
- # case 618: // 'ɪ'
271
- # return (float) 0.73827;
272
- # break;
273
- # }
274
- # }
275
- #
276
- def generate_phonetic_cost_c_code(writer = STDOUT)
277
- # First, flatten the bytes of the runes (unicode codepoints encoded via
278
- # UTF-8) into single integers. We do this by adding the utf-8 values, each
279
- # multiplied by 10 * their byte number. The specific encoding doesn't
280
- # matter so long as it's:
281
- # * consistent
282
- # * has no collisions
283
- # * produces a value that's a valid C case conditional
284
- # * can be applied to runes of input strings later
285
- integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
286
- acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
287
- acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
288
- end)
289
- end
290
-
291
- # Then we print out C code full of switches
292
-
293
- writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
294
- float phonetic_cost(int a, int b) {
295
- // This is compiled from Ruby, using `String#unpack("U")` on each character
296
- // to retrieve the UTF-8 codepoint as a C long value.
297
- if (a == b) { return 0.0; };
298
- FUNC
299
- writer.puts ' switch (a) {'
300
- integer_distance_map.each do |(a, a_i), distances|
301
- writer.puts " case #{a_i}: // #{a}"
302
- writer.puts ' switch (b) {'
303
- distances.each do |(b, b_i), distance|
304
- writer.puts " case #{b_i}: // #{a}->#{b}"
305
- writer.puts " return (float) #{distance};"
306
- writer.puts ' break;'
307
- end
308
- writer.puts ' }'
309
- end
310
- writer.puts ' }'
311
- writer.puts ' return 1.0;'
312
- writer.puts '}'
313
- end
314
-
315
227
  private
316
228
 
317
229
  def _distance(phoneme1, phoneme2)
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../phonetics'
4
+ require 'json'
5
+
6
+ module Phonetics
7
+ class CodeGenerator
8
+ attr_reader :writer
9
+
10
+ def initialize(writer = STDOUT)
11
+ @writer = writer
12
+ end
13
+
14
+ def generate_phonetic_cost_c_code
15
+ generator = PhoneticCost.new(writer)
16
+ generator.generate
17
+ writer.flush
18
+ end
19
+
20
+ def generate_next_phoneme_length_c_code
21
+ generator = NextPhonemeLength.new(writer)
22
+ generator.generate
23
+ writer.flush
24
+ end
25
+
26
+ private
27
+
28
+ # Turn the bytes of all phonemes into a lookup trie where a sequence of
29
+ # bytes can find a phoneme in linear time.
30
+ def phoneme_byte_trie
31
+ phoneme_byte_trie_for(Phonetics.phonemes)
32
+ end
33
+
34
+ def phoneme_byte_trie_for(phonemes)
35
+ phonemes.each_with_object({}) do |phoneme, trie|
36
+ phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
37
+ subtrie[byte] ||= {}
38
+
39
+ # If we've reached the end of the byte string
40
+ if phoneme.bytes.length - 1 == idx
41
+ # Check if this is a duplicate lookup path. If there's a collision
42
+ # then this whole approach makes no sense.
43
+ if subtrie[byte].key?(:source)
44
+ source = subtrie[byte][:source]
45
+ raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
46
+ else
47
+ subtrie[byte][:source] = phoneme
48
+ end
49
+ end
50
+ subtrie[byte]
51
+ end
52
+ end
53
+ end
54
+
55
+ def ruby_source
56
+ location = caller_locations.first
57
+ "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
58
+ end
59
+
60
+ def describe(phoneme, depth)
61
+ indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
62
+ if Phonetics::Consonants.features.key?(phoneme)
63
+ indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
64
+ else
65
+ indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
66
+ end
67
+ end
68
+
69
+ def indent(depth, line)
70
+ write " #{' ' * depth}#{line}"
71
+ end
72
+
73
+ def write(line)
74
+ writer.puts line
75
+ end
76
+ end
77
+
78
+ class PhoneticCost < CodeGenerator
79
+ # We find the phonetic distance between two phonemes using a compiled
80
+ # lookup table. This is implemented as a set of nested switch statements.
81
+ # Hard to read when compiled, but simple to generate and fast at runtime.
82
+ #
83
+ # We generate a `phonetic_cost` function that takes six arguments: two
84
+ # strings, each with an offset and a phoneme length. Each such span should be exactly
85
+ # one valid phoneme, which is possible thanks to the (also generated)
86
+ # next_phoneme_length() function.
87
+ #
88
+ # This will print a C code file with a function that implements a multi-level C
89
+ # switch like the following:
90
+ #
91
+ # switch (phoneme1_length) {
92
+ # case 2:
93
+ # switch(string1[1]) {
94
+ # case 201: // first byte of "ɪ"
95
+ # switch(string1[3]) {
96
+ # case 170: // second and final byte of "ɪ"
97
+ # // Phoneme: "ɪ", bytes: [201, 170]
98
+ # // vowel features: {"F1":300,"F2":2100,"rounded":false}
99
+ # switch(string2[6]) {
100
+ # case 105: // first and only byte of "i"
101
+ # // Phoneme: "i", bytes: [105]
102
+ # // vowel features: {"F1":240,"F2":2400,"rounded":false}
103
+ # return (float) 0.14355381904337383;
104
+ # break;
105
+ #
106
+ # the distance of ("ɪ", "i") is therefore 0.14355
107
+ #
108
+ def generate
109
+ write(<<-HEADER.gsub(/^ {6}/, ''))
110
+
111
+ // This is compiled from Ruby, in #{ruby_source}
112
+ #include <stdbool.h>
113
+ #include <stdio.h>
114
+ #include "./phonemes.h"
115
+ float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
116
+
117
+ HEADER
118
+
119
+ write ' switch (phoneme1_length) {'
120
+ by_byte_length.each do |length, phonemes|
121
+ write " case #{length}:"
122
+ switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
123
+ write ' break;'
124
+ end
125
+ write ' }'
126
+ write ' return (float) 1.0;'
127
+ write '};'
128
+ write ''
129
+ end
130
+
131
+ def switch_phoneme1(trie, depth = 0)
132
+ indent depth, "switch(string1[string1_offset + #{depth}]) {"
133
+ trie.each do |key, subtrie|
134
+ next if key == :source
135
+ next if subtrie.empty?
136
+
137
+ indent depth + 1, "case #{key}:"
138
+
139
+ phoneme1 = subtrie[:source]
140
+
141
+ # If this could be a match of a phoneme1 then find phoneme2
142
+ if phoneme1
143
+ # Add a comment to help understand the dataset
144
+ describe(phoneme1, depth + 2) if phoneme1
145
+
146
+ by_byte_length.each do |_, phonemes|
147
+ byte_trie = phoneme_byte_trie_for(phonemes)
148
+ next if byte_trie.empty?
149
+
150
+ switch_phoneme2(byte_trie, phoneme1, 0)
151
+ end
152
+ else
153
+ switch_phoneme1(subtrie, depth + 1)
154
+ end
155
+
156
+ indent depth + 2, 'break;'
157
+ end
158
+ indent depth, '}'
159
+ end
160
+
161
+ def switch_phoneme2(trie, previous_phoneme, depth = 0)
162
+ indent depth, "switch(string2[string2_offset + #{depth}]) {"
163
+ trie.each do |key, subtrie|
164
+ next if key == :source
165
+ next if subtrie.empty?
166
+
167
+ phoneme2 = subtrie[:source]
168
+
169
+ indent depth + 1, "case #{key}:"
170
+
171
+ if phoneme2
172
+ value = if previous_phoneme == phoneme2
173
+ 0.0
174
+ else
175
+ distance(previous_phoneme, phoneme2)
176
+ end
177
+ # Add a comment to help understand the dataset
178
+ describe(phoneme2, depth + 2)
179
+ indent depth + 2, "return (float) #{value};"
180
+ else
181
+ switch_phoneme2(subtrie, previous_phoneme, depth + 1)
182
+ end
183
+
184
+ indent depth + 2, 'break;'
185
+ end
186
+ indent depth, '}'
187
+ end
188
+
189
+ def by_byte_length
190
+ Phonetics.phonemes.group_by do |phoneme|
191
+ phoneme.bytes.length
192
+ end.sort_by(&:first)
193
+ end
194
+
195
+ def distance(p1, p2)
196
+ Phonetics.distance_map[p1][p2]
197
+ end
198
+ end
199
+
200
+ class NextPhonemeLength < CodeGenerator
201
+ # There's no simple way to break a string of IPA characters into phonemes.
202
+ # We do it by generating a function that, given a string of IPA characters,
203
+ # the starting index in that string, and the length of the string, returns
204
+ # the length of the next phoneme, or zero if none is found.
205
+ #
206
+ # Pseudocode:
207
+ # - return 0 if length - index == 0
208
+ # - switch on first byte, matching on possible first bytes of phonemes
209
+ # within the selected case statement:
210
+ # - return 1 if length - index == 1
211
+ # - switch on second byte, matching on possible second bytes of phonemes
212
+ # within the selected case statement:
213
+ # - return 2 if length - index == 2
214
+ # ...
215
+ # - default case: return 2 iff a phoneme terminates here
216
+ # - default case: return 1 iff a phoneme terminates here
217
+ # - return 0
218
+ #
219
+ def generate
220
+ write(<<-HEADER.gsub(/^ {6}/, ''))
221
+ // This is compiled from Ruby, in #{ruby_source}
222
+ int next_phoneme_length(int *string, int cursor, int length) {
223
+
224
+ int max_length;
225
+ max_length = length - cursor;
226
+
227
+ HEADER
228
+
229
+ next_phoneme_switch(phoneme_byte_trie, 0)
230
+
231
+ # If we fell through all the cases, return 0
232
+ write ' return 0;'
233
+ write '}'
234
+ end
235
+
236
+ private
237
+
238
+ # Recursively build switch statements for the body of next_phoneme_length
239
+ def next_phoneme_switch(trie, depth)
240
+ # switch (string[cursor + depth]) {
241
+ # case N: // for N in subtrie.keys
242
+ # // if a case statement matches the current byte AND there's a chance
243
+ # // that a longer string might match, recurse.
244
+ # if (max_length >= depth) {
245
+ # // recurse
246
+ # }
247
+ # break;
248
+ # // if there's a :source key here then a phoneme terminates at this
249
+ # // point and this depth is a valid return value.
250
+ # default:
251
+ # return depth;
252
+ # break;
253
+ # }
254
+ indent depth, "switch(string[cursor + #{depth}]) {"
255
+ write ''
256
+ trie.each do |key, subtrie|
257
+ next if key == :source
258
+ next if subtrie.empty?
259
+
260
+ indent depth, "case #{key}:"
261
+
262
+ # Add a comment to help understand the dataset
263
+ describe(subtrie[:source], depth + 1) if subtrie[:source]
264
+
265
+ if subtrie.keys == [:source]
266
+ indent depth, " return #{depth + 1};"
267
+ else
268
+ indent depth, " if (max_length > #{depth + 1}) {"
269
+ next_phoneme_switch(subtrie, depth + 1)
270
+ indent depth, ' } else {'
271
+ indent depth, " return #{depth + 1};"
272
+ indent depth, ' }'
273
+ end
274
+
275
+ indent depth, ' break;'
276
+ end
277
+
278
+ if trie.key?(:source)
279
+ indent depth, ' default:'
280
+ indent depth, " return #{depth};"
281
+ end
282
+ indent depth, '}'
283
+ end
284
+ end
285
+ end
@@ -1,38 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../phonetics'
3
4
  require_relative 'c_levenshtein'
5
+
4
6
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
5
7
  # count used instead of a binary edit distance calculation
6
8
  #
7
- # This implementation is almost entirely taken from the damerau-levenshtein gem
9
+ # This implementation was dually inspired by the damerau-levenshtein gem
8
10
  # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
9
- # The implementation is modified based on "Using Phonologically Weighted
10
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
11
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
12
- # Aumont, 2016
11
+ # and "Using Phonologically Weighted Levenshtein Distances for the Prediction
12
+ # of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
13
+ # Farinas, Julien Pinquier, Xavier Aumont, 2016
13
14
  # https://hal.archives-ouvertes.fr/hal-01474904/document
14
15
  module Phonetics
15
16
  module Levenshtein
16
17
  extend ::PhoneticsLevenshteinCBinding
17
18
 
18
- def self.distance(str1, str2, verbose = false)
19
- ensure_is_phonetic!(str1, str2)
20
- internal_phonetic_distance(
21
- Phonetics.as_utf_8_long(str1),
22
- Phonetics.as_utf_8_long(str2),
23
- verbose
24
- )
19
+ def inspect_bytes(str)
20
+ puts "Rubyland str: #{str.inspect}"
21
+ puts "Rubyland bytes: #{str.bytes.inspect}"
22
+ testing_codepoints(str)
25
23
  end
26
24
 
27
- def self.ensure_is_phonetic!(str1, str2)
28
- [str1, str2].each do |string|
29
- string.chars.each do |char|
30
- unless Phonetics.phonemes.include?(char)
31
- msg = "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
32
- raise ArgumentError, msg
33
- end
34
- end
35
- end
25
+ def self.distance(str1, str2, verbose = false)
26
+ internal_phonetic_distance(str1, str2, verbose)
36
27
  end
37
28
  end
38
29
  end