phonetics 1.5.4 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- float phonetic_cost(long, long);
1
+ float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
@@ -112,22 +112,6 @@ module Phonetics
112
112
  module Consonants
113
113
  extend self
114
114
 
115
- # Plosives and fricatives are less similar than trills and flaps, or
116
- # sibilant fricatives and non-sibilant fricatives
117
- # TODO: this is unfinished and possibly a bad idea
118
- MannerDistances = {
119
- 'Nasal' => %w[continuant],
120
- 'Stop' => %w[],
121
- 'Sibilant fricative' => %w[continuant fricative],
122
- 'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
123
- 'Approximant' => %w[],
124
- 'Tap/Flap' => %w[],
125
- 'Trill' => %w[],
126
- 'Lateral fricative' => %w[continuant fricative],
127
- 'Lateral approximant' => %w[],
128
- 'Lateral tap/flap' => %w[],
129
- }.freeze
130
-
131
115
  # This chart (columns 2 through the end, anyway) is a direct port of
132
116
  # https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
133
117
  # We store the consonant table in this format to make updating it easier.
@@ -218,7 +202,7 @@ module Phonetics
218
202
  end
219
203
 
220
204
  def phonemes
221
- Consonants.phonemes + Vowels.phonemes
205
+ Vowels.phonemes + Consonants.phonemes
222
206
  end
223
207
 
224
208
  Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
@@ -232,9 +216,7 @@ module Phonetics
232
216
  end
233
217
 
234
218
  def distance_map
235
- @distance_map ||= (
236
- Vowels.phonemes + Consonants.phonemes
237
- ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
219
+ @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
238
220
  p1, p2 = *pair
239
221
  score = _distance(p1, p2)
240
222
  scores[p1][p2] = score
@@ -242,76 +224,6 @@ module Phonetics
242
224
  end
243
225
  end
244
226
 
245
- # as_utf_8_long("aɰ̊ h")
246
- # => [97, 8404, 32, 104]
247
- def as_utf_8_long(string)
248
- string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
249
- end
250
-
251
- # Encode individual multi-byte strings as a single integer.
252
- #
253
- # "ɰ̊".unpack('U*')
254
- # => [624, 778]
255
- #
256
- # grapheme_as_utf_8_long("ɰ̊")
257
- # => 1413 (624 + (10 * 778))
258
- def grapheme_as_utf_8_long(grapheme)
259
- grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
260
- total + (10**i) * byte
261
- end
262
- end
263
-
264
- # This will print a C code file with a function that implements a two-level C
265
- # switch like the following:
266
- #
267
- # switch (a) {
268
- # case 100: // 'd'
269
- # switch (b) {
270
- # case 618: // 'ɪ'
271
- # return (float) 0.73827;
272
- # break;
273
- # }
274
- # }
275
- #
276
- def generate_phonetic_cost_c_code(writer = STDOUT)
277
- # First, flatten the bytes of the runes (unicode codepoints encoded via
278
- # UTF-8) into single integers. We do this by adding the utf-8 values, each
279
- # multiplied by 10 * their byte number. The specific encoding doesn't
280
- # matter so long as it's:
281
- # * consistent
282
- # * has no collisions
283
- # * produces a value that's a valid C case conditional
284
- # * can be applied to runes of input strings later
285
- integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
286
- acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
287
- acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
288
- end)
289
- end
290
-
291
- # Then we print out C code full of switches
292
-
293
- writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
294
- float phonetic_cost(int a, int b) {
295
- // This is compiled from Ruby, using `String#unpack("U")` on each character
296
- // to retrieve the UTF-8 codepoint as a C long value.
297
- if (a == b) { return 0.0; };
298
- FUNC
299
- writer.puts ' switch (a) {'
300
- integer_distance_map.each do |(a, a_i), distances|
301
- writer.puts " case #{a_i}: // #{a}"
302
- writer.puts ' switch (b) {'
303
- distances.each do |(b, b_i), distance|
304
- writer.puts " case #{b_i}: // #{a}->#{b}"
305
- writer.puts " return (float) #{distance};"
306
- writer.puts ' break;'
307
- end
308
- writer.puts ' }'
309
- end
310
- writer.puts ' }'
311
- writer.puts ' return 1.0;'
312
- writer.puts '}'
313
- end
314
-
315
227
  private
316
228
 
317
229
  def _distance(phoneme1, phoneme2)
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../phonetics'
4
+ require 'json'
5
+
6
+ module Phonetics
7
+ class CodeGenerator
8
+ attr_reader :writer
9
+
10
+ def initialize(writer = STDOUT)
11
+ @writer = writer
12
+ end
13
+
14
+ def generate_phonetic_cost_c_code
15
+ generator = PhoneticCost.new(writer)
16
+ generator.generate
17
+ writer.flush
18
+ end
19
+
20
+ def generate_next_phoneme_length_c_code
21
+ generator = NextPhonemeLength.new(writer)
22
+ generator.generate
23
+ writer.flush
24
+ end
25
+
26
+ private
27
+
28
+ # Turn the bytes of all phonemes into a lookup trie where a sequence of
29
+ # bytes can find a phoneme in linear time.
30
+ def phoneme_byte_trie
31
+ phoneme_byte_trie_for(Phonetics.phonemes)
32
+ end
33
+
34
+ def phoneme_byte_trie_for(phonemes)
35
+ phonemes.each_with_object({}) do |phoneme, trie|
36
+ phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
37
+ subtrie[byte] ||= {}
38
+
39
+ # If we've reached the end of the byte string
40
+ if phoneme.bytes.length - 1 == idx
41
+ # Check if this is a duplicate lookup path. If there's a collision
42
+ # then this whole approach makes no sense.
43
+ if subtrie[byte].key?(:source)
44
+ source = subtrie[byte][:source]
45
+ raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
46
+ else
47
+ subtrie[byte][:source] = phoneme
48
+ end
49
+ end
50
+ subtrie[byte]
51
+ end
52
+ end
53
+ end
54
+
55
+ def ruby_source
56
+ location = caller_locations.first
57
+ "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
58
+ end
59
+
60
+ def describe(phoneme, depth)
61
+ indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
62
+ if Phonetics::Consonants.features.key?(phoneme)
63
+ indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
64
+ else
65
+ indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
66
+ end
67
+ end
68
+
69
+ def indent(depth, line)
70
+ write " #{' ' * depth}#{line}"
71
+ end
72
+
73
+ def write(line)
74
+ writer.puts line
75
+ end
76
+ end
77
+
78
+ class PhoneticCost < CodeGenerator
79
+ # We find the phonetic distance between two phonemes using a compiled
80
+ # lookup table. This is implemented as a set of nested switch statements.
81
+ # Hard to read when compiled, but simple to generate and fast at runtime.
82
+ #
83
+ # We generate a `phonetic_cost` function that takes six arguments: two
84
+ # strings, each with an offset and a phoneme length. Each span should be exactly
85
+ # one valid phoneme, which is possible thanks to the (also generated)
86
+ # next_phoneme_length() function.
87
+ #
88
+ # This will print a C code file with a function that implements a multi-level C
89
+ # switch like the following:
90
+ #
91
+ # switch (phoneme1_length) {
92
+ # case 2:
93
+ # switch(string1[1]) {
94
+ # case 201: // first byte of "ɪ"
95
+ # switch(string1[3]) {
96
+ # case 170: // second and final byte of "ɪ"
97
+ # // Phoneme: "ɪ", bytes: [201, 170]
98
+ # // vowel features: {"F1":300,"F2":2100,"rounded":false}
99
+ # switch(string2[6]) {
100
+ # case 105: // first and only byte of "i"
101
+ # // Phoneme: "i", bytes: [105]
102
+ # // vowel features: {"F1":240,"F2":2400,"rounded":false}
103
+ # return (float) 0.14355381904337383;
104
+ # break;
105
+ #
106
+ # the distance of ("ɪ", "i") is therefore 0.14355
107
+ #
108
+ def generate
109
+ write(<<-HEADER.gsub(/^ {6}/, ''))
110
+
111
+ // This is compiled from Ruby, in #{ruby_source}
112
+ #include <stdbool.h>
113
+ #include <stdio.h>
114
+ #include "./phonemes.h"
115
+ float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
116
+
117
+ HEADER
118
+
119
+ write ' switch (phoneme1_length) {'
120
+ by_byte_length.each do |length, phonemes|
121
+ write " case #{length}:"
122
+ switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
123
+ write ' break;'
124
+ end
125
+ write ' }'
126
+ write ' return (float) 1.0;'
127
+ write '};'
128
+ write ''
129
+ end
130
+
131
+ def switch_phoneme1(trie, depth = 0)
132
+ indent depth, "switch(string1[string1_offset + #{depth}]) {"
133
+ trie.each do |key, subtrie|
134
+ next if key == :source
135
+ next if subtrie.empty?
136
+
137
+ indent depth + 1, "case #{key}:"
138
+
139
+ phoneme1 = subtrie[:source]
140
+
141
+ # If this could be a match of a phoneme1 then find phoneme2
142
+ if phoneme1
143
+ # Add a comment to help understand the dataset
144
+ describe(phoneme1, depth + 2) if phoneme1
145
+
146
+ by_byte_length.each do |_, phonemes|
147
+ byte_trie = phoneme_byte_trie_for(phonemes)
148
+ next if byte_trie.empty?
149
+
150
+ switch_phoneme2(byte_trie, phoneme1, 0)
151
+ end
152
+ else
153
+ switch_phoneme1(subtrie, depth + 1)
154
+ end
155
+
156
+ indent depth + 2, 'break;'
157
+ end
158
+ indent depth, '}'
159
+ end
160
+
161
+ def switch_phoneme2(trie, previous_phoneme, depth = 0)
162
+ indent depth, "switch(string2[string2_offset + #{depth}]) {"
163
+ trie.each do |key, subtrie|
164
+ next if key == :source
165
+ next if subtrie.empty?
166
+
167
+ phoneme2 = subtrie[:source]
168
+
169
+ indent depth + 1, "case #{key}:"
170
+
171
+ if phoneme2
172
+ value = if previous_phoneme == phoneme2
173
+ 0.0
174
+ else
175
+ distance(previous_phoneme, phoneme2)
176
+ end
177
+ # Add a comment to help understand the dataset
178
+ describe(phoneme2, depth + 2)
179
+ indent depth + 2, "return (float) #{value};"
180
+ else
181
+ switch_phoneme2(subtrie, previous_phoneme, depth + 1)
182
+ end
183
+
184
+ indent depth + 2, 'break;'
185
+ end
186
+ indent depth, '}'
187
+ end
188
+
189
+ def by_byte_length
190
+ Phonetics.phonemes.group_by do |phoneme|
191
+ phoneme.bytes.length
192
+ end.sort_by(&:first)
193
+ end
194
+
195
+ def distance(p1, p2)
196
+ Phonetics.distance_map[p1][p2]
197
+ end
198
+ end
199
+
200
+ class NextPhonemeLength < CodeGenerator
201
+ # There's no simple way to break a string of IPA characters into phonemes.
202
+ # We do it by generating a function that, given a string of IPA characters,
203
+ # the starting index in that string, and the length of the string, returns
204
+ # the length of the next phoneme, or zero if none is found.
205
+ #
206
+ # Pseudocode:
207
+ # - return 0 if length - index == 0
208
+ # - switch on first byte, matching on possible first bytes of phonemes
209
+ # within the selected case statement:
210
+ # - return 1 if length - index == 1
211
+ # - switch on second byte, matching on possible second bytes of phonemes
212
+ # within the selected case statement:
213
+ # - return 2 if length - index == 2
214
+ # ...
215
+ # - default case: return 2 iff a phoneme terminates here
216
+ # - default case: return 1 iff a phoneme terminates here
217
+ # - return 0
218
+ #
219
+ def generate
220
+ write(<<-HEADER.gsub(/^ {6}/, ''))
221
+ // This is compiled from Ruby, in #{ruby_source}
222
+ int next_phoneme_length(int *string, int cursor, int length) {
223
+
224
+ int max_length;
225
+ max_length = length - cursor;
226
+
227
+ HEADER
228
+
229
+ next_phoneme_switch(phoneme_byte_trie, 0)
230
+
231
+ # If we fell through all the cases, return 0
232
+ write ' return 0;'
233
+ write '}'
234
+ end
235
+
236
+ private
237
+
238
+ # Recursively build switch statements for the body of next_phoneme_length
239
+ def next_phoneme_switch(trie, depth)
240
+ # switch (string[cursor + depth]) {
241
+ # case N: // for N in subtrie.keys
242
+ # // if a case statement matches the current byte AND there's chance
243
+ # // that a longer string might match, recurse.
244
+ # if (max_length > depth + 1) {
245
+ # // recurse
246
+ # }
247
+ # break;
248
+ # // if there's a :source key here then a phoneme terminates at this
249
+ # // point and this depth is a valid return value.
250
+ # default:
251
+ # return depth;
252
+ # break;
253
+ # }
254
+ indent depth, "switch(string[cursor + #{depth}]) {"
255
+ write ''
256
+ trie.each do |key, subtrie|
257
+ next if key == :source
258
+ next if subtrie.empty?
259
+
260
+ indent depth, "case #{key}:"
261
+
262
+ # Add a comment to help understand the dataset
263
+ describe(subtrie[:source], depth + 1) if subtrie[:source]
264
+
265
+ if subtrie.keys == [:source]
266
+ indent depth, " return #{depth + 1};"
267
+ else
268
+ indent depth, " if (max_length > #{depth + 1}) {"
269
+ next_phoneme_switch(subtrie, depth + 1)
270
+ indent depth, ' } else {'
271
+ indent depth, " return #{depth + 1};"
272
+ indent depth, ' }'
273
+ end
274
+
275
+ indent depth, ' break;'
276
+ end
277
+
278
+ if trie.key?(:source)
279
+ indent depth, ' default:'
280
+ indent depth, " return #{depth};"
281
+ end
282
+ indent depth, '}'
283
+ end
284
+ end
285
+ end
@@ -1,38 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../phonetics'
3
4
  require_relative 'c_levenshtein'
5
+
4
6
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
5
7
  # count used instead of a binary edit distance calculation
6
8
  #
7
- # This implementation is almost entirely taken from the damerau-levenshtein gem
9
+ # This implementation was dually inspired by the damerau-levenshtein gem
8
10
  # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
9
- # The implementation is modified based on "Using Phonologically Weighted
10
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
11
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
12
- # Aumont, 2016
11
+ # and "Using Phonologically Weighted Levenshtein Distances for the Prediction
12
+ # of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
13
+ # Farinas, Julien Pinquier, Xavier Aumont, 2016
13
14
  # https://hal.archives-ouvertes.fr/hal-01474904/document
14
15
  module Phonetics
15
16
  module Levenshtein
16
17
  extend ::PhoneticsLevenshteinCBinding
17
18
 
18
- def self.distance(str1, str2, verbose = false)
19
- ensure_is_phonetic!(str1, str2)
20
- internal_phonetic_distance(
21
- Phonetics.as_utf_8_long(str1),
22
- Phonetics.as_utf_8_long(str2),
23
- verbose
24
- )
19
+ def inspect_bytes(str)
20
+ puts "Rubyland str: #{str.inspect}"
21
+ puts "Rubyland bytes: #{str.bytes.inspect}"
22
+ testing_codepoints(str)
25
23
  end
26
24
 
27
- def self.ensure_is_phonetic!(str1, str2)
28
- [str1, str2].each do |string|
29
- string.chars.each do |char|
30
- unless Phonetics.phonemes.include?(char)
31
- msg = "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
32
- raise ArgumentError, msg
33
- end
34
- end
35
- end
25
+ def self.distance(str1, str2, verbose = false)
26
+ internal_phonetic_distance(str1, str2, verbose)
36
27
  end
37
28
  end
38
29
  end