phonetics 3.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -2
  3. data/Cargo.toml +27 -0
  4. data/Rakefile +58 -26
  5. data/VERSION +1 -1
  6. data/bin/phonetics +89 -0
  7. data/ext/phonetics_ruby/Cargo.toml +36 -0
  8. data/ext/phonetics_ruby/build.rs +24 -0
  9. data/ext/phonetics_ruby/extconf.rb +17 -0
  10. data/ext/phonetics_ruby/src/lib.rs +56 -0
  11. data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
  12. data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
  13. data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
  14. data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
  15. data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
  16. data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
  17. data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
  18. data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
  19. data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
  20. data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
  21. data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
  22. data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
  23. data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
  24. data/lib/phonetics.rb +77 -2
  25. data/phonetics.gemspec +33 -9
  26. metadata +45 -34
  27. data/.github/workflows/gempush.yml +0 -28
  28. data/.github/workflows/test.yml +0 -20
  29. data/Makefile +0 -9
  30. data/ext/c_levenshtein/extconf.rb +0 -10
  31. data/ext/c_levenshtein/levenshtein.c +0 -223
  32. data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
  33. data/ext/c_levenshtein/next_phoneme_length.h +0 -1
  34. data/ext/c_levenshtein/phonemes.c +0 -53
  35. data/ext/c_levenshtein/phonemes.h +0 -3
  36. data/ext/c_levenshtein/phonetic_cost.c +0 -88593
  37. data/ext/c_levenshtein/phonetic_cost.h +0 -1
  38. data/lib/phonetics/code_generator.rb +0 -228
  39. data/lib/phonetics/distances.rb +0 -249
  40. data/lib/phonetics/levenshtein.rb +0 -27
  41. data/lib/phonetics/ruby_levenshtein.rb +0 -162
@@ -1 +0,0 @@
1
- float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
@@ -1,228 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
- require 'json'
5
-
6
- module Phonetics
7
- class CodeGenerator
8
- attr_reader :writer
9
-
10
- def initialize(writer = $stdout)
11
- @writer = writer
12
- end
13
-
14
- def generate_phonetic_cost_c_code
15
- generator = PhoneticCost.new(writer)
16
- generator.generate
17
- writer.flush
18
- end
19
-
20
- def generate_next_phoneme_length_c_code
21
- generator = NextPhonemeLength.new(writer)
22
- generator.generate
23
- writer.flush
24
- end
25
-
26
- private
27
-
28
- def binary(str)
29
- "0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
30
- end
31
-
32
- # Turn the bytes of all phonemes into a lookup trie where a sequence of
33
- # bytes can find a phoneme in linear time.
34
- def phoneme_byte_trie
35
- phoneme_byte_trie_for(Phonetics.phonemes)
36
- end
37
-
38
- def phoneme_byte_trie_for(phonemes)
39
- phonemes.each_with_object({}) do |phoneme, trie|
40
- phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
41
- subtrie[byte] ||= {}
42
-
43
- # If we've reached the end of the byte string
44
- if phoneme.bytes.length - 1 == idx
45
- # Check if this is a duplicate lookup path. If there's a collision
46
- # then this whole approach makes no sense.
47
- if subtrie[byte].key?(:source)
48
- source = subtrie[byte][:source]
49
- raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
50
- else
51
- subtrie[byte][:source] = phoneme
52
- end
53
- end
54
- subtrie[byte]
55
- end
56
- end
57
- end
58
-
59
- def describe(phoneme, depth = 0)
60
- indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
61
- if Phonetics::Consonants.features.key?(phoneme)
62
- indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
63
- else
64
- indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
65
- end
66
- end
67
-
68
- def ruby_source
69
- location = caller_locations.first
70
- "#{location.path.split('/')[-4..].join('/')}:#{location.lineno}"
71
- end
72
-
73
- def indent(depth, line)
74
- write " #{' ' * depth}#{line}"
75
- end
76
-
77
- def write(line)
78
- writer.puts line
79
- end
80
- end
81
-
82
- class PhoneticCost < CodeGenerator
83
- # We find the phonetic distance between two phonemes using a compiled
84
- # lookup table. This is implemented as a set of nested switch statements.
85
- # Hard to read when compiled, but simple to generate and fast at runtime.
86
- #
87
- # We generate a `phonetic_cost` function that takes two arguments: two
87
- # phonemes, each passed as an int64_t built from that phoneme's bytes. Each
88
- # argument should be exactly one valid phoneme, which is possible thanks to
89
- # the (also generated) next_phoneme_length() function.
91
- #
92
- # This will print a C code file with a function that implements a multi-level C
93
- # switch like the following:
94
- #
95
- # switch (phoneme1) {
96
- # case 'ɪ': // two bytes: [201, 170]
97
- # // vowel features: {"F1":300,"F2":2100,"rounded":false}
98
- #
99
- # switch(phoneme2) {
100
- # 'i': // one byte: [105]
101
- # // vowel features: {"F1":240,"F2":2400,"rounded":false}
102
- # return (float) 0.14355381904337383;
103
- # break;
104
- #
105
- # the distance of ("ɪ", "i") is therefore 0.14355
106
- #
107
- def generate
108
- write(<<-HEADER.gsub(/^ {6}/, ''))
109
-
110
- // This is compiled from Ruby, in #{ruby_source}
111
- #include <stdint.h>
112
- #include <stdio.h>
113
- #include <inttypes.h>
114
- float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
115
- if (phoneme1 == phoneme2) {
116
- return (float) 0.0;
117
- }
118
-
119
- HEADER
120
-
121
- write ' switch (phoneme1) {'
122
- Phonetics.phonemes.each do |phoneme1|
123
- write " case #{binary(phoneme1)}:"
124
- describe(phoneme1, 2)
125
- write ' switch(phoneme2) {'
126
- Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
127
- write " case #{binary(phoneme2)}:"
128
- describe(phoneme2, 6)
129
- write " return (float) #{distance};"
130
- write ' break;'
131
- end
132
- write ' }'
133
- write ' break;'
134
- end
135
- write ' }'
136
- write ' return (float) 1.0;'
137
- write '};'
138
- write ''
139
- end
140
- end
141
-
142
- class NextPhonemeLength < CodeGenerator
143
- # There's no simple way to break a string of IPA characters into phonemes.
144
- # We do it by generating a function that, given a string of IPA characters,
145
- # the starting index in that string, and the length of the string, returns
146
- # the length of the next phoneme, or zero if none is found.
147
- #
148
- # Pseudocode:
149
- # - return 0 if length - index == 0
150
- # - switch on first byte, matching on possible first bytes of phonemes
151
- # within the selected case statement:
152
- # - return 1 if length - index == 1
153
- # - switch on second byte, matching on possible second bytes of phonemes
154
- # within the selected case statement:
155
- # - return 2 if length - index == 2
156
- # ...
157
- # - default case: return 2 iff a phoneme terminates here
158
- # - default case: return 1 iff a phoneme terminates here
159
- # - return 0
160
- #
161
- def generate
162
- write(<<-HEADER.gsub(/^ {6}/, ''))
163
- // This is compiled from Ruby, in #{ruby_source}
164
- #include <stdio.h>
165
- int next_phoneme_length(int *string, int cursor, int length) {
166
-
167
- int max_length;
168
- max_length = length - cursor;
169
-
170
- HEADER
171
-
172
- next_phoneme_switch(phoneme_byte_trie, 0)
173
-
174
- # If we fell through all the cases, return 0
175
- write ' return 0;'
176
- write '}'
177
- end
178
-
179
- private
180
-
181
- # Recursively build switch statements for the body of next_phoneme_length
182
- def next_phoneme_switch(trie, depth)
183
- # switch (string[cursor + depth]) {
184
- # case N: // for N in subtrie.keys
185
- # // if a case statement matches the current byte AND there's chance
186
- # // that a longer string might match, recurse.
187
- # if (max_length >= depth) {
188
- # // recurse
189
- # }
190
- # break;
191
- # // if there's a :source key here then a phoneme terminates at this
192
- # // point and this depth is a valid return value.
193
- # default:
194
- # return depth;
195
- # break;
196
- # }
197
- indent depth, "switch(string[cursor + #{depth}]) {"
198
- write ''
199
- trie.each do |key, subtrie|
200
- next if key == :source
201
- next if subtrie.empty?
202
-
203
- indent depth, "case #{key}:"
204
-
205
- # Add a comment to help understand the dataset
206
- describe(subtrie[:source], depth + 1) if subtrie[:source]
207
-
208
- if subtrie.keys == [:source]
209
- indent depth, " return #{depth + 1};"
210
- else
211
- indent depth, " if (max_length > #{depth + 1}) {"
212
- next_phoneme_switch(subtrie, depth + 1)
213
- indent depth, ' } else {'
214
- indent depth, " return #{depth + 1};"
215
- indent depth, ' }'
216
- end
217
-
218
- indent depth, ' break;'
219
- end
220
-
221
- if trie.key?(:source)
222
- indent depth, ' default:'
223
- indent depth, " return #{depth};"
224
- end
225
- indent depth, '}'
226
- end
227
- end
228
- end
@@ -1,249 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'delegate'
4
- require 'set'
5
-
6
- module Phonetics
7
- extend self
8
-
9
- # This subclass of the stdlib's String allows us to iterate over each phoneme
10
- # in a string without monkeypatching
11
- #
12
- # Usage:
13
- # Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
14
- # => ["w", "ə", "t", "ɛ", "v", "ɝ"]
15
- class String < SimpleDelegator
16
- # Group all phonemes by how many characters they have. Use this to walk
17
- # through a string finding phonemes (looking for longest ones first)
18
- def self.phonemes_by_length
19
- @phonemes_by_length ||= Phonetics.phonemes.each_with_object(
20
- # This relies on the implicit stable key ordering of Hash objects in Ruby
21
- # 2+ to keep the keys in descending order.
22
- 4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new
23
- ) do |str, acc|
24
- acc[str.chars.size] << str
25
- end
26
- end
27
-
28
- def each_phoneme
29
- idx = 0
30
- Enumerator.new do |y|
31
- while idx < chars.length
32
- found = false
33
- self.class.phonemes_by_length.each do |size, phonemes|
34
- next unless idx + size <= chars.length
35
-
36
- candidate = chars[idx..idx + size - 1].join
37
- next unless phonemes.include?(candidate)
38
-
39
- y.yield candidate
40
- idx += size
41
- found = true
42
- break
43
- end
44
- idx += 1 unless found
45
- end
46
- end
47
- end
48
- end
49
-
50
- module Vowels
51
- extend self
52
-
53
- FormantFrequencies = {
54
- # https://en.wikipedia.org/wiki/Formant#Phonetics
55
- 'i' => { F1: 240, F2: 2400, rounded: false },
56
- 'y' => { F1: 235, F2: 2100, rounded: false },
57
- 'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing From other vowels
58
- 'e' => { F1: 390, F2: 2300, rounded: false },
59
- 'ø' => { F1: 370, F2: 1900, rounded: true },
60
- 'ɛ' => { F1: 610, F2: 1900, rounded: false },
61
- 'œ' => { F1: 585, F2: 1710, rounded: true },
62
- 'a' => { F1: 850, F2: 1610, rounded: false },
63
- 'ɶ' => { F1: 820, F2: 1530, rounded: true },
64
- 'ɑ' => { F1: 750, F2: 940, rounded: false },
65
- 'ɒ' => { F1: 700, F2: 760, rounded: true },
66
-
67
- 'ʌ' => { F1: 600, F2: 1170, rounded: false },
68
- # copying 'ʌ' for other mid-vowel formants
69
- 'ə' => { F1: 600, F2: 1170, rounded: false },
70
- 'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },
71
-
72
- 'ɔ' => { F1: 500, F2: 700, rounded: true },
73
- 'ɤ' => { F1: 460, F2: 1310, rounded: false },
74
- 'o' => { F1: 360, F2: 640, rounded: true },
75
- 'ɯ' => { F1: 300, F2: 1390, rounded: false },
76
- 'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing From other vowels
77
- 'u' => { F1: 350, F2: 650, rounded: true }, # Guessing From other vowels
78
- 'ʊ' => { F1: 350, F2: 650, rounded: true },
79
- # Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
80
- }.freeze
81
-
82
- def phonemes
83
- @phonemes ||= FormantFrequencies.keys
84
- end
85
-
86
- # Given two vowels, calculate the (pythagorean) distance between them using
87
- # their F1 and F2 frequencies as x/y coordinates.
88
- # The return value is scaled to a value between 0 and 1
89
- # TODO: account for rhoticity (F3)
90
- def distance(phoneme1, phoneme2)
91
- formants1 = FormantFrequencies.fetch(phoneme1)
92
- formants2 = FormantFrequencies.fetch(phoneme2)
93
-
94
- @minmax_f1 ||= FormantFrequencies.values.minmax { |a, b| a[:F1] <=> b[:F1] }.map { |h| h[:F1] }
95
- @minmax_f2 ||= FormantFrequencies.values.minmax { |a, b| a[:F2] <=> b[:F2] }.map { |h| h[:F2] }
96
-
97
- # Get an x and y value for each input phoneme scaled between 0.0 and 1.0
98
- # We'll use the scaled f1 as the 'x' and the scaled f2 as the 'y'
99
- scaled_phoneme1_f1 = (formants1[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
100
- scaled_phoneme1_f2 = (formants1[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
101
- scaled_phoneme2_f1 = (formants2[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
102
- scaled_phoneme2_f2 = (formants2[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
103
-
104
- f1_distance = (scaled_phoneme1_f1 - scaled_phoneme2_f1).abs
105
- f2_distance = (scaled_phoneme1_f2 - scaled_phoneme2_f2).abs
106
-
107
- # When we have four values we can use the pythagorean theorem on them
108
- # (order doesn't matter)
109
- sqrt = Math.sqrt((f1_distance**2) + (f2_distance**2))
110
-
111
- # Vowels are more similar to each other than consonants, so we apply a
112
- # penalty softening here
113
- sqrt / 2.0
114
- end
115
- end
116
-
117
- module Consonants
118
- extend self
119
-
120
- # This chart (columns 2 through the end, anyway) is a direct port of
121
- # https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
122
- # We store the consonant table in this format to make updating it easier.
123
- #
124
- # rubocop:disable Layout/TrailingWhitespace
125
- ChartData = %( | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
126
- Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
127
- Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
128
- Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
129
- Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
130
- Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
131
- Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
132
- Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
133
- Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
134
- Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
135
- Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
136
- )
137
- # rubocop:enable Layout/TrailingWhitespace
138
-
139
- # rubocop:disable Metrics/CyclomaticComplexity
140
- # rubocop:disable Metrics/PerceivedComplexity
141
- # Parse the ChartData into a lookup table where we can retrieve attributes
142
- # for each phoneme
143
- def features
144
- @features ||= begin
145
- header, *manners = ChartData.lines
146
-
147
- _, *positions = header.chomp.split(' | ')
148
- positions.map(&:strip!)
149
-
150
- # Remove any trailing blank lines
151
- manners.pop while manners.last.to_s.strip.empty?
152
-
153
- position_indexes = Hash[*positions.each_with_index.to_a.flatten]
154
-
155
- @position_count = positions.size
156
-
157
- manners.each_with_object({}) do |row, phonemes|
158
- manner, *columns = row.chomp.split(' | ')
159
- manner.strip!
160
- positions.zip(columns).each do |position, phoneme_text|
161
- data = {
162
- position: position,
163
- position_index: position_indexes[position],
164
- manner: manner,
165
- }
166
- # If there is a character in the first byte then this articulation
167
- # has a voiceless phoneme. The symbol may use additional characters
168
- # as part of the phoneme symbol.
169
- unless phoneme_text[0] == ' '
170
- # Take the first non-blank character string
171
- symbol = phoneme_text.chars.take_while { |char| char != ' ' }.join
172
- phoneme_text = phoneme_text[symbol.chars.size..]
173
-
174
- phonemes[symbol] = data.merge(voiced: false)
175
- end
176
- # If there's a character anywhere left in the string then this
177
- # articulation has a voiced phoneme
178
- unless phoneme_text.strip.empty?
179
- symbol = phoneme_text.strip
180
- phonemes[symbol] = data.merge(voiced: true)
181
- end
182
- end
183
- end
184
- end
185
- end
186
- # rubocop:enable Metrics/CyclomaticComplexity
187
- # rubocop:enable Metrics/PerceivedComplexity
188
-
189
- def phonemes
190
- @phonemes ||= features.keys
191
- end
192
-
193
- # Given two consonants, calculate their difference by summing the
194
- # following:
195
- # * 0.3 if they are not voiced the same
196
- # * 0.3 if they are different manners
197
- # * Up to 0.4 if they are the maximum position difference
198
- def distance(phoneme1, phoneme2)
199
- features1 = features[phoneme1]
200
- features2 = features[phoneme2]
201
-
202
- penalty = 0
203
- penalty += 0.3 if features1[:voiced] != features2[:voiced]
204
-
205
- penalty += 0.3 if features1[:manner] != features2[:manner]
206
-
207
- # Use up to the remaining 0.4 for penalizing differences in position
208
- penalty += 0.4 * ((features1[:position_index] - features2[:position_index]).abs / @position_count.to_f)
209
- penalty
210
- end
211
- end
212
-
213
- def phonemes
214
- Vowels.phonemes + Consonants.phonemes
215
- end
216
-
217
- Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
218
- Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
219
- )
220
-
221
- def distance(phoneme1, phoneme2)
222
- return 0 if phoneme1 == phoneme2
223
-
224
- distance_map.fetch(phoneme1).fetch(phoneme2)
225
- end
226
-
227
- def distance_map
228
- @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
229
- p1, p2 = *pair
230
- score = _distance(p1, p2)
231
- scores[p1][p2] = score
232
- scores[p2][p1] = score
233
- end
234
- end
235
-
236
- private
237
-
238
- def _distance(phoneme1, phoneme2)
239
- types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
240
- case types
241
- when %i[consonant vowel]
242
- 1.0
243
- when %i[vowel vowel]
244
- Vowels.distance(phoneme1, phoneme2)
245
- when %i[consonant consonant]
246
- Consonants.distance(phoneme1, phoneme2)
247
- end
248
- end
249
- end
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
- require_relative 'c_levenshtein'
5
-
6
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
7
- # count used instead of a binary edit distance calculation
8
- #
9
- # This implementation was dually inspired by the damerau-levenshtein gem
10
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
11
- # and "Using Phonologically Weighted Levenshtein Distances for the Prediction
12
- # of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
13
- # Farinas, Julien Pinquier, Xavier Aumont, 2016
14
- # https://hal.archives-ouvertes.fr/hal-01474904/document
15
- module Phonetics
16
- module Levenshtein
17
- extend ::PhoneticsLevenshteinCBinding
18
-
19
- # rubocop:disable Style/OptionalBooleanParameter
20
- def self.distance(str1, str2, verbose = false)
21
- return if str1.nil? || str2.nil?
22
-
23
- internal_phonetic_distance(str1, str2, verbose)
24
- end
25
- # rubocop:enable Style/OptionalBooleanParameter
26
- end
27
- end
@@ -1,162 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
-
5
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
6
- # count used instead of a binary edit distance calculation
7
- #
8
- # This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
9
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
10
- # The implementation is modified based on "Using Phonologically Weighted
11
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
12
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
13
- # Aumont, 2016
14
- # https://hal.archives-ouvertes.fr/hal-01474904/document
15
- module Phonetics
16
- class RubyLevenshtein
17
- attr_reader :str1, :str2, :len1, :len2, :matrix
18
-
19
- # rubocop:disable Style/OptionalBooleanParameter
20
- def initialize(ipa_str1, ipa_str2, verbose = false)
21
- @str1 = ipa_str1.each_char.select { |c| Phonetics.phonemes.include?(c) }.join
22
- @str2 = ipa_str2.each_char.select { |c| Phonetics.phonemes.include?(c) }.join
23
- @len1 = @str1.size
24
- @len2 = @str2.size
25
- @verbose = verbose
26
- prepare_matrix
27
- set_edit_distances(@str1, @str2)
28
- end
29
-
30
- def self.distance(str1, str2, verbose = false)
31
- new(str1, str2, verbose).distance
32
- end
33
- # rubocop:enable Style/OptionalBooleanParameter
34
-
35
- def distance
36
- return 0 if walk.empty?
37
-
38
- print_matrix if @verbose
39
- walk.last[:distance]
40
- end
41
-
42
- private
43
-
44
- def walk
45
- res = []
46
- i = len2
47
- j = len1
48
- return res if i == 0 && j == 0
49
-
50
- loop do
51
- i, j, char = char_data(i, j)
52
- res.unshift char
53
- break if i == 0 || j == 0
54
- end
55
- res
56
- end
57
-
58
- def set_edit_distances(str1, str2)
59
- i = 0
60
- while (i += 1) <= len2
61
- j = 0
62
- while (j += 1) <= len1
63
- options = [
64
- ins(i, j),
65
- del(i, j),
66
- subst(i, j)
67
- ]
68
- # This is where we implement the modifications to Damerau-Levenshtein
69
- # according to https://hal.archives-ouvertes.fr/hal-01474904/document
70
- phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
71
- matrix[i][j] = options.min + phonetic_cost
72
- puts "------- #{j}/#{i} #{j + (i * (len1 + 1))}" if @verbose
73
- print_matrix if @verbose
74
- end
75
- end
76
- end
77
-
78
- def char_data(i, j)
79
- char = { distance: matrix[i][j] }
80
- operation, move = find_previous(i, j)
81
- previous_value = move[:value]
82
- char[:type] = previous_value == char[:distance] ? :same : operation
83
- i, j = move[:move_to]
84
- [i, j, char]
85
- end
86
-
87
- def find_previous(i, j)
88
- [
89
- [:insert, { cost: ins(i, j), move_to: [i, j - 1] }],
90
- [:delete, { cost: del(i, j), move_to: [i, j - 1] }],
91
- [:substitute, { cost: subst(i, j), move_to: [i, j - 1] }]
92
- ].select do |_operation, data|
93
- # Don't send us out of bounds
94
- data[:move_to][0] >= 0 && data[:move_to][1] >= 0
95
- end.min_by do |_operation, data|
96
- # pick the cheapest one
97
- data[:value]
98
- end
99
- end
100
-
101
- # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
102
- def del(i, j)
103
- matrix[i - 1][j]
104
- end
105
-
106
- def ins(i, j)
107
- matrix[i][j - 1]
108
- end
109
-
110
- def subst(i, j)
111
- matrix[i - 1][j - 1]
112
- end
113
-
114
- # Set the minimum scores equal to the distance between each phoneme,
115
- # sequentially.
116
- #
117
- # The first value is always zero, the second is always 1.
118
- # Subsequent values are the cumulative phonetic distance between each
119
- # phoneme within the same string.
120
- # "aek" -> [0, 1, 1.61, 2.61]
121
- def initial_distances(str1, str2)
122
- starting_distance = 1
123
- starting_distance = 0 if len1 == 0 || len2 == 0
124
-
125
- distances1 = (1..(str1.length - 1)).reduce([0, starting_distance]) do |acc, i|
126
- acc << acc.last + Phonetics.distance(str1[i - 1], str1[i])
127
- end
128
- distances2 = (1..(str2.length - 1)).reduce([0, starting_distance]) do |acc, i|
129
- acc << acc.last + Phonetics.distance(str2[i - 1], str2[i])
130
- end
131
-
132
- [distances1, distances2]
133
- end
134
-
135
- def prepare_matrix
136
- str1_initial, str2_initial = initial_distances(str1, str2)
137
-
138
- @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) { nil } }
139
- # The first row is the initial values for str1
140
- @matrix[0] = str1_initial
141
- # The first column is the initial values for str2
142
- (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
143
- end
144
-
145
- # This is a helper method for developers to use when exploring this
146
- # algorithm.
147
- def print_matrix
148
- puts " #{str1.chars.map { |c| c.ljust(9, ' ') }.join}"
149
- matrix.each_with_index do |row, ridx|
150
- print ' ' if ridx == 0
151
- print "#{str2[ridx - 1]} " if ridx > 0
152
- row.each_with_index do |cell, _cidx|
153
- cell ||= 0.0
154
- print cell.to_s[0, 8].ljust(8, '0')
155
- print ' '
156
- end
157
- puts ''
158
- end
159
- ''
160
- end
161
- end
162
- end