phonetics 3.0.9 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -2
  3. data/CHANGELOG +4 -0
  4. data/Cargo.toml +27 -0
  5. data/Rakefile +58 -26
  6. data/VERSION +1 -1
  7. data/bin/phonetics +89 -0
  8. data/ext/phonetics_ruby/Cargo.toml +36 -0
  9. data/ext/phonetics_ruby/build.rs +24 -0
  10. data/ext/phonetics_ruby/extconf.rb +17 -0
  11. data/ext/phonetics_ruby/src/lib.rs +56 -0
  12. data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
  13. data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
  14. data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
  15. data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
  16. data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
  17. data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
  18. data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
  19. data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
  20. data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
  21. data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
  22. data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
  23. data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
  24. data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
  25. data/lib/phonetics.rb +77 -2
  26. data/phonetics.gemspec +33 -9
  27. metadata +46 -34
  28. data/.github/workflows/gempush.yml +0 -28
  29. data/.github/workflows/test.yml +0 -20
  30. data/Makefile +0 -6
  31. data/ext/c_levenshtein/extconf.rb +0 -10
  32. data/ext/c_levenshtein/levenshtein.c +0 -223
  33. data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
  34. data/ext/c_levenshtein/next_phoneme_length.h +0 -1
  35. data/ext/c_levenshtein/phonemes.c +0 -53
  36. data/ext/c_levenshtein/phonemes.h +0 -3
  37. data/ext/c_levenshtein/phonetic_cost.c +0 -88593
  38. data/ext/c_levenshtein/phonetic_cost.h +0 -1
  39. data/lib/phonetics/code_generator.rb +0 -228
  40. data/lib/phonetics/distances.rb +0 -245
  41. data/lib/phonetics/levenshtein.rb +0 -27
  42. data/lib/phonetics/ruby_levenshtein.rb +0 -162
@@ -1 +0,0 @@
1
- float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
@@ -1,228 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
- require 'json'
5
-
6
- module Phonetics
7
- class CodeGenerator
8
- attr_reader :writer
9
-
10
- def initialize(writer = $stdout)
11
- @writer = writer
12
- end
13
-
14
- def generate_phonetic_cost_c_code
15
- generator = PhoneticCost.new(writer)
16
- generator.generate
17
- writer.flush
18
- end
19
-
20
- def generate_next_phoneme_length_c_code
21
- generator = NextPhonemeLength.new(writer)
22
- generator.generate
23
- writer.flush
24
- end
25
-
26
- private
27
-
28
- def binary(str)
29
- "0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
30
- end
31
-
32
- # Turn the bytes of all phonemes into a lookup trie where a sequence of
33
- # bytes can find a phoneme in linear time.
34
- def phoneme_byte_trie
35
- phoneme_byte_trie_for(Phonetics.phonemes)
36
- end
37
-
38
- def phoneme_byte_trie_for(phonemes)
39
- phonemes.each_with_object({}) do |phoneme, trie|
40
- phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
41
- subtrie[byte] ||= {}
42
-
43
- # If we've reached the end of the byte string
44
- if phoneme.bytes.length - 1 == idx
45
- # Check if this is a duplicate lookup path. If there's a collision
46
- # then this whole approach makes no sense.
47
- if subtrie[byte].key?(:source)
48
- source = subtrie[byte][:source]
49
- raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
50
- else
51
- subtrie[byte][:source] = phoneme
52
- end
53
- end
54
- subtrie[byte]
55
- end
56
- end
57
- end
58
-
59
- def describe(phoneme, depth = 0)
60
- indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
61
- if Phonetics::Consonants.features.key?(phoneme)
62
- indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
63
- else
64
- indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
65
- end
66
- end
67
-
68
- def ruby_source
69
- location = caller_locations.first
70
- "#{location.path.split('/')[-4..].join('/')}:#{location.lineno}"
71
- end
72
-
73
- def indent(depth, line)
74
- write " #{' ' * depth}#{line}"
75
- end
76
-
77
- def write(line)
78
- writer.puts line
79
- end
80
- end
81
-
82
- class PhoneticCost < CodeGenerator
83
- # We find the phonetic distance between two phonemes using a compiled
84
- # lookup table. This is implemented as a set of nested switch statements.
85
- # Hard to read when compiled, but simple to generate and fast at runtime.
86
- #
87
- # We generate a `phonetic_cost` function that takes two arguments: the two
88
- # phonemes to compare, each packed into an int64_t. Each argument should be exactly
89
- # one valid phoneme, which is possible thanks to the (also generated)
90
- # next_phoneme_length() function.
91
- #
92
- # This will print a C code file with a function that implements a multi-level C
93
- # switch like the following:
94
- #
95
- # switch (phoneme1) {
96
- # case 'ɪ': // two bytes: [201, 170]
97
- # // vowel features: {"F1":300,"F2":2100,"rounded":false}
98
- #
99
- # switch(phoneme2) {
100
- # 'i': // one byte: [105]
101
- # // vowel features: {"F1":240,"F2":2400,"rounded":false}
102
- # return (float) 0.14355381904337383;
103
- # break;
104
- #
105
- # the distance of ("ɪ", "i") is therefore 0.14355
106
- #
107
- def generate
108
- write(<<-HEADER.gsub(/^ {6}/, ''))
109
-
110
- // This is compiled from Ruby, in #{ruby_source}
111
- #include <stdint.h>
112
- #include <stdio.h>
113
- #include <inttypes.h>
114
- float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
115
- if (phoneme1 == phoneme2) {
116
- return (float) 0.0;
117
- }
118
-
119
- HEADER
120
-
121
- write ' switch (phoneme1) {'
122
- Phonetics.phonemes.each do |phoneme1|
123
- write " case #{binary(phoneme1)}:"
124
- describe(phoneme1, 2)
125
- write ' switch(phoneme2) {'
126
- Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
127
- write " case #{binary(phoneme2)}:"
128
- describe(phoneme2, 6)
129
- write " return (float) #{distance};"
130
- write ' break;'
131
- end
132
- write ' }'
133
- write ' break;'
134
- end
135
- write ' }'
136
- write ' return (float) 1.0;'
137
- write '};'
138
- write ''
139
- end
140
- end
141
-
142
- class NextPhonemeLength < CodeGenerator
143
- # There's no simple way to break a string of IPA characters into phonemes.
144
- # We do it by generating a function that, given a string of IPA characters,
145
- # the starting index in that string, and the length of the string, returns
146
- # the length of the next phoneme, or zero if none is found.
147
- #
148
- # Pseudocode:
149
- # - return 0 if length - index == 0
150
- # - switch on first byte, matching on possible first bytes of phonemes
151
- # within the selected case statement:
152
- # - return 1 if length - index == 1
153
- # - switch on second byte, matching on possible second bytes of phonemes
154
- # within the selected case statement:
155
- # - return 2 if length - index == 2
156
- # ...
157
- # - default case: return 2 iff a phoneme terminates here
158
- # - default case: return 1 iff a phoneme terminates here
159
- # - return 0
160
- #
161
- def generate
162
- write(<<-HEADER.gsub(/^ {6}/, ''))
163
- // This is compiled from Ruby, in #{ruby_source}
164
- #include <stdio.h>
165
- int next_phoneme_length(int *string, int cursor, int length) {
166
-
167
- int max_length;
168
- max_length = length - cursor;
169
-
170
- HEADER
171
-
172
- next_phoneme_switch(phoneme_byte_trie, 0)
173
-
174
- # If we fell through all the cases, return 0
175
- write ' return 0;'
176
- write '}'
177
- end
178
-
179
- private
180
-
181
- # Recursively build switch statements for the body of next_phoneme_length
182
- def next_phoneme_switch(trie, depth)
183
- # switch (string[cursor + depth]) {
184
- # case N: // for N in subtrie.keys
185
- # // if a case statement matches the current byte AND there's chance
186
- # // that a longer string might match, recurse.
187
- # if (max_length >= depth) {
188
- # // recurse
189
- # }
190
- # break;
191
- # // if there's a :source key here then a phoneme terminates at this
192
- # // point and this depth is a valid return value.
193
- # default:
194
- # return depth;
195
- # break;
196
- # }
197
- indent depth, "switch(string[cursor + #{depth}]) {"
198
- write ''
199
- trie.each do |key, subtrie|
200
- next if key == :source
201
- next if subtrie.empty?
202
-
203
- indent depth, "case #{key}:"
204
-
205
- # Add a comment to help understand the dataset
206
- describe(subtrie[:source], depth + 1) if subtrie[:source]
207
-
208
- if subtrie.keys == [:source]
209
- indent depth, " return #{depth + 1};"
210
- else
211
- indent depth, " if (max_length > #{depth + 1}) {"
212
- next_phoneme_switch(subtrie, depth + 1)
213
- indent depth, ' } else {'
214
- indent depth, " return #{depth + 1};"
215
- indent depth, ' }'
216
- end
217
-
218
- indent depth, ' break;'
219
- end
220
-
221
- if trie.key?(:source)
222
- indent depth, ' default:'
223
- indent depth, " return #{depth};"
224
- end
225
- indent depth, '}'
226
- end
227
- end
228
- end
@@ -1,245 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'delegate'
4
- require 'set'
5
-
6
- module Phonetics
7
- extend self
8
-
9
- # This subclass of the stdlib's String allows us to iterate over each phoneme
10
- # in a string without monkeypatching
11
- #
12
- # Usage:
13
- # Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
14
- # => ["w", "ə", "t", "ɛ", "v", "ɝ"]
15
- class String < SimpleDelegator
16
- # Group all phonemes by how many characters they have. Use this to walk
17
- # through a string finding phonemes (looking for longest ones first)
18
- def self.phonemes_by_length
19
- @phonemes_by_length ||= Phonetics.phonemes.each_with_object(
20
- # This relies on the implicit stable key ordering of Hash objects in Ruby
21
- # 2+ to keep the keys in descending order.
22
- 4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new
23
- ) do |str, acc|
24
- acc[str.chars.size] << str
25
- end
26
- end
27
-
28
- def each_phoneme
29
- idx = 0
30
- Enumerator.new do |y|
31
- while idx < chars.length
32
- found = false
33
- self.class.phonemes_by_length.each do |size, phonemes|
34
- next unless idx + size <= chars.length
35
-
36
- candidate = chars[idx..idx + size - 1].join
37
- next unless phonemes.include?(candidate)
38
-
39
- y.yield candidate
40
- idx += size
41
- found = true
42
- break
43
- end
44
- idx += 1 unless found
45
- end
46
- end
47
- end
48
- end
49
-
50
- module Vowels
51
- extend self
52
-
53
- FormantFrequencies = {
54
- # https://en.wikipedia.org/wiki/Formant#Phonetics
55
- 'i' => { F1: 240, F2: 2400, rounded: false },
56
- 'y' => { F1: 235, F2: 2100, rounded: false },
57
- 'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing From other vowels
58
- 'e' => { F1: 390, F2: 2300, rounded: false },
59
- 'ø' => { F1: 370, F2: 1900, rounded: true },
60
- 'ɛ' => { F1: 610, F2: 1900, rounded: false },
61
- 'œ' => { F1: 585, F2: 1710, rounded: true },
62
- 'a' => { F1: 850, F2: 1610, rounded: false },
63
- 'ɶ' => { F1: 820, F2: 1530, rounded: true },
64
- 'ɑ' => { F1: 750, F2: 940, rounded: false },
65
- 'ɒ' => { F1: 700, F2: 760, rounded: true },
66
-
67
- 'ʌ' => { F1: 600, F2: 1170, rounded: false },
68
- # copying 'ʌ' for other mid-vowel formants
69
- 'ə' => { F1: 600, F2: 1170, rounded: false },
70
- 'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },
71
-
72
- 'ɔ' => { F1: 500, F2: 700, rounded: true },
73
- 'ɤ' => { F1: 460, F2: 1310, rounded: false },
74
- 'o' => { F1: 360, F2: 640, rounded: true },
75
- 'ɯ' => { F1: 300, F2: 1390, rounded: false },
76
- 'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing From other vowels
77
- 'u' => { F1: 350, F2: 650, rounded: true }, # Guessing From other vowels
78
- 'ʊ' => { F1: 350, F2: 650, rounded: true },
79
- # Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
80
- }.freeze
81
-
82
- def phonemes
83
- @phonemes ||= FormantFrequencies.keys
84
- end
85
-
86
- # Given two vowels, calculate the (pythagorean) distance between them using
87
- # their F1 and F2 frequencies as x/y coordinates.
88
- # The return value is scaled to a value between 0 and 1
89
- # TODO: account for rhoticity (F3)
90
- def distance(phoneme1, phoneme2)
91
- formants1 = FormantFrequencies.fetch(phoneme1)
92
- formants2 = FormantFrequencies.fetch(phoneme2)
93
-
94
- @minmax_f1 ||= FormantFrequencies.values.minmax { |a, b| a[:F1] <=> b[:F1] }.map { |h| h[:F1] }
95
- @minmax_f2 ||= FormantFrequencies.values.minmax { |a, b| a[:F2] <=> b[:F2] }.map { |h| h[:F2] }
96
-
97
- # Get an x and y value for each input phoneme scaled between 0.0 and 1.0
98
- # We'll use the scaled f1 as the 'x' and the scaled f2 as the 'y'
99
- scaled_phoneme1_f1 = (formants1[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
100
- scaled_phoneme1_f2 = (formants1[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
101
- scaled_phoneme2_f1 = (formants2[:F1] - @minmax_f1[0]) / @minmax_f1[1].to_f
102
- scaled_phoneme2_f2 = (formants2[:F2] - @minmax_f2[0]) / @minmax_f2[1].to_f
103
-
104
- f1_distance = (scaled_phoneme1_f1 - scaled_phoneme2_f1).abs
105
- f2_distance = (scaled_phoneme1_f2 - scaled_phoneme2_f2).abs
106
-
107
- # When we have four values we can use the pythagorean theorem on them
108
- # (order doesn't matter)
109
- Math.sqrt((f1_distance**2) + (f2_distance**2))
110
- end
111
- end
112
-
113
- module Consonants
114
- extend self
115
-
116
- # This chart (columns 2 through the end, anyway) is a direct port of
117
- # https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
118
- # We store the consonant table in this format to make updating it easier.
119
- #
120
- # rubocop:disable Layout/TrailingWhitespace
121
- ChartData = %( | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
122
- Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
123
- Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
124
- Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
125
- Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
126
- Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
127
- Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
128
- Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
129
- Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
130
- Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
131
- Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
132
- )
133
- # rubocop:enable Layout/TrailingWhitespace
134
-
135
- # rubocop:disable Metrics/CyclomaticComplexity
136
- # rubocop:disable Metrics/PerceivedComplexity
137
- # Parse the ChartData into a lookup table where we can retrieve attributes
138
- # for each phoneme
139
- def features
140
- @features ||= begin
141
- header, *manners = ChartData.lines
142
-
143
- _, *positions = header.chomp.split(' | ')
144
- positions.map(&:strip!)
145
-
146
- # Remove any trailing blank lines
147
- manners.pop while manners.last.to_s.strip.empty?
148
-
149
- position_indexes = Hash[*positions.each_with_index.to_a.flatten]
150
-
151
- @position_count = positions.size
152
-
153
- manners.each_with_object({}) do |row, phonemes|
154
- manner, *columns = row.chomp.split(' | ')
155
- manner.strip!
156
- positions.zip(columns).each do |position, phoneme_text|
157
- data = {
158
- position: position,
159
- position_index: position_indexes[position],
160
- manner: manner,
161
- }
162
- # If there is a character in the first byte then this articulation
163
- # has a voiceless phoneme. The symbol may use additional characters
164
- # as part of the phoneme symbol.
165
- unless phoneme_text[0] == ' '
166
- # Take the first non-blank character string
167
- symbol = phoneme_text.chars.take_while { |char| char != ' ' }.join
168
- phoneme_text = phoneme_text[symbol.chars.size..]
169
-
170
- phonemes[symbol] = data.merge(voiced: false)
171
- end
172
- # If there's a character anywhere left in the string then this
173
- # articulation has a voiced phoneme
174
- unless phoneme_text.strip.empty?
175
- symbol = phoneme_text.strip
176
- phonemes[symbol] = data.merge(voiced: true)
177
- end
178
- end
179
- end
180
- end
181
- end
182
- # rubocop:enable Metrics/CyclomaticComplexity
183
- # rubocop:enable Metrics/PerceivedComplexity
184
-
185
- def phonemes
186
- @phonemes ||= features.keys
187
- end
188
-
189
- # Given two consonants, calculate their difference by summing the
190
- # following:
191
- # * 0.3 if they are not voiced the same
192
- # * 0.3 if they are different manners
193
- # * Up to 0.4 if they are the maximum position difference
194
- def distance(phoneme1, phoneme2)
195
- features1 = features[phoneme1]
196
- features2 = features[phoneme2]
197
-
198
- penalty = 0
199
- penalty += 0.3 if features1[:voiced] != features2[:voiced]
200
-
201
- penalty += 0.3 if features1[:manner] != features2[:manner]
202
-
203
- # Use up to the remaining 0.4 for penalizing differences in position
204
- penalty += 0.4 * ((features1[:position_index] - features2[:position_index]).abs / @position_count.to_f)
205
- penalty
206
- end
207
- end
208
-
209
- def phonemes
210
- Vowels.phonemes + Consonants.phonemes
211
- end
212
-
213
- Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
214
- Vowels.phonemes.reduce({}) { |acc, p| acc.update p => :vowel }
215
- )
216
-
217
- def distance(phoneme1, phoneme2)
218
- return 0 if phoneme1 == phoneme2
219
-
220
- distance_map.fetch(phoneme1).fetch(phoneme2)
221
- end
222
-
223
- def distance_map
224
- @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
225
- p1, p2 = *pair
226
- score = _distance(p1, p2)
227
- scores[p1][p2] = score
228
- scores[p2][p1] = score
229
- end
230
- end
231
-
232
- private
233
-
234
- def _distance(phoneme1, phoneme2)
235
- types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
236
- case types
237
- when %i[consonant vowel]
238
- 1.0
239
- when %i[vowel vowel]
240
- Vowels.distance(phoneme1, phoneme2)
241
- when %i[consonant consonant]
242
- Consonants.distance(phoneme1, phoneme2)
243
- end
244
- end
245
- end
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
- require_relative 'c_levenshtein'
5
-
6
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
7
- # count used instead of a binary edit distance calculation
8
- #
9
- # This implementation was dually inspired by the damerau-levenshtein gem
10
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
11
- # and "Using Phonologically Weighted Levenshtein Distances for the Prediction
12
- # of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
13
- # Farinas, Julien Pinquier, Xavier Aumont, 2016
14
- # https://hal.archives-ouvertes.fr/hal-01474904/document
15
- module Phonetics
16
- module Levenshtein
17
- extend ::PhoneticsLevenshteinCBinding
18
-
19
- # rubocop:disable Style/OptionalBooleanParameter
20
- def self.distance(str1, str2, verbose = false)
21
- return if str1.nil? || str2.nil?
22
-
23
- internal_phonetic_distance(str1, str2, verbose)
24
- end
25
- # rubocop:enable Style/OptionalBooleanParameter
26
- end
27
- end
@@ -1,162 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative '../phonetics'
4
-
5
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
6
- # count used instead of a binary edit distance calculation
7
- #
8
- # This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
9
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
10
- # The implementation is modified based on "Using Phonologically Weighted
11
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
12
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
13
- # Aumont, 2016
14
- # https://hal.archives-ouvertes.fr/hal-01474904/document
15
- module Phonetics
16
- class RubyLevenshtein
17
- attr_reader :str1, :str2, :len1, :len2, :matrix
18
-
19
- # rubocop:disable Style/OptionalBooleanParameter
20
- def initialize(ipa_str1, ipa_str2, verbose = false)
21
- @str1 = ipa_str1.each_char.select { |c| Phonetics.phonemes.include?(c) }.join
22
- @str2 = ipa_str2.each_char.select { |c| Phonetics.phonemes.include?(c) }.join
23
- @len1 = @str1.size
24
- @len2 = @str2.size
25
- @verbose = verbose
26
- prepare_matrix
27
- set_edit_distances(@str1, @str2)
28
- end
29
-
30
- def self.distance(str1, str2, verbose = false)
31
- new(str1, str2, verbose).distance
32
- end
33
- # rubocop:enable Style/OptionalBooleanParameter
34
-
35
- def distance
36
- return 0 if walk.empty?
37
-
38
- print_matrix if @verbose
39
- walk.last[:distance]
40
- end
41
-
42
- private
43
-
44
- def walk
45
- res = []
46
- i = len2
47
- j = len1
48
- return res if i == 0 && j == 0
49
-
50
- loop do
51
- i, j, char = char_data(i, j)
52
- res.unshift char
53
- break if i == 0 || j == 0
54
- end
55
- res
56
- end
57
-
58
- def set_edit_distances(str1, str2)
59
- i = 0
60
- while (i += 1) <= len2
61
- j = 0
62
- while (j += 1) <= len1
63
- options = [
64
- ins(i, j),
65
- del(i, j),
66
- subst(i, j)
67
- ]
68
- # This is where we implement the modifications to Damerau-Levenshtein
69
- # according to https://hal.archives-ouvertes.fr/hal-01474904/document
70
- phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
71
- matrix[i][j] = options.min + phonetic_cost
72
- puts "------- #{j}/#{i} #{j + (i * (len1 + 1))}" if @verbose
73
- print_matrix if @verbose
74
- end
75
- end
76
- end
77
-
78
- def char_data(i, j)
79
- char = { distance: matrix[i][j] }
80
- operation, move = find_previous(i, j)
81
- previous_value = move[:value]
82
- char[:type] = previous_value == char[:distance] ? :same : operation
83
- i, j = move[:move_to]
84
- [i, j, char]
85
- end
86
-
87
- def find_previous(i, j)
88
- [
89
- [:insert, { cost: ins(i, j), move_to: [i, j - 1] }],
90
- [:delete, { cost: del(i, j), move_to: [i, j - 1] }],
91
- [:substitute, { cost: subst(i, j), move_to: [i, j - 1] }]
92
- ].select do |_operation, data|
93
- # Don't send us out of bounds
94
- data[:move_to][0] >= 0 && data[:move_to][1] >= 0
95
- end.min_by do |_operation, data|
96
- # pick the cheapest one
97
- data[:value]
98
- end
99
- end
100
-
101
- # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
102
- def del(i, j)
103
- matrix[i - 1][j]
104
- end
105
-
106
- def ins(i, j)
107
- matrix[i][j - 1]
108
- end
109
-
110
- def subst(i, j)
111
- matrix[i - 1][j - 1]
112
- end
113
-
114
- # Set the minimum scores equal to the distance between each phoneme,
115
- # sequentially.
116
- #
117
- # The first value is always zero, the second is always 1.
118
- # Subsequent values are the cumulative phonetic distance between each
119
- # phoneme within the same string.
120
- # "aek" -> [0, 1, 1.61, 2.61]
121
- def initial_distances(str1, str2)
122
- starting_distance = 1
123
- starting_distance = 0 if len1 == 0 || len2 == 0
124
-
125
- distances1 = (1..(str1.length - 1)).reduce([0, starting_distance]) do |acc, i|
126
- acc << acc.last + Phonetics.distance(str1[i - 1], str1[i])
127
- end
128
- distances2 = (1..(str2.length - 1)).reduce([0, starting_distance]) do |acc, i|
129
- acc << acc.last + Phonetics.distance(str2[i - 1], str2[i])
130
- end
131
-
132
- [distances1, distances2]
133
- end
134
-
135
- def prepare_matrix
136
- str1_initial, str2_initial = initial_distances(str1, str2)
137
-
138
- @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) { nil } }
139
- # The first row is the initial values for str1
140
- @matrix[0] = str1_initial
141
- # The first column is the initial values for str2
142
- (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
143
- end
144
-
145
- # This is a helper method for developers to use when exploring this
146
- # algorithm.
147
- def print_matrix
148
- puts " #{str1.chars.map { |c| c.ljust(9, ' ') }.join}"
149
- matrix.each_with_index do |row, ridx|
150
- print ' ' if ridx == 0
151
- print "#{str2[ridx - 1]} " if ridx > 0
152
- row.each_with_index do |cell, _cidx|
153
- cell ||= 0.0
154
- print cell.to_s[0, 8].ljust(8, '0')
155
- print ' '
156
- end
157
- puts ''
158
- end
159
- ''
160
- end
161
- end
162
- end