phonetics 1.1.1 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ float phonetic_cost(long, long);
@@ -223,14 +223,8 @@ module Phonetics
223
223
  )
224
224
 
225
225
  def distance(phoneme1, phoneme2)
226
- types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
227
- if types == [:consonant, :vowel]
228
- 1.0
229
- elsif types == [:vowel, :vowel]
230
- Vowels.distance(phoneme1, phoneme2)
231
- elsif types == [:consonant, :consonant]
232
- Consonants.distance(phoneme1, phoneme2)
233
- end
226
+ return 0 if phoneme1 == phoneme2
227
+ distance_map.fetch(phoneme1).fetch(phoneme2)
234
228
  end
235
229
 
236
230
  def distance_map
@@ -238,9 +232,92 @@ module Phonetics
238
232
  Vowels.phonemes + Consonants.phonemes
239
233
  ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} } ) do |pair, scores|
240
234
  p1, p2 = *pair
241
- score = distance(p1, p2)
235
+ score = _distance(p1, p2)
242
236
  scores[p1][p2] = score
243
237
  scores[p2][p1] = score
244
238
  end
245
239
  end
240
+
241
+ # as_utf_8_long("aɰ̊ h")
242
+ # => [97, 8404, 32, 104]
243
+ def as_utf_8_long(string)
244
+ string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
245
+ end
246
+
247
+ # Encode individual multi-byte strings as a single integer.
248
+ #
249
+ # "ɰ̊".unpack('U*')
250
+ # => [624, 778]
251
+ #
252
+ # grapheme_as_utf_8_long("ɰ̊")
253
+ # => 8404 (624 + (10 * 778))
254
+ def grapheme_as_utf_8_long(grapheme)
255
+ grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
256
+ total += (10**i) * byte
257
+ end
258
+ end
259
+
260
+ # This will print a C code file with a function that implements a two-level C
261
+ # switch like the following:
262
+ #
263
+ # switch (a) {
264
+ # case 100: // 'd'
265
+ # switch (b) {
266
+ # case 618: // 'ɪ'
267
+ # return (float) 0.73827;
268
+ # break;
269
+ # }
270
+ # }
271
+ #
272
+ def generate_phonetic_cost_c_code(writer = STDOUT)
273
+ # First, flatten the bytes of the runes (unicode codepoints encoded via
274
+ # UTF-8) into single integers. We do this by adding the utf-8 values, each
275
+ # multiplied by 10 to the power of their index. The specific encoding doesn't
276
+ # matter so long as it's:
277
+ # * consistent
278
+ # * has no collisions
279
+ # * produces a value that's a valid C case conditional
280
+ # * can be applied to runes of input strings later
281
+ integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
282
+ acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
283
+ acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
284
+ end)
285
+ end
286
+
287
+ # Then we print out C code full of switches
288
+
289
+ writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
290
+ float phonetic_cost(int a, int b) {
291
+ // This is compiled from Ruby, using `String#unpack("U")` on each character
292
+ // to retrieve the UTF-8 codepoint as a C long value.
293
+ if (a == b) { return 0.0; };
294
+ FUNC
295
+ writer.puts ' switch (a) {'
296
+ integer_distance_map.each do |(a, a_i), distances|
297
+ writer.puts " case #{a_i}: // #{a}"
298
+ writer.puts ' switch (b) {'
299
+ distances.each do |(b, b_i), distance|
300
+ writer.puts " case #{b_i}: // #{a}->#{b}"
301
+ writer.puts " return (float) #{distance};"
302
+ writer.puts " break;"
303
+ end
304
+ writer.puts ' }'
305
+ end
306
+ writer.puts ' }'
307
+ writer.puts ' return 1.0;'
308
+ writer.puts '}'
309
+ end
310
+
311
+ private
312
+
313
+ def _distance(phoneme1, phoneme2)
314
+ types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
315
+ if types == [:consonant, :vowel]
316
+ 1.0
317
+ elsif types == [:vowel, :vowel]
318
+ Vowels.distance(phoneme1, phoneme2)
319
+ elsif types == [:consonant, :consonant]
320
+ Consonants.distance(phoneme1, phoneme2)
321
+ end
322
+ end
246
323
  end
@@ -1,5 +1,4 @@
1
- require_relative '../phonetics'
2
-
1
+ require_relative 'c_levenshtein'
3
2
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
3
  # count used instead of a binary edit distance calculation
5
4
  #
@@ -11,110 +10,26 @@ require_relative '../phonetics'
11
10
  # Aumont, 2016
12
11
  # https://hal.archives-ouvertes.fr/hal-01474904/document
13
12
  module Phonetics
14
- class Levenshtein
15
- def initialize(ipa_str1, ipa_str2)
16
- @str1 = ipa_str1
17
- @str2 = ipa_str2
18
- @len1 = ipa_str1.size
19
- @len2 = ipa_str2.size
20
- prepare_matrix
21
- set_edit_distances(ipa_str1, ipa_str2)
22
- end
23
-
24
- def distance
25
- return 0 if walk.empty?
26
- walk.last[:distance]
27
- end
13
+ module Levenshtein
14
+ extend ::PhoneticsLevenshteinCBinding
28
15
 
29
16
  def self.distance(str1, str2)
30
- new(str1, str2).distance
31
- end
32
-
33
- private
34
-
35
- def walk
36
- res = []
37
- cell = [@len2, @len1]
38
- while cell != [0, 0]
39
- cell, char = char_data(cell)
40
- res.unshift char
41
- end
42
- res
43
- end
44
-
45
- def set_edit_distances(str1, str2)
46
- (1..@len2).each do |i|
47
- (1..@len1).each do |j|
48
- no_change(i, j) && next if str2[i - 1] == str1[j - 1]
49
- @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
17
+ ensure_is_phonetic!(str1, str2)
18
+ internal_phonetic_distance(
19
+ Phonetics.as_utf_8_long(str1),
20
+ Phonetics.as_utf_8_long(str2),
21
+ )
22
+ end
23
+
24
+ def self.ensure_is_phonetic!(str1, str2)
25
+ [str1, str2].each do |string|
26
+ string.chars.each do |char|
27
+ unless Phonetics.phonemes.include?(char)
28
+ raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
29
+ end
50
30
  end
51
31
  end
52
32
  end
53
33
 
54
- def char_data(cell)
55
- char = { distance: @matrix[cell[0]][cell[1]] }
56
- val = find_previous(cell)
57
- previous_value = val[0][0]
58
- char[:type] = previous_value == char[:distance] ? :same : val[1]
59
- cell = val.pop
60
- [cell, char]
61
- end
62
-
63
- def find_previous(cell)
64
- candidates = [
65
- [
66
- [ins(*cell), 1],
67
- :ins,
68
- [cell[0], cell[1] - 1],
69
- ],
70
- [
71
- [del(*cell), 2],
72
- :del,
73
- [cell[0] - 1, cell[1]],
74
- ],
75
- [
76
- [subst(*cell), 0],
77
- :subst,
78
- [cell[0] - 1, cell[1] - 1],
79
- ],
80
- ]
81
- select_cell(candidates)
82
- end
83
-
84
- def select_cell(candidates)
85
- candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
86
- sort_by(&:first).first
87
- end
88
-
89
- # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
90
- def del(i, j)
91
- @matrix[i - 1][j]
92
- end
93
-
94
- def ins(i, j)
95
- @matrix[i][j - 1]
96
- end
97
-
98
- # This is where we implement the modifications to Damerau-Levenshtein according to
99
- # https://hal.archives-ouvertes.fr/hal-01474904/document
100
- def subst(i, j)
101
- map = Phonetics.distance_map[@str1[j]]
102
- score = map[@str2[i]] if map
103
- score ||= 1.0
104
- @matrix[i - 1][j - 1] + score
105
- end
106
-
107
- def no_change(i, j)
108
- @matrix[i][j] = @matrix[i - 1][j - 1]
109
- end
110
-
111
- def prepare_matrix
112
- @matrix = []
113
- @matrix << (0..@len1).to_a
114
- @len2.times do |i|
115
- ary = [i + 1] + (1..@len1).map { nil }
116
- @matrix << ary
117
- end
118
- end
119
34
  end
120
35
  end
@@ -0,0 +1,171 @@
1
+ require_relative '../phonetics'
2
+
3
+ # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
+ # count used instead of a binary edit distance calculation
5
+ #
6
+ # This implementation is almost entirely taken from the damerau-levenshtein gem
7
+ # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
+ # The implementation is modified based on "Using Phonologically Weighted
9
+ # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
+ # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
+ # Aumont, 2016
12
+ # https://hal.archives-ouvertes.fr/hal-01474904/document
13
+ module Phonetics
14
+ class RubyLevenshtein
15
+
16
+ attr_reader :str1, :str2, :len1, :len2, :matrix
17
+
18
+ def initialize(ipa_str1, ipa_str2, verbose = false)
19
+ @str1 = ipa_str1
20
+ @str2 = ipa_str2
21
+ @len1 = ipa_str1.size
22
+ @len2 = ipa_str2.size
23
+ @verbose = verbose
24
+ ensure_is_phonetic!
25
+ prepare_matrix
26
+ set_edit_distances(ipa_str1, ipa_str2)
27
+ end
28
+
29
+ def distance
30
+ return 0 if walk.empty?
31
+ print_matrix if @verbose
32
+ walk.last[:distance]
33
+ end
34
+
35
+ def self.distance(str1, str2)
36
+ new(str1, str2).distance
37
+ end
38
+
39
+ private
40
+
41
+ def ensure_is_phonetic!
42
+ [str1, str2].each do |string|
43
+ string.chars.each do |char|
44
+ unless Phonetics.phonemes.include?(char)
45
+ raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ def walk
52
+ res = []
53
+ i, j = len2, len1
54
+ return res if i == 0 && j == 0
55
+ begin
56
+ i, j, char = char_data(i, j)
57
+ res.unshift char
58
+ end while i > 0 && j > 0
59
+ res
60
+ end
61
+
62
+ def set_edit_distances(str1, str2)
63
+ i = 0
64
+ while (i += 1) <= len2
65
+ j = 0
66
+ while (j += 1) <= len1
67
+ options = [
68
+ ins(i, j),
69
+ del(i, j),
70
+ subst(i, j),
71
+ ]
72
+ # This is where we implement the modifications to Damerau-Levenshtein
73
+ # according to https://hal.archives-ouvertes.fr/hal-01474904/document
74
+ phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
75
+ matrix[i][j] = options.min + phonetic_cost
76
+ puts "------- #{j}/#{i} #{j + (i*(len1+1))}" if @verbose
77
+ print_matrix if @verbose
78
+ end
79
+ end
80
+ end
81
+
82
+ def char_data(i, j)
83
+ char = { distance: matrix[i][j] }
84
+ operation, move = find_previous(i, j)
85
+ previous_value = move[:value]
86
+ char[:type] = previous_value == char[:distance] ? :same : operation
87
+ i, j = move[:move_to]
88
+ [i, j, char]
89
+ end
90
+
91
+ def find_previous(i, j)
92
+ [
93
+ [ :insert, { cost: ins(i, j), move_to: [i, j - 1] }],
94
+ [ :delete, { cost: del(i, j), move_to: [i, j - 1] }],
95
+ [ :substitute, { cost: subst(i, j), move_to: [i, j - 1] }],
96
+ ].select do |operation, data|
97
+ # Don't send us out of bounds
98
+ data[:move_to][0] >= 0 && data[:move_to][1] >= 0
99
+ end.sort_by do |operation, data|
100
+ # pick the cheapest one
101
+ data[:value]
102
+ end.first
103
+ end
104
+
105
+ # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
106
+ def del(i, j)
107
+ matrix[i - 1][j]
108
+ end
109
+
110
+ def ins(i, j)
111
+ matrix[i][j - 1]
112
+ end
113
+
114
+ def subst(i, j)
115
+ matrix[i - 1][j - 1]
116
+ end
117
+
118
+ # Set the minimum scores equal to the distance between each phoneme,
119
+ # sequentially.
120
+ #
121
+ # The first value is always zero.
122
+ # The second value is always the phonetic distance between the first
123
+ # phonemes of each string.
124
+ # Subsequent values are the cumulative phonetic distance between each
125
+ # phoneme within the same string.
126
+ # "aek" -> [0, 1, 1.61, 2.61]
127
+ def initial_distances(str1, str2)
128
+ if len1 == 0 || len2 == 0
129
+ starting_distance = 0
130
+ else
131
+ starting_distance = Phonetics.distance(str1[0], str2[0])
132
+ end
133
+
134
+ distances1 = (1..(str1.length-1)).reduce([0, starting_distance]) do |acc, i|
135
+ acc << acc.last + Phonetics.distance(str1[i-1], str1[i])
136
+ end
137
+ distances2 = (1..(str2.length-1)).reduce([0, starting_distance]) do |acc, i|
138
+ acc << acc.last + Phonetics.distance(str2[i-1], str2[i])
139
+ end
140
+
141
+ [ distances1, distances2 ]
142
+ end
143
+
144
+ def prepare_matrix
145
+ str1_initial, str2_initial = initial_distances(str1, str2)
146
+
147
+ @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) { nil } }
148
+ # The first row is the initial values for str1
149
+ @matrix[0] = str1_initial
150
+ # The first column is the initial values for str2
151
+ (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
152
+ end
153
+
154
+ # This is a helper method for developers to use when exploring this
155
+ # algorithm.
156
+ def print_matrix
157
+ puts " #{str1.chars.map {|c| c.ljust(9, " ") }.join}"
158
+ matrix.each_with_index do |row, ridx|
159
+ print ' ' if ridx == 0
160
+ print "#{str2[ridx - 1]} " if ridx > 0
161
+ row.each_with_index do |cell, cidx|
162
+ cell ||= 0.0
163
+ print cell.to_s[0, 8].ljust(8, '0')
164
+ print ' '
165
+ end
166
+ puts ''
167
+ end
168
+ ''
169
+ end
170
+ end
171
+ end
@@ -16,8 +16,11 @@ Gem::Specification.new do |spec|
16
16
  end
17
17
  spec.require_paths = ["lib"]
18
18
 
19
+ spec.add_development_dependency "pry-byebug"
20
+ spec.add_development_dependency "rake-compiler", "~> 1.0"
21
+ spec.add_development_dependency "rubocop", "~> 0.52"
22
+ spec.add_development_dependency "ruby-prof", "~> 0.17"
19
23
  spec.add_development_dependency 'bundler', '~> 1.16'
20
24
  spec.add_development_dependency 'rake'
21
- spec.add_development_dependency "pry-byebug"
22
25
  spec.add_development_dependency 'rspec', '~> 3.0'
23
26
  end
metadata CHANGED
@@ -1,45 +1,87 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-04-13 00:00:00.000000000 Z
11
+ date: 2019-08-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: pry-byebug
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - "~>"
18
32
  - !ruby/object:Gem::Version
19
- version: '1.16'
33
+ version: '1.0'
20
34
  type: :development
21
35
  prerelease: false
22
36
  version_requirements: !ruby/object:Gem::Requirement
23
37
  requirements:
24
38
  - - "~>"
25
39
  - !ruby/object:Gem::Version
26
- version: '1.16'
40
+ version: '1.0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: rake
42
+ name: rubocop
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
- - - ">="
45
+ - - "~>"
32
46
  - !ruby/object:Gem::Version
33
- version: '0'
47
+ version: '0.52'
34
48
  type: :development
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
- - - ">="
52
+ - - "~>"
39
53
  - !ruby/object:Gem::Version
40
- version: '0'
54
+ version: '0.52'
41
55
  - !ruby/object:Gem::Dependency
42
- name: pry-byebug
56
+ name: ruby-prof
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.17'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.17'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.16'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.16'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
43
85
  requirement: !ruby/object:Gem::Requirement
44
86
  requirements:
45
87
  - - ">="
@@ -73,6 +115,7 @@ executables: []
73
115
  extensions: []
74
116
  extra_rdoc_files: []
75
117
  files:
118
+ - ".github/workflows/gempush.yml"
76
119
  - ".gitignore"
77
120
  - ".rspec"
78
121
  - ".travis.yml"
@@ -83,8 +126,14 @@ files:
83
126
  - README.md
84
127
  - Rakefile
85
128
  - VERSION
129
+ - ext/c_levenshtein/extconf.rb
130
+ - ext/c_levenshtein/levenshtein.c
131
+ - ext/c_levenshtein/phonetic_cost.c
132
+ - ext/c_levenshtein/phonetic_cost.h
86
133
  - lib/phonetics.rb
134
+ - lib/phonetics/c_levenshtein.bundle
87
135
  - lib/phonetics/levenshtein.rb
136
+ - lib/phonetics/ruby_levenshtein.rb
88
137
  - lib/phonetics/version.rb
89
138
  - phonetics.gemspec
90
139
  homepage: https://github.com/JackDanger/phonetics
@@ -106,8 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
155
  - !ruby/object:Gem::Version
107
156
  version: '0'
108
157
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.7.6
158
+ rubygems_version: 3.0.3
111
159
  signing_key:
112
160
  specification_version: 4
113
161
  summary: tools for linguistic code using the International Phonetic Alphabet