phonetics 1.1.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ float phonetic_cost(long, long);
@@ -223,14 +223,8 @@ module Phonetics
223
223
  )
224
224
 
225
225
  def distance(phoneme1, phoneme2)
226
- types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
227
- if types == [:consonant, :vowel]
228
- 1.0
229
- elsif types == [:vowel, :vowel]
230
- Vowels.distance(phoneme1, phoneme2)
231
- elsif types == [:consonant, :consonant]
232
- Consonants.distance(phoneme1, phoneme2)
233
- end
226
+ return 0 if phoneme1 == phoneme2
227
+ distance_map.fetch(phoneme1).fetch(phoneme2)
234
228
  end
235
229
 
236
230
  def distance_map
@@ -238,9 +232,92 @@ module Phonetics
238
232
  Vowels.phonemes + Consonants.phonemes
239
233
  ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} } ) do |pair, scores|
240
234
  p1, p2 = *pair
241
- score = distance(p1, p2)
235
+ score = _distance(p1, p2)
242
236
  scores[p1][p2] = score
243
237
  scores[p2][p1] = score
244
238
  end
245
239
  end
240
+
241
+ # as_utf_8_long("aɰ̊ h")
242
+ # => [97, 8404, 32, 104]
243
+ def as_utf_8_long(string)
244
+ string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
245
+ end
246
+
247
+ # Encode individual multi-byte strings as a single integer.
248
+ #
249
+ # "ɰ̊".unpack('U*')
250
+ # => [624, 778]
251
+ #
252
+ # grapheme_as_utf_8_long("ɰ̊")
253
+ # => 8404 (624 + (10 * 778))
254
+ def grapheme_as_utf_8_long(grapheme)
255
+ grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
256
+ total += (10**i) * byte
257
+ end
258
+ end
259
+
260
+ # This will print a C code file with a function that implements a two-level C
261
+ # switch like the following:
262
+ #
263
+ # switch (a) {
264
+ # case 100: // 'd'
265
+ # switch (b) {
266
+ # case 618: // 'ɪ'
267
+ # return (float) 0.73827;
268
+ # break;
269
+ # }
270
+ # }
271
+ #
272
+ def generate_phonetic_cost_c_code(writer = STDOUT)
273
+ # First, flatten the bytes of the runes (unicode codepoints encoded via
274
+ # UTF-8) into single integers. We do this by adding the utf-8 values, each
275
+ # multiplied by 10 raised to the power of their index. The specific encoding doesn't
276
+ # matter so long as it's:
277
+ # * consistent
278
+ # * has no collisions
279
+ # * produces a value that's a valid C case conditional
280
+ # * can be applied to runes of input strings later
281
+ integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
282
+ acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
283
+ acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
284
+ end)
285
+ end
286
+
287
+ # Then we print out C code full of switches
288
+
289
+ writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
290
+ float phonetic_cost(int a, int b) {
291
+ // This is compiled from Ruby, using `String#unpack("U")` on each character
292
+ // to retrieve the UTF-8 codepoint as a C long value.
293
+ if (a == b) { return 0.0; };
294
+ FUNC
295
+ writer.puts ' switch (a) {'
296
+ integer_distance_map.each do |(a, a_i), distances|
297
+ writer.puts " case #{a_i}: // #{a}"
298
+ writer.puts ' switch (b) {'
299
+ distances.each do |(b, b_i), distance|
300
+ writer.puts " case #{b_i}: // #{a}->#{b}"
301
+ writer.puts " return (float) #{distance};"
302
+ writer.puts " break;"
303
+ end
304
+ writer.puts ' }'
305
+ end
306
+ writer.puts ' }'
307
+ writer.puts ' return 1.0;'
308
+ writer.puts '}'
309
+ end
310
+
311
+ private
312
+
313
+ def _distance(phoneme1, phoneme2)
314
+ types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
315
+ if types == [:consonant, :vowel]
316
+ 1.0
317
+ elsif types == [:vowel, :vowel]
318
+ Vowels.distance(phoneme1, phoneme2)
319
+ elsif types == [:consonant, :consonant]
320
+ Consonants.distance(phoneme1, phoneme2)
321
+ end
322
+ end
246
323
  end
@@ -1,5 +1,4 @@
1
- require_relative '../phonetics'
2
-
1
+ require_relative 'c_levenshtein'
3
2
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
3
  # count used instead of a binary edit distance calculation
5
4
  #
@@ -11,110 +10,26 @@ require_relative '../phonetics'
11
10
  # Aumont, 2016
12
11
  # https://hal.archives-ouvertes.fr/hal-01474904/document
13
12
  module Phonetics
14
- class Levenshtein
15
- def initialize(ipa_str1, ipa_str2)
16
- @str1 = ipa_str1
17
- @str2 = ipa_str2
18
- @len1 = ipa_str1.size
19
- @len2 = ipa_str2.size
20
- prepare_matrix
21
- set_edit_distances(ipa_str1, ipa_str2)
22
- end
23
-
24
- def distance
25
- return 0 if walk.empty?
26
- walk.last[:distance]
27
- end
13
+ module Levenshtein
14
+ extend ::PhoneticsLevenshteinCBinding
28
15
 
29
16
  def self.distance(str1, str2)
30
- new(str1, str2).distance
31
- end
32
-
33
- private
34
-
35
- def walk
36
- res = []
37
- cell = [@len2, @len1]
38
- while cell != [0, 0]
39
- cell, char = char_data(cell)
40
- res.unshift char
41
- end
42
- res
43
- end
44
-
45
- def set_edit_distances(str1, str2)
46
- (1..@len2).each do |i|
47
- (1..@len1).each do |j|
48
- no_change(i, j) && next if str2[i - 1] == str1[j - 1]
49
- @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
17
+ ensure_is_phonetic!(str1, str2)
18
+ internal_phonetic_distance(
19
+ Phonetics.as_utf_8_long(str1),
20
+ Phonetics.as_utf_8_long(str2),
21
+ )
22
+ end
23
+
24
+ def self.ensure_is_phonetic!(str1, str2)
25
+ [str1, str2].each do |string|
26
+ string.chars.each do |char|
27
+ unless Phonetics.phonemes.include?(char)
28
+ raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
29
+ end
50
30
  end
51
31
  end
52
32
  end
53
33
 
54
- def char_data(cell)
55
- char = { distance: @matrix[cell[0]][cell[1]] }
56
- val = find_previous(cell)
57
- previous_value = val[0][0]
58
- char[:type] = previous_value == char[:distance] ? :same : val[1]
59
- cell = val.pop
60
- [cell, char]
61
- end
62
-
63
- def find_previous(cell)
64
- candidates = [
65
- [
66
- [ins(*cell), 1],
67
- :ins,
68
- [cell[0], cell[1] - 1],
69
- ],
70
- [
71
- [del(*cell), 2],
72
- :del,
73
- [cell[0] - 1, cell[1]],
74
- ],
75
- [
76
- [subst(*cell), 0],
77
- :subst,
78
- [cell[0] - 1, cell[1] - 1],
79
- ],
80
- ]
81
- select_cell(candidates)
82
- end
83
-
84
- def select_cell(candidates)
85
- candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
86
- sort_by(&:first).first
87
- end
88
-
89
- # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
90
- def del(i, j)
91
- @matrix[i - 1][j]
92
- end
93
-
94
- def ins(i, j)
95
- @matrix[i][j - 1]
96
- end
97
-
98
- # This is where we implement the modifications to Damerau-Levenshtein according to
99
- # https://hal.archives-ouvertes.fr/hal-01474904/document
100
- def subst(i, j)
101
- map = Phonetics.distance_map[@str1[j]]
102
- score = map[@str2[i]] if map
103
- score ||= 1.0
104
- @matrix[i - 1][j - 1] + score
105
- end
106
-
107
- def no_change(i, j)
108
- @matrix[i][j] = @matrix[i - 1][j - 1]
109
- end
110
-
111
- def prepare_matrix
112
- @matrix = []
113
- @matrix << (0..@len1).to_a
114
- @len2.times do |i|
115
- ary = [i + 1] + (1..@len1).map { nil }
116
- @matrix << ary
117
- end
118
- end
119
34
  end
120
35
  end
@@ -0,0 +1,171 @@
1
+ require_relative '../phonetics'
2
+
3
+ # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
+ # count used instead of a binary edit distance calculation
5
+ #
6
+ # This implementation is almost entirely taken from the damerau-levenshtein gem
7
+ # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
+ # The implementation is modified based on "Using Phonologically Weighted
9
+ # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
+ # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
+ # Aumont, 2016
12
+ # https://hal.archives-ouvertes.fr/hal-01474904/document
13
+ module Phonetics
14
+ class RubyLevenshtein
15
+
16
+ attr_reader :str1, :str2, :len1, :len2, :matrix
17
+
18
+ def initialize(ipa_str1, ipa_str2, verbose = false)
19
+ @str1 = ipa_str1
20
+ @str2 = ipa_str2
21
+ @len1 = ipa_str1.size
22
+ @len2 = ipa_str2.size
23
+ @verbose = verbose
24
+ ensure_is_phonetic!
25
+ prepare_matrix
26
+ set_edit_distances(ipa_str1, ipa_str2)
27
+ end
28
+
29
+ def distance
30
+ return 0 if walk.empty?
31
+ print_matrix if @verbose
32
+ walk.last[:distance]
33
+ end
34
+
35
+ def self.distance(str1, str2)
36
+ new(str1, str2).distance
37
+ end
38
+
39
+ private
40
+
41
+ def ensure_is_phonetic!
42
+ [str1, str2].each do |string|
43
+ string.chars.each do |char|
44
+ unless Phonetics.phonemes.include?(char)
45
+ raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ def walk
52
+ res = []
53
+ i, j = len2, len1
54
+ return res if i == 0 && j == 0
55
+ begin
56
+ i, j, char = char_data(i, j)
57
+ res.unshift char
58
+ end while i > 0 && j > 0
59
+ res
60
+ end
61
+
62
+ def set_edit_distances(str1, str2)
63
+ i = 0
64
+ while (i += 1) <= len2
65
+ j = 0
66
+ while (j += 1) <= len1
67
+ options = [
68
+ ins(i, j),
69
+ del(i, j),
70
+ subst(i, j),
71
+ ]
72
+ # This is where we implement the modifications to Damerau-Levenshtein
73
+ # according to https://hal.archives-ouvertes.fr/hal-01474904/document
74
+ phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
75
+ matrix[i][j] = options.min + phonetic_cost
76
+ puts "------- #{j}/#{i} #{j + (i*(len1+1))}" if @verbose
77
+ print_matrix if @verbose
78
+ end
79
+ end
80
+ end
81
+
82
+ def char_data(i, j)
83
+ char = { distance: matrix[i][j] }
84
+ operation, move = find_previous(i, j)
85
+ previous_value = move[:value]
86
+ char[:type] = previous_value == char[:distance] ? :same : operation
87
+ i, j = move[:move_to]
88
+ [i, j, char]
89
+ end
90
+
91
+ def find_previous(i, j)
92
+ [
93
+ [ :insert, { cost: ins(i, j), move_to: [i, j - 1] }],
94
+ [ :delete, { cost: del(i, j), move_to: [i, j - 1] }],
95
+ [ :substitute, { cost: subst(i, j), move_to: [i, j - 1] }],
96
+ ].select do |operation, data|
97
+ # Don't send us out of bounds
98
+ data[:move_to][0] >= 0 && data[:move_to][1] >= 0
99
+ end.sort_by do |operation, data|
100
+ # pick the cheapest one
101
+ data[:value]
102
+ end.first
103
+ end
104
+
105
+ # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
106
+ def del(i, j)
107
+ matrix[i - 1][j]
108
+ end
109
+
110
+ def ins(i, j)
111
+ matrix[i][j - 1]
112
+ end
113
+
114
+ def subst(i, j)
115
+ matrix[i - 1][j - 1]
116
+ end
117
+
118
+ # Set the minimum scores equal to the distance between each phoneme,
119
+ # sequentially.
120
+ #
121
+ # The first value is always zero.
122
+ # The second value is always the phonetic distance between the first
123
+ # phonemes of each string.
124
+ # Subsequent values are the cumulative phonetic distance between each
125
+ # phoneme within the same string.
126
+ # "aek" -> [0, 1, 1.61, 2.61]
127
+ def initial_distances(str1, str2)
128
+ if len1 == 0 || len2 == 0
129
+ starting_distance = 0
130
+ else
131
+ starting_distance = Phonetics.distance(str1[0], str2[0])
132
+ end
133
+
134
+ distances1 = (1..(str1.length-1)).reduce([0, starting_distance]) do |acc, i|
135
+ acc << acc.last + Phonetics.distance(str1[i-1], str1[i])
136
+ end
137
+ distances2 = (1..(str2.length-1)).reduce([0, starting_distance]) do |acc, i|
138
+ acc << acc.last + Phonetics.distance(str2[i-1], str2[i])
139
+ end
140
+
141
+ [ distances1, distances2 ]
142
+ end
143
+
144
+ def prepare_matrix
145
+ str1_initial, str2_initial = initial_distances(str1, str2)
146
+
147
+ @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) { nil } }
148
+ # The first row is the initial values for str1
149
+ @matrix[0] = str1_initial
150
+ # The first column is the initial values for str2
151
+ (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
152
+ end
153
+
154
+ # This is a helper method for developers to use when exploring this
155
+ # algorithm.
156
+ def print_matrix
157
+ puts " #{str1.chars.map {|c| c.ljust(9, " ") }.join}"
158
+ matrix.each_with_index do |row, ridx|
159
+ print ' ' if ridx == 0
160
+ print "#{str2[ridx - 1]} " if ridx > 0
161
+ row.each_with_index do |cell, cidx|
162
+ cell ||= 0.0
163
+ print cell.to_s[0, 8].ljust(8, '0')
164
+ print ' '
165
+ end
166
+ puts ''
167
+ end
168
+ ''
169
+ end
170
+ end
171
+ end
@@ -16,8 +16,11 @@ Gem::Specification.new do |spec|
16
16
  end
17
17
  spec.require_paths = ["lib"]
18
18
 
19
+ spec.add_development_dependency "pry-byebug"
20
+ spec.add_development_dependency "rake-compiler", "~> 1.0"
21
+ spec.add_development_dependency "rubocop", "~> 0.52"
22
+ spec.add_development_dependency "ruby-prof", "~> 0.17"
19
23
  spec.add_development_dependency 'bundler', '~> 1.16'
20
24
  spec.add_development_dependency 'rake'
21
- spec.add_development_dependency "pry-byebug"
22
25
  spec.add_development_dependency 'rspec', '~> 3.0'
23
26
  end
metadata CHANGED
@@ -1,45 +1,87 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-04-13 00:00:00.000000000 Z
11
+ date: 2019-08-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: pry-byebug
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - "~>"
18
32
  - !ruby/object:Gem::Version
19
- version: '1.16'
33
+ version: '1.0'
20
34
  type: :development
21
35
  prerelease: false
22
36
  version_requirements: !ruby/object:Gem::Requirement
23
37
  requirements:
24
38
  - - "~>"
25
39
  - !ruby/object:Gem::Version
26
- version: '1.16'
40
+ version: '1.0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: rake
42
+ name: rubocop
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
- - - ">="
45
+ - - "~>"
32
46
  - !ruby/object:Gem::Version
33
- version: '0'
47
+ version: '0.52'
34
48
  type: :development
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
- - - ">="
52
+ - - "~>"
39
53
  - !ruby/object:Gem::Version
40
- version: '0'
54
+ version: '0.52'
41
55
  - !ruby/object:Gem::Dependency
42
- name: pry-byebug
56
+ name: ruby-prof
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.17'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.17'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.16'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.16'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
43
85
  requirement: !ruby/object:Gem::Requirement
44
86
  requirements:
45
87
  - - ">="
@@ -73,6 +115,7 @@ executables: []
73
115
  extensions: []
74
116
  extra_rdoc_files: []
75
117
  files:
118
+ - ".github/workflows/gempush.yml"
76
119
  - ".gitignore"
77
120
  - ".rspec"
78
121
  - ".travis.yml"
@@ -83,8 +126,14 @@ files:
83
126
  - README.md
84
127
  - Rakefile
85
128
  - VERSION
129
+ - ext/c_levenshtein/extconf.rb
130
+ - ext/c_levenshtein/levenshtein.c
131
+ - ext/c_levenshtein/phonetic_cost.c
132
+ - ext/c_levenshtein/phonetic_cost.h
86
133
  - lib/phonetics.rb
134
+ - lib/phonetics/c_levenshtein.bundle
87
135
  - lib/phonetics/levenshtein.rb
136
+ - lib/phonetics/ruby_levenshtein.rb
88
137
  - lib/phonetics/version.rb
89
138
  - phonetics.gemspec
90
139
  homepage: https://github.com/JackDanger/phonetics
@@ -106,8 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
155
  - !ruby/object:Gem::Version
107
156
  version: '0'
108
157
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.7.6
158
+ rubygems_version: 3.0.3
111
159
  signing_key:
112
160
  specification_version: 4
113
161
  summary: tools for linguistic code using the International Phonetic Alphabet