phonetics 1.8.0 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1 +1 @@
1
- float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
1
+ float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
@@ -25,6 +25,10 @@ module Phonetics
25
25
 
26
26
  private
27
27
 
28
+ def binary(str)
29
+ "0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
30
+ end
31
+
28
32
  # Turn the bytes of all phonemes into a lookup trie where a sequence of
29
33
  # bytes can find a phoneme in linear time.
30
34
  def phoneme_byte_trie
@@ -52,13 +56,8 @@ module Phonetics
52
56
  end
53
57
  end
54
58
 
55
- def ruby_source
56
- location = caller_locations.first
57
- "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
58
- end
59
-
60
- def describe(phoneme, depth)
61
- indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
59
+ def describe(phoneme, depth = 0)
60
+ indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
62
61
  if Phonetics::Consonants.features.key?(phoneme)
63
62
  indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
64
63
  else
@@ -66,8 +65,13 @@ module Phonetics
66
65
  end
67
66
  end
68
67
 
68
+ def ruby_source
69
+ location = caller_locations.first
70
+ "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
71
+ end
72
+
69
73
  def indent(depth, line)
70
- write " #{' ' * depth}#{line}"
74
+ write " #{' ' * depth}#{line}"
71
75
  end
72
76
 
73
77
  def write(line)
@@ -88,20 +92,15 @@ module Phonetics
88
92
  # This will print a C code file with a function that implements a multi-level C
89
93
  # switch like the following:
90
94
  #
91
- # switch (phoneme1_length) {
92
- # case 2:
93
- # switch(string1[1]) {
94
- # case 201: // first byte of "ɪ"
95
- # switch(string1[3]) {
96
- # case 170: // second and final byte of "ɪ"
97
- # // Phoneme: "ɪ", bytes: [201, 170]
95
+ # switch (phoneme1) {
96
+ # case 'ɪ': // two bytes: [201, 170]
98
97
  # // vowel features: {"F1":300,"F2":2100,"rounded":false}
99
- # switch(string2[6]) {
100
- # case 105: // first and only byte of "i"
101
- # // Phoneme: "i", bytes: [105]
102
- # // vowel features: {"F1":240,"F2":2400,"rounded":false}
103
- # return (float) 0.14355381904337383;
104
- # break;
98
+ #
99
+ # switch(phoneme2) {
100
+ # case 'i': // one byte: [105]
101
+ # // vowel features: {"F1":240,"F2":2400,"rounded":false}
102
+ # return (float) 0.14355381904337383;
103
+ # break;
105
104
  #
106
105
  # the distance of ("ɪ", "i") is therefore 0.14355
107
106
  #
@@ -109,92 +108,30 @@ module Phonetics
109
108
  write(<<-HEADER.gsub(/^ {6}/, ''))
110
109
 
111
110
  // This is compiled from Ruby, in #{ruby_source}
112
- #include <stdbool.h>
113
- #include <stdio.h>
114
- #include "./phonemes.h"
115
- float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
111
+ #include <stdint.h>
112
+ float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
116
113
 
117
114
  HEADER
118
115
 
119
- write ' switch (phoneme1_length) {'
120
- by_byte_length.each do |length, phonemes|
121
- write " case #{length}:"
122
- switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
123
- write ' break;'
116
+ write ' switch (phoneme1) {'
117
+ Phonetics.phonemes.each do |phoneme1|
118
+ write " case #{binary(phoneme1)}:"
119
+ describe(phoneme1, 2)
120
+ write " switch(phoneme2) {"
121
+ Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
122
+ write " case #{binary(phoneme2)}:"
123
+ describe(phoneme2, 6)
124
+ write " return (float) #{distance};"
125
+ write ' break;'
126
+ end
127
+ write " }"
128
+ write ' break;'
124
129
  end
125
130
  write ' }'
126
131
  write ' return (float) 1.0;'
127
132
  write '};'
128
133
  write ''
129
134
  end
130
-
131
- def switch_phoneme1(trie, depth = 0)
132
- indent depth, "switch(string1[string1_offset + #{depth}]) {"
133
- trie.each do |key, subtrie|
134
- next if key == :source
135
- next if subtrie.empty?
136
-
137
- indent depth + 1, "case #{key}:"
138
-
139
- phoneme1 = subtrie[:source]
140
-
141
- # If this could be a match of a phoneme1 then find phoneme2
142
- if phoneme1
143
- # Add a comment to help understand the dataset
144
- describe(phoneme1, depth + 2) if phoneme1
145
-
146
- by_byte_length.each do |_, phonemes|
147
- byte_trie = phoneme_byte_trie_for(phonemes)
148
- next if byte_trie.empty?
149
-
150
- switch_phoneme2(byte_trie, phoneme1, 0)
151
- end
152
- else
153
- switch_phoneme1(subtrie, depth + 1)
154
- end
155
-
156
- indent depth + 2, 'break;'
157
- end
158
- indent depth, '}'
159
- end
160
-
161
- def switch_phoneme2(trie, previous_phoneme, depth = 0)
162
- indent depth, "switch(string2[string2_offset + #{depth}]) {"
163
- trie.each do |key, subtrie|
164
- next if key == :source
165
- next if subtrie.empty?
166
-
167
- phoneme2 = subtrie[:source]
168
-
169
- indent depth + 1, "case #{key}:"
170
-
171
- if phoneme2
172
- value = if previous_phoneme == phoneme2
173
- 0.0
174
- else
175
- distance(previous_phoneme, phoneme2)
176
- end
177
- # Add a comment to help understand the dataset
178
- describe(phoneme2, depth + 2)
179
- indent depth + 2, "return (float) #{value};"
180
- else
181
- switch_phoneme2(subtrie, previous_phoneme, depth + 1)
182
- end
183
-
184
- indent depth + 2, 'break;'
185
- end
186
- indent depth, '}'
187
- end
188
-
189
- def by_byte_length
190
- Phonetics.phonemes.group_by do |phoneme|
191
- phoneme.bytes.length
192
- end.sort_by(&:first)
193
- end
194
-
195
- def distance(p1, p2)
196
- Phonetics.distance_map[p1][p2]
197
- end
198
135
  end
199
136
 
200
137
  class NextPhonemeLength < CodeGenerator
@@ -5,7 +5,7 @@ require_relative '../phonetics'
5
5
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
6
6
  # count used instead of a binary edit distance calculation
7
7
  #
8
- # This implementation is almost entirely taken from the damerau-levenshtein gem
8
+ # This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
9
9
  # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
10
10
  # The implementation is modified based on "Using Phonologically Weighted
11
11
  # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
data/phonetics.gemspec CHANGED
@@ -26,4 +26,5 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency 'rake-compiler'
27
27
  spec.add_development_dependency 'rspec'
28
28
  spec.add_development_dependency 'rubocop'
29
+ spec.add_development_dependency 'ruby-prof'
29
30
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-05 00:00:00.000000000 Z
11
+ date: 2019-09-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: ruby-prof
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description: tools for linguistic code using the International Phonetic Alphabet
98
112
  email:
99
113
  - github@jackcanty.com
@@ -108,13 +122,16 @@ files:
108
122
  - ".rubocop.yml"
109
123
  - ".travis.yml"
110
124
  - CODE_OF_CONDUCT.md
125
+ - Dockerfile
111
126
  - Gemfile
112
127
  - LICENSE.txt
128
+ - Makefile
113
129
  - README.md
114
130
  - Rakefile
115
131
  - VERSION
116
- - _site/orthographic_example.png
117
- - _site/phonetic_example.png
132
+ - _site/orthographic_levenshtein_example.png
133
+ - _site/phonetic_levenshtein_example.png
134
+ - _site/vowel_chart_b_words.jpg
118
135
  - bin/console
119
136
  - ext/c_levenshtein/extconf.rb
120
137
  - ext/c_levenshtein/levenshtein.c