phonetics 1.8.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);
1
+ float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
@@ -25,6 +25,10 @@ module Phonetics
25
25
 
26
26
  private
27
27
 
28
+ def binary(str)
29
+ "0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
30
+ end
31
+
28
32
  # Turn the bytes of all phonemes into a lookup trie where a sequence of
29
33
  # bytes can find a phoneme in linear time.
30
34
  def phoneme_byte_trie
@@ -52,13 +56,8 @@ module Phonetics
52
56
  end
53
57
  end
54
58
 
55
- def ruby_source
56
- location = caller_locations.first
57
- "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
58
- end
59
-
60
- def describe(phoneme, depth)
61
- indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
59
+ def describe(phoneme, depth = 0)
60
+ indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
62
61
  if Phonetics::Consonants.features.key?(phoneme)
63
62
  indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
64
63
  else
@@ -66,8 +65,13 @@ module Phonetics
66
65
  end
67
66
  end
68
67
 
68
+ def ruby_source
69
+ location = caller_locations.first
70
+ "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
71
+ end
72
+
69
73
  def indent(depth, line)
70
- write " #{' ' * depth}#{line}"
74
+ write " #{' ' * depth}#{line}"
71
75
  end
72
76
 
73
77
  def write(line)
@@ -88,20 +92,15 @@ module Phonetics
88
92
  # This will print a C code file with a function that implements a multi-level C
89
93
  # switch like the following:
90
94
  #
91
- # switch (phoneme1_length) {
92
- # case 2:
93
- # switch(string1[1]) {
94
- # case 201: // first byte of "ɪ"
95
- # switch(string1[3]) {
96
- # case 170: // second and final byte of "ɪ"
97
- # // Phoneme: "ɪ", bytes: [201, 170]
95
+ # switch (phoneme1) {
96
+ # case 'ɪ': // two bytes: [201, 170]
98
97
  # // vowel features: {"F1":300,"F2":2100,"rounded":false}
99
- # switch(string2[6]) {
100
- # case 105: // first and only byte of "i"
101
- # // Phoneme: "i", bytes: [105]
102
- # // vowel features: {"F1":240,"F2":2400,"rounded":false}
103
- # return (float) 0.14355381904337383;
104
- # break;
98
+ #
99
+ # switch(phoneme2) {
100
+ # case 'i': // one byte: [105]
101
+ # // vowel features: {"F1":240,"F2":2400,"rounded":false}
102
+ # return (float) 0.14355381904337383;
103
+ # break;
105
104
  #
106
105
  # the distance of ("ɪ", "i") is therefore 0.14355
107
106
  #
@@ -109,92 +108,30 @@ module Phonetics
109
108
  write(<<-HEADER.gsub(/^ {6}/, ''))
110
109
 
111
110
  // This is compiled from Ruby, in #{ruby_source}
112
- #include <stdbool.h>
113
- #include <stdio.h>
114
- #include "./phonemes.h"
115
- float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
111
+ #include <stdint.h>
112
+ float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
116
113
 
117
114
  HEADER
118
115
 
119
- write ' switch (phoneme1_length) {'
120
- by_byte_length.each do |length, phonemes|
121
- write " case #{length}:"
122
- switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
123
- write ' break;'
116
+ write ' switch (phoneme1) {'
117
+ Phonetics.phonemes.each do |phoneme1|
118
+ write " case #{binary(phoneme1)}:"
119
+ describe(phoneme1, 2)
120
+ write " switch(phoneme2) {"
121
+ Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
122
+ write " case #{binary(phoneme2)}:"
123
+ describe(phoneme2, 6)
124
+ write " return (float) #{distance};"
125
+ write ' break;'
126
+ end
127
+ write " }"
128
+ write ' break;'
124
129
  end
125
130
  write ' }'
126
131
  write ' return (float) 1.0;'
127
132
  write '};'
128
133
  write ''
129
134
  end
130
-
131
- def switch_phoneme1(trie, depth = 0)
132
- indent depth, "switch(string1[string1_offset + #{depth}]) {"
133
- trie.each do |key, subtrie|
134
- next if key == :source
135
- next if subtrie.empty?
136
-
137
- indent depth + 1, "case #{key}:"
138
-
139
- phoneme1 = subtrie[:source]
140
-
141
- # If this could be a match of a phoneme1 then find phoneme2
142
- if phoneme1
143
- # Add a comment to help understand the dataset
144
- describe(phoneme1, depth + 2) if phoneme1
145
-
146
- by_byte_length.each do |_, phonemes|
147
- byte_trie = phoneme_byte_trie_for(phonemes)
148
- next if byte_trie.empty?
149
-
150
- switch_phoneme2(byte_trie, phoneme1, 0)
151
- end
152
- else
153
- switch_phoneme1(subtrie, depth + 1)
154
- end
155
-
156
- indent depth + 2, 'break;'
157
- end
158
- indent depth, '}'
159
- end
160
-
161
- def switch_phoneme2(trie, previous_phoneme, depth = 0)
162
- indent depth, "switch(string2[string2_offset + #{depth}]) {"
163
- trie.each do |key, subtrie|
164
- next if key == :source
165
- next if subtrie.empty?
166
-
167
- phoneme2 = subtrie[:source]
168
-
169
- indent depth + 1, "case #{key}:"
170
-
171
- if phoneme2
172
- value = if previous_phoneme == phoneme2
173
- 0.0
174
- else
175
- distance(previous_phoneme, phoneme2)
176
- end
177
- # Add a comment to help understand the dataset
178
- describe(phoneme2, depth + 2)
179
- indent depth + 2, "return (float) #{value};"
180
- else
181
- switch_phoneme2(subtrie, previous_phoneme, depth + 1)
182
- end
183
-
184
- indent depth + 2, 'break;'
185
- end
186
- indent depth, '}'
187
- end
188
-
189
- def by_byte_length
190
- Phonetics.phonemes.group_by do |phoneme|
191
- phoneme.bytes.length
192
- end.sort_by(&:first)
193
- end
194
-
195
- def distance(p1, p2)
196
- Phonetics.distance_map[p1][p2]
197
- end
198
135
  end
199
136
 
200
137
  class NextPhonemeLength < CodeGenerator
@@ -5,7 +5,7 @@ require_relative '../phonetics'
5
5
  # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
6
6
  # count used instead of a binary edit distance calculation
7
7
  #
8
- # This implementation is almost entirely taken from the damerau-levenshtein gem
8
+ # This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
9
9
  # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
10
10
  # The implementation is modified based on "Using Phonologically Weighted
11
11
  # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
data/phonetics.gemspec CHANGED
@@ -26,4 +26,5 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency 'rake-compiler'
27
27
  spec.add_development_dependency 'rspec'
28
28
  spec.add_development_dependency 'rubocop'
29
+ spec.add_development_dependency 'ruby-prof'
29
30
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-05 00:00:00.000000000 Z
11
+ date: 2019-09-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: ruby-prof
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description: tools for linguistic code using the International Phonetic Alphabet
98
112
  email:
99
113
  - github@jackcanty.com
@@ -108,13 +122,16 @@ files:
108
122
  - ".rubocop.yml"
109
123
  - ".travis.yml"
110
124
  - CODE_OF_CONDUCT.md
125
+ - Dockerfile
111
126
  - Gemfile
112
127
  - LICENSE.txt
128
+ - Makefile
113
129
  - README.md
114
130
  - Rakefile
115
131
  - VERSION
116
- - _site/orthographic_example.png
117
- - _site/phonetic_example.png
132
+ - _site/orthographic_levenshtein_example.png
133
+ - _site/phonetic_levenshtein_example.png
134
+ - _site/vowel_chart_b_words.jpg
118
135
  - bin/console
119
136
  - ext/c_levenshtein/extconf.rb
120
137
  - ext/c_levenshtein/levenshtein.c