phonetics 1.8.0 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Dockerfile +15 -0
- data/Gemfile +2 -0
- data/Makefile +6 -0
- data/README.md +36 -1
- data/VERSION +1 -1
- data/_site/{orthographic_example.png → orthographic_levenshtein_example.png} +0 -0
- data/_site/{phonetic_example.png → phonetic_levenshtein_example.png} +0 -0
- data/_site/vowel_chart_b_words.jpg +0 -0
- data/ext/c_levenshtein/levenshtein.c +47 -43
- data/ext/c_levenshtein/next_phoneme_length.c +1043 -1043
- data/ext/c_levenshtein/phonetic_cost.c +74732 -131159
- data/ext/c_levenshtein/phonetic_cost.h +1 -1
- data/lib/phonetics/code_generator.rb +35 -98
- data/lib/phonetics/ruby_levenshtein.rb +1 -1
- data/phonetics.gemspec +1 -0
- metadata +21 -4
@@ -1 +1 @@
|
|
1
|
-
float phonetic_cost(
|
1
|
+
float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
|
@@ -25,6 +25,10 @@ module Phonetics
|
|
25
25
|
|
26
26
|
private
|
27
27
|
|
28
|
+
def binary(str)
|
29
|
+
"0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
|
30
|
+
end
|
31
|
+
|
28
32
|
# Turn the bytes of all phonemes into a lookup trie where a sequence of
|
29
33
|
# bytes can find a phoneme in linear time.
|
30
34
|
def phoneme_byte_trie
|
@@ -52,13 +56,8 @@ module Phonetics
|
|
52
56
|
end
|
53
57
|
end
|
54
58
|
|
55
|
-
def
|
56
|
-
|
57
|
-
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
58
|
-
end
|
59
|
-
|
60
|
-
def describe(phoneme, depth)
|
61
|
-
indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
|
59
|
+
def describe(phoneme, depth = 0)
|
60
|
+
indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
|
62
61
|
if Phonetics::Consonants.features.key?(phoneme)
|
63
62
|
indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
|
64
63
|
else
|
@@ -66,8 +65,13 @@ module Phonetics
|
|
66
65
|
end
|
67
66
|
end
|
68
67
|
|
68
|
+
def ruby_source
|
69
|
+
location = caller_locations.first
|
70
|
+
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
71
|
+
end
|
72
|
+
|
69
73
|
def indent(depth, line)
|
70
|
-
write " #{'
|
74
|
+
write " #{' ' * depth}#{line}"
|
71
75
|
end
|
72
76
|
|
73
77
|
def write(line)
|
@@ -88,20 +92,15 @@ module Phonetics
|
|
88
92
|
# This will print a C code file with a function that implements a multi-level C
|
89
93
|
# switch like the following:
|
90
94
|
#
|
91
|
-
# switch (
|
92
|
-
# case
|
93
|
-
# switch(string1[1]) {
|
94
|
-
# case 201: // first byte of "ɪ"
|
95
|
-
# switch(string1[3]) {
|
96
|
-
# case 170: // second and final byte of "ɪ"
|
97
|
-
# // Phoneme: "ɪ", bytes: [201, 170]
|
95
|
+
# switch (phoneme1) {
|
96
|
+
# case 'ɪ': // two bytes: [201, 170]
|
98
97
|
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
102
|
-
#
|
103
|
-
#
|
104
|
-
#
|
98
|
+
#
|
99
|
+
# switch(phoneme2) {
|
100
|
+
# case 'i': // one byte: [105]
|
101
|
+
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
102
|
+
# return (float) 0.14355381904337383;
|
103
|
+
# break;
|
105
104
|
#
|
106
105
|
# the distance of ("ɪ", "i") is therefore 0.14355
|
107
106
|
#
|
@@ -109,92 +108,30 @@ module Phonetics
|
|
109
108
|
write(<<-HEADER.gsub(/^ {6}/, ''))
|
110
109
|
|
111
110
|
// This is compiled from Ruby, in #{ruby_source}
|
112
|
-
#include <
|
113
|
-
|
114
|
-
#include "./phonemes.h"
|
115
|
-
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
|
111
|
+
#include <stdint.h>
|
112
|
+
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
116
113
|
|
117
114
|
HEADER
|
118
115
|
|
119
|
-
write ' switch (
|
120
|
-
|
121
|
-
write " case #{
|
122
|
-
|
123
|
-
write
|
116
|
+
write ' switch (phoneme1) {'
|
117
|
+
Phonetics.phonemes.each do |phoneme1|
|
118
|
+
write " case #{binary(phoneme1)}:"
|
119
|
+
describe(phoneme1, 2)
|
120
|
+
write " switch(phoneme2) {"
|
121
|
+
Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
|
122
|
+
write " case #{binary(phoneme2)}:"
|
123
|
+
describe(phoneme2, 6)
|
124
|
+
write " return (float) #{distance};"
|
125
|
+
write ' break;'
|
126
|
+
end
|
127
|
+
write " }"
|
128
|
+
write ' break;'
|
124
129
|
end
|
125
130
|
write ' }'
|
126
131
|
write ' return (float) 1.0;'
|
127
132
|
write '};'
|
128
133
|
write ''
|
129
134
|
end
|
130
|
-
|
131
|
-
def switch_phoneme1(trie, depth = 0)
|
132
|
-
indent depth, "switch(string1[string1_offset + #{depth}]) {"
|
133
|
-
trie.each do |key, subtrie|
|
134
|
-
next if key == :source
|
135
|
-
next if subtrie.empty?
|
136
|
-
|
137
|
-
indent depth + 1, "case #{key}:"
|
138
|
-
|
139
|
-
phoneme1 = subtrie[:source]
|
140
|
-
|
141
|
-
# If this could be a match of a phoneme1 then find phoneme2
|
142
|
-
if phoneme1
|
143
|
-
# Add a comment to help understand the dataset
|
144
|
-
describe(phoneme1, depth + 2) if phoneme1
|
145
|
-
|
146
|
-
by_byte_length.each do |_, phonemes|
|
147
|
-
byte_trie = phoneme_byte_trie_for(phonemes)
|
148
|
-
next if byte_trie.empty?
|
149
|
-
|
150
|
-
switch_phoneme2(byte_trie, phoneme1, 0)
|
151
|
-
end
|
152
|
-
else
|
153
|
-
switch_phoneme1(subtrie, depth + 1)
|
154
|
-
end
|
155
|
-
|
156
|
-
indent depth + 2, 'break;'
|
157
|
-
end
|
158
|
-
indent depth, '}'
|
159
|
-
end
|
160
|
-
|
161
|
-
def switch_phoneme2(trie, previous_phoneme, depth = 0)
|
162
|
-
indent depth, "switch(string2[string2_offset + #{depth}]) {"
|
163
|
-
trie.each do |key, subtrie|
|
164
|
-
next if key == :source
|
165
|
-
next if subtrie.empty?
|
166
|
-
|
167
|
-
phoneme2 = subtrie[:source]
|
168
|
-
|
169
|
-
indent depth + 1, "case #{key}:"
|
170
|
-
|
171
|
-
if phoneme2
|
172
|
-
value = if previous_phoneme == phoneme2
|
173
|
-
0.0
|
174
|
-
else
|
175
|
-
distance(previous_phoneme, phoneme2)
|
176
|
-
end
|
177
|
-
# Add a comment to help understand the dataset
|
178
|
-
describe(phoneme2, depth + 2)
|
179
|
-
indent depth + 2, "return (float) #{value};"
|
180
|
-
else
|
181
|
-
switch_phoneme2(subtrie, previous_phoneme, depth + 1)
|
182
|
-
end
|
183
|
-
|
184
|
-
indent depth + 2, 'break;'
|
185
|
-
end
|
186
|
-
indent depth, '}'
|
187
|
-
end
|
188
|
-
|
189
|
-
def by_byte_length
|
190
|
-
Phonetics.phonemes.group_by do |phoneme|
|
191
|
-
phoneme.bytes.length
|
192
|
-
end.sort_by(&:first)
|
193
|
-
end
|
194
|
-
|
195
|
-
def distance(p1, p2)
|
196
|
-
Phonetics.distance_map[p1][p2]
|
197
|
-
end
|
198
135
|
end
|
199
136
|
|
200
137
|
class NextPhonemeLength < CodeGenerator
|
@@ -5,7 +5,7 @@ require_relative '../phonetics'
|
|
5
5
|
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
6
6
|
# count used instead of a binary edit distance calculation
|
7
7
|
#
|
8
|
-
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
8
|
+
# This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
|
9
9
|
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
10
10
|
# The implementation is modified based on "Using Phonologically Weighted
|
11
11
|
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
data/phonetics.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: ruby-prof
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description: tools for linguistic code using the International Phonetic Alphabet
|
98
112
|
email:
|
99
113
|
- github@jackcanty.com
|
@@ -108,13 +122,16 @@ files:
|
|
108
122
|
- ".rubocop.yml"
|
109
123
|
- ".travis.yml"
|
110
124
|
- CODE_OF_CONDUCT.md
|
125
|
+
- Dockerfile
|
111
126
|
- Gemfile
|
112
127
|
- LICENSE.txt
|
128
|
+
- Makefile
|
113
129
|
- README.md
|
114
130
|
- Rakefile
|
115
131
|
- VERSION
|
116
|
-
- _site/
|
117
|
-
- _site/
|
132
|
+
- _site/orthographic_levenshtein_example.png
|
133
|
+
- _site/phonetic_levenshtein_example.png
|
134
|
+
- _site/vowel_chart_b_words.jpg
|
118
135
|
- bin/console
|
119
136
|
- ext/c_levenshtein/extconf.rb
|
120
137
|
- ext/c_levenshtein/levenshtein.c
|