phonetics 1.8.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Dockerfile +15 -0
- data/Gemfile +2 -0
- data/Makefile +6 -0
- data/README.md +36 -1
- data/VERSION +1 -1
- data/_site/{orthographic_example.png → orthographic_levenshtein_example.png} +0 -0
- data/_site/{phonetic_example.png → phonetic_levenshtein_example.png} +0 -0
- data/_site/vowel_chart_b_words.jpg +0 -0
- data/ext/c_levenshtein/levenshtein.c +47 -43
- data/ext/c_levenshtein/next_phoneme_length.c +1043 -1043
- data/ext/c_levenshtein/phonetic_cost.c +74732 -131159
- data/ext/c_levenshtein/phonetic_cost.h +1 -1
- data/lib/phonetics/code_generator.rb +35 -98
- data/lib/phonetics/ruby_levenshtein.rb +1 -1
- data/phonetics.gemspec +1 -0
- metadata +21 -4
@@ -1 +1 @@
|
|
1
|
-
float phonetic_cost(
|
1
|
+
float phonetic_cost(int64_t phoneme1, int64_t phoneme2);
|
@@ -25,6 +25,10 @@ module Phonetics
|
|
25
25
|
|
26
26
|
private
|
27
27
|
|
28
|
+
def binary(str)
|
29
|
+
"0b#{str.bytes.map { |byte| byte.to_s(2).rjust(8, '0') }.join}"
|
30
|
+
end
|
31
|
+
|
28
32
|
# Turn the bytes of all phonemes into a lookup trie where a sequence of
|
29
33
|
# bytes can find a phoneme in linear time.
|
30
34
|
def phoneme_byte_trie
|
@@ -52,13 +56,8 @@ module Phonetics
|
|
52
56
|
end
|
53
57
|
end
|
54
58
|
|
55
|
-
def
|
56
|
-
|
57
|
-
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
58
|
-
end
|
59
|
-
|
60
|
-
def describe(phoneme, depth)
|
61
|
-
indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
|
59
|
+
def describe(phoneme, depth = 0)
|
60
|
+
indent depth, "// Phoneme: '#{phoneme}', bytes: #{phoneme.bytes.inspect}"
|
62
61
|
if Phonetics::Consonants.features.key?(phoneme)
|
63
62
|
indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
|
64
63
|
else
|
@@ -66,8 +65,13 @@ module Phonetics
|
|
66
65
|
end
|
67
66
|
end
|
68
67
|
|
68
|
+
def ruby_source
|
69
|
+
location = caller_locations.first
|
70
|
+
"#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
|
71
|
+
end
|
72
|
+
|
69
73
|
def indent(depth, line)
|
70
|
-
write " #{'
|
74
|
+
write " #{' ' * depth}#{line}"
|
71
75
|
end
|
72
76
|
|
73
77
|
def write(line)
|
@@ -88,20 +92,15 @@ module Phonetics
|
|
88
92
|
# This will print a C code file with a function that implements a multi-level C
|
89
93
|
# switch like the following:
|
90
94
|
#
|
91
|
-
# switch (
|
92
|
-
# case
|
93
|
-
# switch(string1[1]) {
|
94
|
-
# case 201: // first byte of "ɪ"
|
95
|
-
# switch(string1[3]) {
|
96
|
-
# case 170: // second and final byte of "ɪ"
|
97
|
-
# // Phoneme: "ɪ", bytes: [201, 170]
|
95
|
+
# switch (phoneme1) {
|
96
|
+
# case 'ɪ': // two bytes: [201, 170]
|
98
97
|
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
102
|
-
#
|
103
|
-
#
|
104
|
-
#
|
98
|
+
#
|
99
|
+
# switch(phoneme2) {
|
100
|
+
# case 'i': // one byte: [105]
|
101
|
+
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
102
|
+
# return (float) 0.14355381904337383;
|
103
|
+
# break;
|
105
104
|
#
|
106
105
|
# the distance of ("ɪ", "i") is therefore 0.14355
|
107
106
|
#
|
@@ -109,92 +108,30 @@ module Phonetics
|
|
109
108
|
write(<<-HEADER.gsub(/^ {6}/, ''))
|
110
109
|
|
111
110
|
// This is compiled from Ruby, in #{ruby_source}
|
112
|
-
#include <
|
113
|
-
|
114
|
-
#include "./phonemes.h"
|
115
|
-
float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
|
111
|
+
#include <stdint.h>
|
112
|
+
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
116
113
|
|
117
114
|
HEADER
|
118
115
|
|
119
|
-
write ' switch (
|
120
|
-
|
121
|
-
write " case #{
|
122
|
-
|
123
|
-
write
|
116
|
+
write ' switch (phoneme1) {'
|
117
|
+
Phonetics.phonemes.each do |phoneme1|
|
118
|
+
write " case #{binary(phoneme1)}:"
|
119
|
+
describe(phoneme1, 2)
|
120
|
+
write " switch(phoneme2) {"
|
121
|
+
Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
|
122
|
+
write " case #{binary(phoneme2)}:"
|
123
|
+
describe(phoneme2, 6)
|
124
|
+
write " return (float) #{distance};"
|
125
|
+
write ' break;'
|
126
|
+
end
|
127
|
+
write " }"
|
128
|
+
write ' break;'
|
124
129
|
end
|
125
130
|
write ' }'
|
126
131
|
write ' return (float) 1.0;'
|
127
132
|
write '};'
|
128
133
|
write ''
|
129
134
|
end
|
130
|
-
|
131
|
-
def switch_phoneme1(trie, depth = 0)
|
132
|
-
indent depth, "switch(string1[string1_offset + #{depth}]) {"
|
133
|
-
trie.each do |key, subtrie|
|
134
|
-
next if key == :source
|
135
|
-
next if subtrie.empty?
|
136
|
-
|
137
|
-
indent depth + 1, "case #{key}:"
|
138
|
-
|
139
|
-
phoneme1 = subtrie[:source]
|
140
|
-
|
141
|
-
# If this could be a match of a phoneme1 then find phoneme2
|
142
|
-
if phoneme1
|
143
|
-
# Add a comment to help understand the dataset
|
144
|
-
describe(phoneme1, depth + 2) if phoneme1
|
145
|
-
|
146
|
-
by_byte_length.each do |_, phonemes|
|
147
|
-
byte_trie = phoneme_byte_trie_for(phonemes)
|
148
|
-
next if byte_trie.empty?
|
149
|
-
|
150
|
-
switch_phoneme2(byte_trie, phoneme1, 0)
|
151
|
-
end
|
152
|
-
else
|
153
|
-
switch_phoneme1(subtrie, depth + 1)
|
154
|
-
end
|
155
|
-
|
156
|
-
indent depth + 2, 'break;'
|
157
|
-
end
|
158
|
-
indent depth, '}'
|
159
|
-
end
|
160
|
-
|
161
|
-
def switch_phoneme2(trie, previous_phoneme, depth = 0)
|
162
|
-
indent depth, "switch(string2[string2_offset + #{depth}]) {"
|
163
|
-
trie.each do |key, subtrie|
|
164
|
-
next if key == :source
|
165
|
-
next if subtrie.empty?
|
166
|
-
|
167
|
-
phoneme2 = subtrie[:source]
|
168
|
-
|
169
|
-
indent depth + 1, "case #{key}:"
|
170
|
-
|
171
|
-
if phoneme2
|
172
|
-
value = if previous_phoneme == phoneme2
|
173
|
-
0.0
|
174
|
-
else
|
175
|
-
distance(previous_phoneme, phoneme2)
|
176
|
-
end
|
177
|
-
# Add a comment to help understand the dataset
|
178
|
-
describe(phoneme2, depth + 2)
|
179
|
-
indent depth + 2, "return (float) #{value};"
|
180
|
-
else
|
181
|
-
switch_phoneme2(subtrie, previous_phoneme, depth + 1)
|
182
|
-
end
|
183
|
-
|
184
|
-
indent depth + 2, 'break;'
|
185
|
-
end
|
186
|
-
indent depth, '}'
|
187
|
-
end
|
188
|
-
|
189
|
-
def by_byte_length
|
190
|
-
Phonetics.phonemes.group_by do |phoneme|
|
191
|
-
phoneme.bytes.length
|
192
|
-
end.sort_by(&:first)
|
193
|
-
end
|
194
|
-
|
195
|
-
def distance(p1, p2)
|
196
|
-
Phonetics.distance_map[p1][p2]
|
197
|
-
end
|
198
135
|
end
|
199
136
|
|
200
137
|
class NextPhonemeLength < CodeGenerator
|
@@ -5,7 +5,7 @@ require_relative '../phonetics'
|
|
5
5
|
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
6
6
|
# count used instead of a binary edit distance calculation
|
7
7
|
#
|
8
|
-
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
8
|
+
# This Ruby implementation is almost entirely taken from the damerau-levenshtein gem
|
9
9
|
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
10
10
|
# The implementation is modified based on "Using Phonologically Weighted
|
11
11
|
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
data/phonetics.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: ruby-prof
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description: tools for linguistic code using the International Phonetic Alphabet
|
98
112
|
email:
|
99
113
|
- github@jackcanty.com
|
@@ -108,13 +122,16 @@ files:
|
|
108
122
|
- ".rubocop.yml"
|
109
123
|
- ".travis.yml"
|
110
124
|
- CODE_OF_CONDUCT.md
|
125
|
+
- Dockerfile
|
111
126
|
- Gemfile
|
112
127
|
- LICENSE.txt
|
128
|
+
- Makefile
|
113
129
|
- README.md
|
114
130
|
- Rakefile
|
115
131
|
- VERSION
|
116
|
-
- _site/
|
117
|
-
- _site/
|
132
|
+
- _site/orthographic_levenshtein_example.png
|
133
|
+
- _site/phonetic_levenshtein_example.png
|
134
|
+
- _site/vowel_chart_b_words.jpg
|
118
135
|
- bin/console
|
119
136
|
- ext/c_levenshtein/extconf.rb
|
120
137
|
- ext/c_levenshtein/levenshtein.c
|