phonetics 1.9.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/gempush.yml +3 -7
- data/.github/workflows/test.yml +23 -0
- data/Gemfile +0 -2
- data/VERSION +1 -1
- data/bin/gempush-if-changed +8 -0
- data/ext/c_levenshtein/levenshtein.c +33 -44
- data/ext/c_levenshtein/next_phoneme_length.c +2 -1
- data/ext/c_levenshtein/phonemes.c +21 -1
- data/ext/c_levenshtein/phonemes.h +1 -0
- data/ext/c_levenshtein/phonetic_cost.c +5 -0
- data/lib/phonetics/code_generator.rb +9 -3
- data/phonetics.gemspec +0 -1
- metadata +4 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32e8dd70cfccdfd6826023da278e2d04c75fc6e7658b5bc64af44830a0ae5668
|
4
|
+
data.tar.gz: 7dfd507a3fcb7914ee9eaba8049bc286068265399ee580a8c2ffb8ae8ecbd061
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10c9e15ec47748cf1c508538950fb090477e6762a584466bd9abcc7ffd81fff2497041adf77d190ba6adcb7ef73774502941faeff0edfcf048f7ac650336ab0a
|
7
|
+
data.tar.gz: 7cb06657f02ff1bad1547eb948b412876855d6e0967f927d44512e7d9b84b66e1d8c9ffb79f81e5001442d36bcbc37d8282aa80b3c1bbebecf25f2f0f3fb0849
|
@@ -1,16 +1,13 @@
|
|
1
1
|
name: Ruby Gem
|
2
2
|
|
3
3
|
on:
|
4
|
-
pull_request:
|
5
|
-
branches:
|
6
|
-
- master
|
7
4
|
push:
|
8
5
|
branches:
|
9
6
|
- master
|
10
7
|
|
11
8
|
jobs:
|
12
9
|
build:
|
13
|
-
name:
|
10
|
+
name: gem publishing
|
14
11
|
runs-on: ubuntu-latest
|
15
12
|
|
16
13
|
steps:
|
@@ -18,7 +15,7 @@ jobs:
|
|
18
15
|
- name: Set up Ruby 2.6
|
19
16
|
uses: actions/setup-ruby@v1
|
20
17
|
with:
|
21
|
-
version: 2.6.x
|
18
|
+
ruby-version: 2.6.x
|
22
19
|
|
23
20
|
- name: Publish to RubyGems
|
24
21
|
run: |
|
@@ -26,7 +23,6 @@ jobs:
|
|
26
23
|
touch $HOME/.gem/credentials
|
27
24
|
chmod 0600 $HOME/.gem/credentials
|
28
25
|
printf -- "---\n:rubygems_api_key: ${GEM_HOST_API_KEY}\n" > $HOME/.gem/credentials
|
29
|
-
|
30
|
-
gem push *.gem
|
26
|
+
bin/gempush-if-changed
|
31
27
|
env:
|
32
28
|
GEM_HOST_API_KEY: ${{secrets.RUBYGEMS_AUTH_TOKEN}}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
name: Tests
|
2
|
+
|
3
|
+
on: push
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
ci:
|
7
|
+
name: CI
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
strategy:
|
10
|
+
matrix:
|
11
|
+
ruby:
|
12
|
+
- '2.3.x'
|
13
|
+
- '2.4.x'
|
14
|
+
- '2.5.x'
|
15
|
+
- '2.6.x'
|
16
|
+
steps:
|
17
|
+
- uses: actions/checkout@master
|
18
|
+
- name: Setup ruby
|
19
|
+
uses: actions/setup-ruby@v1
|
20
|
+
with:
|
21
|
+
ruby-version: ${{ matrix.ruby }}
|
22
|
+
architecture: 'x64'
|
23
|
+
- run: gem install bundler && bundle && bundle exec rake
|
data/Gemfile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.1
|
@@ -7,7 +7,13 @@
|
|
7
7
|
#include "./next_phoneme_length.h"
|
8
8
|
#include "./phonetic_cost.h"
|
9
9
|
|
10
|
+
// #define DEBUG
|
11
|
+
|
12
|
+
#ifdef DEBUG
|
10
13
|
#define debug(M, ...) if (verbose) printf(M, ##__VA_ARGS__)
|
14
|
+
#else
|
15
|
+
#define debug(M, ...)
|
16
|
+
#endif
|
11
17
|
|
12
18
|
VALUE Binding = Qnil;
|
13
19
|
|
@@ -15,8 +21,8 @@ VALUE Binding = Qnil;
|
|
15
21
|
|
16
22
|
void Init_c_levenshtein();
|
17
23
|
|
18
|
-
void set_initial(
|
19
|
-
void print_matrix(
|
24
|
+
void set_initial(float *d, int string1_phoneme_count, uint64_t *string1_phonemes, int string2_phoneme_count, uint64_t *string2_phonemes, bool verbose);
|
25
|
+
void print_matrix(float *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose);
|
20
26
|
VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _string2, VALUE _verbose);
|
21
27
|
|
22
28
|
/* Function implemitations */
|
@@ -41,11 +47,11 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
41
47
|
int string1[string1_length + 1];
|
42
48
|
int string2[string2_length + 1];
|
43
49
|
|
44
|
-
|
50
|
+
float *d; // The (flattened) 2-dimensional matrix
|
45
51
|
// underlying this algorithm
|
46
52
|
|
47
|
-
|
48
|
-
|
53
|
+
float distance; // Return value of this function
|
54
|
+
float min, delete, // Reusable cost calculations
|
49
55
|
insert, replace,
|
50
56
|
cost;
|
51
57
|
int i, j; // Frequently overwritten loop vars
|
@@ -64,34 +70,22 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
64
70
|
}
|
65
71
|
|
66
72
|
find_phonemes(string1, string1_length, &string1_phoneme_count, string1_phoneme_sizes);
|
67
|
-
|
73
|
+
uint64_t string1_phonemes[string1_phoneme_count];
|
74
|
+
set_phonemes(string1_phonemes, string1, string1_phoneme_count, string1_phoneme_sizes);
|
68
75
|
|
69
76
|
find_phonemes(string2, string2_length, &string2_phoneme_count, string2_phoneme_sizes);
|
70
|
-
|
77
|
+
uint64_t string2_phonemes[string2_phoneme_count];
|
78
|
+
set_phonemes(string2_phonemes, string2, string2_phoneme_count, string2_phoneme_sizes);
|
71
79
|
|
72
80
|
// Guard clauses for empty strings
|
73
81
|
if (string1_phoneme_count == 0 && string2_phoneme_count == 0)
|
74
82
|
return DBL2NUM(0.0);
|
75
83
|
|
76
|
-
|
77
|
-
|
78
|
-
// These 64-bit words are how we implement the lookup table in phonetic_cost
|
79
|
-
int idx = 0;
|
80
|
-
for (i = 0; i < string1_phoneme_count; i++) {
|
81
|
-
for (j = 0; j < string1_phoneme_sizes[j]; j++) {
|
82
|
-
string1_phonemes[i] = (int) ( string1_phonemes[i] << 8 | string1[idx] );
|
83
|
-
idx++;
|
84
|
-
}
|
85
|
-
}
|
86
|
-
for (i = 0; i < string2_phoneme_count; i++) {
|
87
|
-
for (j = 0; j < string2_phoneme_sizes[j]; j++) {
|
88
|
-
string2_phonemes[i] = (int) ( string2_phonemes[i] << 8 | string2[idx] );
|
89
|
-
idx++;
|
90
|
-
}
|
91
|
-
}
|
84
|
+
debug("\n");
|
85
|
+
debug("distance between 0 and 1 of phoneme1: %f\n", phonetic_cost(string1_phonemes[0], string1_phonemes[1]));
|
92
86
|
|
93
87
|
// one-dimensional representation of 2 dimensional array
|
94
|
-
d = calloc((string1_phoneme_count+1) * (string2_phoneme_count+1), sizeof(
|
88
|
+
d = calloc((string1_phoneme_count+1) * (string2_phoneme_count+1), sizeof(float));
|
95
89
|
|
96
90
|
// First, set the top row and left column of the matrix using the sequential
|
97
91
|
// phonetic edit distance of string1 and string2, respectively
|
@@ -115,34 +109,34 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
115
109
|
// plus the phonetic distance between the sound we're moving from to the
|
116
110
|
// new one.
|
117
111
|
|
118
|
-
|
112
|
+
debug("------- %d/%d (%d) \n", i, j, j*(string1_phoneme_count+1) + i);
|
119
113
|
|
120
114
|
cost = phonetic_cost(string1_phonemes[i-1], string2_phonemes[j-1]);
|
121
115
|
|
122
116
|
insert = d[j*(string1_phoneme_count+1) + i-1];
|
123
|
-
|
117
|
+
debug("insert proposes cell %d,%d - %f\n", i-1, j, insert);
|
124
118
|
min = insert;
|
125
|
-
|
119
|
+
debug("min (insert): %f\n", min);
|
126
120
|
|
127
121
|
delete = d[(j-1)*(string1_phoneme_count+1) + i];
|
128
|
-
|
122
|
+
debug("delete proposes cell %d,%d - %f\n", i, j-1, delete);
|
129
123
|
if (delete < min) {
|
130
|
-
|
124
|
+
debug("delete is %f, better than %f for %d/%d\n", delete, min, i, j);
|
131
125
|
min = delete;
|
132
126
|
}
|
133
127
|
|
134
128
|
replace = d[(j-1)*(string1_phoneme_count+1) + i-1];
|
135
|
-
|
129
|
+
debug("replace proposes cell %d,%d - %f\n", i-1, j-1, replace);
|
136
130
|
if (replace < min) {
|
137
|
-
|
131
|
+
debug("replace is %f, better than %f for %d/%d\n", replace, min, i, j);
|
138
132
|
min = replace;
|
139
133
|
}
|
140
134
|
|
141
135
|
d[(j * (string1_phoneme_count+1)) + i] = min + cost;
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
136
|
+
debug("\n");
|
137
|
+
if (verbose) {
|
138
|
+
print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
|
139
|
+
}
|
146
140
|
|
147
141
|
}
|
148
142
|
}
|
@@ -166,9 +160,9 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
166
160
|
// Subsequent values are the cumulative phonetic distance between each
|
167
161
|
// phoneme within the same string.
|
168
162
|
// "aek" -> [0.0, 1.0, 1.61, 2.61]
|
169
|
-
void set_initial(
|
163
|
+
void set_initial(float *d, int string1_phoneme_count, uint64_t *string1_phonemes, int string2_phoneme_count, uint64_t *string2_phonemes, bool verbose) {
|
170
164
|
|
171
|
-
|
165
|
+
float initial_distance;
|
172
166
|
int i, j;
|
173
167
|
|
174
168
|
if (string1_phoneme_count == 0 || string2_phoneme_count == 0) {
|
@@ -178,7 +172,7 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
178
172
|
}
|
179
173
|
|
180
174
|
// The top-left is 0, the cell to the right and down are each 1 to start
|
181
|
-
d[0] = (
|
175
|
+
d[0] = (float) 0.0;
|
182
176
|
if (string1_phoneme_count > 0) {
|
183
177
|
d[1] = initial_distance;
|
184
178
|
}
|
@@ -186,16 +180,12 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
186
180
|
d[string1_phoneme_count+1] = initial_distance;
|
187
181
|
}
|
188
182
|
|
189
|
-
debug("string1 phoneme count: %d\n", string1_phoneme_count);
|
190
|
-
|
191
183
|
for (i=2; i <= string1_phoneme_count; i++) {
|
192
184
|
// The cost of adding the next phoneme is the cost so far plus the phonetic
|
193
185
|
// distance between the previous one and the current one.
|
194
186
|
d[i] = d[i-1] + phonetic_cost(string1_phonemes[i-2], string1_phonemes[i-1]);
|
195
187
|
}
|
196
188
|
|
197
|
-
debug("string2 phoneme count: %d\n", string2_phoneme_count);
|
198
|
-
|
199
189
|
for (j=2; j <= string2_phoneme_count; j++) {
|
200
190
|
// The same exact pattern down the left side of the matrix
|
201
191
|
d[j * (string1_phoneme_count+1)] = d[(j - 1) * (string1_phoneme_count+1)] + phonetic_cost(string2_phonemes[j-2], string2_phonemes[j-1]);
|
@@ -203,13 +193,12 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
203
193
|
}
|
204
194
|
|
205
195
|
// A handy visualization for developers
|
206
|
-
void print_matrix(
|
196
|
+
void print_matrix(float *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose) {
|
207
197
|
|
208
198
|
int i, j;
|
209
199
|
int string1_offset = 0;
|
210
200
|
int string2_offset = 0;
|
211
201
|
|
212
|
-
return;
|
213
202
|
if (!verbose)
|
214
203
|
return;
|
215
204
|
|
@@ -1,8 +1,13 @@
|
|
1
1
|
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <stdint.h>
|
2
4
|
#include "./next_phoneme_length.h"
|
5
|
+
|
3
6
|
void find_phonemes(int *string, int string_length, int *count, int *lengths) {
|
4
|
-
int i = 0;
|
5
7
|
int length;
|
8
|
+
int i;
|
9
|
+
|
10
|
+
i = 0;
|
6
11
|
while (i < string_length) {
|
7
12
|
length = next_phoneme_length(string, i, string_length);
|
8
13
|
if (length) {
|
@@ -14,6 +19,21 @@ void find_phonemes(int *string, int string_length, int *count, int *lengths) {
|
|
14
19
|
}
|
15
20
|
}
|
16
21
|
|
22
|
+
// Collect between 1 and 8 bytes of a phoneme into a single 64-bit word so we can compare two
|
23
|
+
// phonemes using just one instruction.
|
24
|
+
// These 64-bit words are how we implement the lookup table in phonetic_cost
|
25
|
+
void set_phonemes(uint64_t* phonemes, int* string, int count, int* lengths) {
|
26
|
+
int idx = 0;
|
27
|
+
int i, j;
|
28
|
+
for (i = 0; i < count; i++) {
|
29
|
+
phonemes[i] = 0;
|
30
|
+
for (j = 0; j < lengths[i]; j++) {
|
31
|
+
phonemes[i] = (uint64_t) ( phonemes[i] << 8 | string[idx] );
|
32
|
+
idx++;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
17
37
|
void print_phoneme(int *string, int offset, int length, int padding) {
|
18
38
|
int p;
|
19
39
|
int max = padding;
|
@@ -1,7 +1,12 @@
|
|
1
1
|
|
2
2
|
// This is compiled from Ruby, in phonetics/lib/phonetics/code_generator.rb:110
|
3
3
|
#include <stdint.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <inttypes.h>
|
4
6
|
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
7
|
+
if (phoneme1 == phoneme2) {
|
8
|
+
return (float) 0.0;
|
9
|
+
}
|
5
10
|
|
6
11
|
switch (phoneme1) {
|
7
12
|
case 0b01101001:
|
@@ -95,7 +95,7 @@ module Phonetics
|
|
95
95
|
# switch (phoneme1) {
|
96
96
|
# case 'ɪ': // two bytes: [201, 170]
|
97
97
|
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
98
|
-
#
|
98
|
+
#
|
99
99
|
# switch(phoneme2) {
|
100
100
|
# 'i': // one byte: [105]
|
101
101
|
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
@@ -109,7 +109,12 @@ module Phonetics
|
|
109
109
|
|
110
110
|
// This is compiled from Ruby, in #{ruby_source}
|
111
111
|
#include <stdint.h>
|
112
|
+
#include <stdio.h>
|
113
|
+
#include <inttypes.h>
|
112
114
|
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
115
|
+
if (phoneme1 == phoneme2) {
|
116
|
+
return (float) 0.0;
|
117
|
+
}
|
113
118
|
|
114
119
|
HEADER
|
115
120
|
|
@@ -117,14 +122,14 @@ module Phonetics
|
|
117
122
|
Phonetics.phonemes.each do |phoneme1|
|
118
123
|
write " case #{binary(phoneme1)}:"
|
119
124
|
describe(phoneme1, 2)
|
120
|
-
write
|
125
|
+
write ' switch(phoneme2) {'
|
121
126
|
Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
|
122
127
|
write " case #{binary(phoneme2)}:"
|
123
128
|
describe(phoneme2, 6)
|
124
129
|
write " return (float) #{distance};"
|
125
130
|
write ' break;'
|
126
131
|
end
|
127
|
-
write
|
132
|
+
write ' }'
|
128
133
|
write ' break;'
|
129
134
|
end
|
130
135
|
write ' }'
|
@@ -156,6 +161,7 @@ module Phonetics
|
|
156
161
|
def generate
|
157
162
|
write(<<-HEADER.gsub(/^ {6}/, ''))
|
158
163
|
// This is compiled from Ruby, in #{ruby_source}
|
164
|
+
#include <stdio.h>
|
159
165
|
int next_phoneme_length(int *string, int cursor, int length) {
|
160
166
|
|
161
167
|
int max_length;
|
data/phonetics.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: ruby-prof
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
97
|
description: tools for linguistic code using the International Phonetic Alphabet
|
112
98
|
email:
|
113
99
|
- github@jackcanty.com
|
@@ -117,6 +103,7 @@ extensions:
|
|
117
103
|
extra_rdoc_files: []
|
118
104
|
files:
|
119
105
|
- ".github/workflows/gempush.yml"
|
106
|
+
- ".github/workflows/test.yml"
|
120
107
|
- ".gitignore"
|
121
108
|
- ".rspec"
|
122
109
|
- ".rubocop.yml"
|
@@ -133,6 +120,7 @@ files:
|
|
133
120
|
- _site/phonetic_levenshtein_example.png
|
134
121
|
- _site/vowel_chart_b_words.jpg
|
135
122
|
- bin/console
|
123
|
+
- bin/gempush-if-changed
|
136
124
|
- ext/c_levenshtein/extconf.rb
|
137
125
|
- ext/c_levenshtein/levenshtein.c
|
138
126
|
- ext/c_levenshtein/next_phoneme_length.c
|