phonetics 1.9.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/gempush.yml +3 -7
- data/.github/workflows/test.yml +23 -0
- data/Gemfile +0 -2
- data/VERSION +1 -1
- data/bin/gempush-if-changed +8 -0
- data/ext/c_levenshtein/levenshtein.c +33 -44
- data/ext/c_levenshtein/next_phoneme_length.c +2 -1
- data/ext/c_levenshtein/phonemes.c +21 -1
- data/ext/c_levenshtein/phonemes.h +1 -0
- data/ext/c_levenshtein/phonetic_cost.c +5 -0
- data/lib/phonetics/code_generator.rb +9 -3
- data/phonetics.gemspec +0 -1
- metadata +4 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32e8dd70cfccdfd6826023da278e2d04c75fc6e7658b5bc64af44830a0ae5668
|
4
|
+
data.tar.gz: 7dfd507a3fcb7914ee9eaba8049bc286068265399ee580a8c2ffb8ae8ecbd061
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 10c9e15ec47748cf1c508538950fb090477e6762a584466bd9abcc7ffd81fff2497041adf77d190ba6adcb7ef73774502941faeff0edfcf048f7ac650336ab0a
|
7
|
+
data.tar.gz: 7cb06657f02ff1bad1547eb948b412876855d6e0967f927d44512e7d9b84b66e1d8c9ffb79f81e5001442d36bcbc37d8282aa80b3c1bbebecf25f2f0f3fb0849
|
@@ -1,16 +1,13 @@
|
|
1
1
|
name: Ruby Gem
|
2
2
|
|
3
3
|
on:
|
4
|
-
pull_request:
|
5
|
-
branches:
|
6
|
-
- master
|
7
4
|
push:
|
8
5
|
branches:
|
9
6
|
- master
|
10
7
|
|
11
8
|
jobs:
|
12
9
|
build:
|
13
|
-
name:
|
10
|
+
name: gem publishing
|
14
11
|
runs-on: ubuntu-latest
|
15
12
|
|
16
13
|
steps:
|
@@ -18,7 +15,7 @@ jobs:
|
|
18
15
|
- name: Set up Ruby 2.6
|
19
16
|
uses: actions/setup-ruby@v1
|
20
17
|
with:
|
21
|
-
version: 2.6.x
|
18
|
+
ruby-version: 2.6.x
|
22
19
|
|
23
20
|
- name: Publish to RubyGems
|
24
21
|
run: |
|
@@ -26,7 +23,6 @@ jobs:
|
|
26
23
|
touch $HOME/.gem/credentials
|
27
24
|
chmod 0600 $HOME/.gem/credentials
|
28
25
|
printf -- "---\n:rubygems_api_key: ${GEM_HOST_API_KEY}\n" > $HOME/.gem/credentials
|
29
|
-
|
30
|
-
gem push *.gem
|
26
|
+
bin/gempush-if-changed
|
31
27
|
env:
|
32
28
|
GEM_HOST_API_KEY: ${{secrets.RUBYGEMS_AUTH_TOKEN}}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
name: Tests
|
2
|
+
|
3
|
+
on: push
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
ci:
|
7
|
+
name: CI
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
strategy:
|
10
|
+
matrix:
|
11
|
+
ruby:
|
12
|
+
- '2.3.x'
|
13
|
+
- '2.4.x'
|
14
|
+
- '2.5.x'
|
15
|
+
- '2.6.x'
|
16
|
+
steps:
|
17
|
+
- uses: actions/checkout@master
|
18
|
+
- name: Setup ruby
|
19
|
+
uses: actions/setup-ruby@v1
|
20
|
+
with:
|
21
|
+
ruby-version: ${{ matrix.ruby }}
|
22
|
+
architecture: 'x64'
|
23
|
+
- run: gem install bundler && bundle && bundle exec rake
|
data/Gemfile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.1
|
@@ -7,7 +7,13 @@
|
|
7
7
|
#include "./next_phoneme_length.h"
|
8
8
|
#include "./phonetic_cost.h"
|
9
9
|
|
10
|
+
// #define DEBUG
|
11
|
+
|
12
|
+
#ifdef DEBUG
|
10
13
|
#define debug(M, ...) if (verbose) printf(M, ##__VA_ARGS__)
|
14
|
+
#else
|
15
|
+
#define debug(M, ...)
|
16
|
+
#endif
|
11
17
|
|
12
18
|
VALUE Binding = Qnil;
|
13
19
|
|
@@ -15,8 +21,8 @@ VALUE Binding = Qnil;
|
|
15
21
|
|
16
22
|
void Init_c_levenshtein();
|
17
23
|
|
18
|
-
void set_initial(
|
19
|
-
void print_matrix(
|
24
|
+
void set_initial(float *d, int string1_phoneme_count, uint64_t *string1_phonemes, int string2_phoneme_count, uint64_t *string2_phonemes, bool verbose);
|
25
|
+
void print_matrix(float *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose);
|
20
26
|
VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _string2, VALUE _verbose);
|
21
27
|
|
22
28
|
/* Function implementations */
|
@@ -41,11 +47,11 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
41
47
|
int string1[string1_length + 1];
|
42
48
|
int string2[string2_length + 1];
|
43
49
|
|
44
|
-
|
50
|
+
float *d; // The (flattened) 2-dimensional matrix
|
45
51
|
// underlying this algorithm
|
46
52
|
|
47
|
-
|
48
|
-
|
53
|
+
float distance; // Return value of this function
|
54
|
+
float min, delete, // Reusable cost calculations
|
49
55
|
insert, replace,
|
50
56
|
cost;
|
51
57
|
int i, j; // Frequently overwritten loop vars
|
@@ -64,34 +70,22 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
64
70
|
}
|
65
71
|
|
66
72
|
find_phonemes(string1, string1_length, &string1_phoneme_count, string1_phoneme_sizes);
|
67
|
-
|
73
|
+
uint64_t string1_phonemes[string1_phoneme_count];
|
74
|
+
set_phonemes(string1_phonemes, string1, string1_phoneme_count, string1_phoneme_sizes);
|
68
75
|
|
69
76
|
find_phonemes(string2, string2_length, &string2_phoneme_count, string2_phoneme_sizes);
|
70
|
-
|
77
|
+
uint64_t string2_phonemes[string2_phoneme_count];
|
78
|
+
set_phonemes(string2_phonemes, string2, string2_phoneme_count, string2_phoneme_sizes);
|
71
79
|
|
72
80
|
// Guard clauses for empty strings
|
73
81
|
if (string1_phoneme_count == 0 && string2_phoneme_count == 0)
|
74
82
|
return DBL2NUM(0.0);
|
75
83
|
|
76
|
-
|
77
|
-
|
78
|
-
// These 64-bit words are how we implement the lookup table in phonetic_cost
|
79
|
-
int idx = 0;
|
80
|
-
for (i = 0; i < string1_phoneme_count; i++) {
|
81
|
-
for (j = 0; j < string1_phoneme_sizes[j]; j++) {
|
82
|
-
string1_phonemes[i] = (int) ( string1_phonemes[i] << 8 | string1[idx] );
|
83
|
-
idx++;
|
84
|
-
}
|
85
|
-
}
|
86
|
-
for (i = 0; i < string2_phoneme_count; i++) {
|
87
|
-
for (j = 0; j < string2_phoneme_sizes[j]; j++) {
|
88
|
-
string2_phonemes[i] = (int) ( string2_phonemes[i] << 8 | string2[idx] );
|
89
|
-
idx++;
|
90
|
-
}
|
91
|
-
}
|
84
|
+
debug("\n");
|
85
|
+
debug("distance between 0 and 1 of phoneme1: %f\n", phonetic_cost(string1_phonemes[0], string1_phonemes[1]));
|
92
86
|
|
93
87
|
// one-dimensional representation of 2 dimensional array
|
94
|
-
d = calloc((string1_phoneme_count+1) * (string2_phoneme_count+1), sizeof(
|
88
|
+
d = calloc((string1_phoneme_count+1) * (string2_phoneme_count+1), sizeof(float));
|
95
89
|
|
96
90
|
// First, set the top row and left column of the matrix using the sequential
|
97
91
|
// phonetic edit distance of string1 and string2, respectively
|
@@ -115,34 +109,34 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
115
109
|
// plus the phonetic distance between the sound we're moving from to the
|
116
110
|
// new one.
|
117
111
|
|
118
|
-
|
112
|
+
debug("------- %d/%d (%d) \n", i, j, j*(string1_phoneme_count+1) + i);
|
119
113
|
|
120
114
|
cost = phonetic_cost(string1_phonemes[i-1], string2_phonemes[j-1]);
|
121
115
|
|
122
116
|
insert = d[j*(string1_phoneme_count+1) + i-1];
|
123
|
-
|
117
|
+
debug("insert proposes cell %d,%d - %f\n", i-1, j, insert);
|
124
118
|
min = insert;
|
125
|
-
|
119
|
+
debug("min (insert): %f\n", min);
|
126
120
|
|
127
121
|
delete = d[(j-1)*(string1_phoneme_count+1) + i];
|
128
|
-
|
122
|
+
debug("delete proposes cell %d,%d - %f\n", i, j-1, delete);
|
129
123
|
if (delete < min) {
|
130
|
-
|
124
|
+
debug("delete is %f, better than %f for %d/%d\n", delete, min, i, j);
|
131
125
|
min = delete;
|
132
126
|
}
|
133
127
|
|
134
128
|
replace = d[(j-1)*(string1_phoneme_count+1) + i-1];
|
135
|
-
|
129
|
+
debug("replace proposes cell %d,%d - %f\n", i-1, j-1, replace);
|
136
130
|
if (replace < min) {
|
137
|
-
|
131
|
+
debug("replace is %f, better than %f for %d/%d\n", replace, min, i, j);
|
138
132
|
min = replace;
|
139
133
|
}
|
140
134
|
|
141
135
|
d[(j * (string1_phoneme_count+1)) + i] = min + cost;
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
136
|
+
debug("\n");
|
137
|
+
if (verbose) {
|
138
|
+
print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
|
139
|
+
}
|
146
140
|
|
147
141
|
}
|
148
142
|
}
|
@@ -166,9 +160,9 @@ VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _strin
|
|
166
160
|
// Subsequent values are the cumulative phonetic distance between each
|
167
161
|
// phoneme within the same string.
|
168
162
|
// "aek" -> [0.0, 1.0, 1.61, 2.61]
|
169
|
-
void set_initial(
|
163
|
+
void set_initial(float *d, int string1_phoneme_count, uint64_t *string1_phonemes, int string2_phoneme_count, uint64_t *string2_phonemes, bool verbose) {
|
170
164
|
|
171
|
-
|
165
|
+
float initial_distance;
|
172
166
|
int i, j;
|
173
167
|
|
174
168
|
if (string1_phoneme_count == 0 || string2_phoneme_count == 0) {
|
@@ -178,7 +172,7 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
178
172
|
}
|
179
173
|
|
180
174
|
// The top-left is 0, the cell to the right and down are each 1 to start
|
181
|
-
d[0] = (
|
175
|
+
d[0] = (float) 0.0;
|
182
176
|
if (string1_phoneme_count > 0) {
|
183
177
|
d[1] = initial_distance;
|
184
178
|
}
|
@@ -186,16 +180,12 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
186
180
|
d[string1_phoneme_count+1] = initial_distance;
|
187
181
|
}
|
188
182
|
|
189
|
-
debug("string1 phoneme count: %d\n", string1_phoneme_count);
|
190
|
-
|
191
183
|
for (i=2; i <= string1_phoneme_count; i++) {
|
192
184
|
// The cost of adding the next phoneme is the cost so far plus the phonetic
|
193
185
|
// distance between the previous one and the current one.
|
194
186
|
d[i] = d[i-1] + phonetic_cost(string1_phonemes[i-2], string1_phonemes[i-1]);
|
195
187
|
}
|
196
188
|
|
197
|
-
debug("string2 phoneme count: %d\n", string2_phoneme_count);
|
198
|
-
|
199
189
|
for (j=2; j <= string2_phoneme_count; j++) {
|
200
190
|
// The same exact pattern down the left side of the matrix
|
201
191
|
d[j * (string1_phoneme_count+1)] = d[(j - 1) * (string1_phoneme_count+1)] + phonetic_cost(string2_phonemes[j-2], string2_phonemes[j-1]);
|
@@ -203,13 +193,12 @@ void set_initial(double *d, int string1_phoneme_count, int64_t *string1_phonemes
|
|
203
193
|
}
|
204
194
|
|
205
195
|
// A handy visualization for developers
|
206
|
-
void print_matrix(
|
196
|
+
void print_matrix(float *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose) {
|
207
197
|
|
208
198
|
int i, j;
|
209
199
|
int string1_offset = 0;
|
210
200
|
int string2_offset = 0;
|
211
201
|
|
212
|
-
return;
|
213
202
|
if (!verbose)
|
214
203
|
return;
|
215
204
|
|
@@ -1,8 +1,13 @@
|
|
1
1
|
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <stdint.h>
|
2
4
|
#include "./next_phoneme_length.h"
|
5
|
+
|
3
6
|
void find_phonemes(int *string, int string_length, int *count, int *lengths) {
|
4
|
-
int i = 0;
|
5
7
|
int length;
|
8
|
+
int i;
|
9
|
+
|
10
|
+
i = 0;
|
6
11
|
while (i < string_length) {
|
7
12
|
length = next_phoneme_length(string, i, string_length);
|
8
13
|
if (length) {
|
@@ -14,6 +19,21 @@ void find_phonemes(int *string, int string_length, int *count, int *lengths) {
|
|
14
19
|
}
|
15
20
|
}
|
16
21
|
|
22
|
+
// Collect between 1 and 8 bytes of a phoneme into a single 64-bit word so we can compare two
|
23
|
+
// phonemes using just one instruction.
|
24
|
+
// These 64-bit words are how we implement the lookup table in phonetic_cost
|
25
|
+
void set_phonemes(uint64_t* phonemes, int* string, int count, int* lengths) {
|
26
|
+
int idx = 0;
|
27
|
+
int i, j;
|
28
|
+
for (i = 0; i < count; i++) {
|
29
|
+
phonemes[i] = 0;
|
30
|
+
for (j = 0; j < lengths[i]; j++) {
|
31
|
+
phonemes[i] = (uint64_t) ( phonemes[i] << 8 | string[idx] );
|
32
|
+
idx++;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
17
37
|
void print_phoneme(int *string, int offset, int length, int padding) {
|
18
38
|
int p;
|
19
39
|
int max = padding;
|
@@ -1,7 +1,12 @@
|
|
1
1
|
|
2
2
|
// This is compiled from Ruby, in phonetics/lib/phonetics/code_generator.rb:110
|
3
3
|
#include <stdint.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <inttypes.h>
|
4
6
|
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
7
|
+
if (phoneme1 == phoneme2) {
|
8
|
+
return (float) 0.0;
|
9
|
+
}
|
5
10
|
|
6
11
|
switch (phoneme1) {
|
7
12
|
case 0b01101001:
|
@@ -95,7 +95,7 @@ module Phonetics
|
|
95
95
|
# switch (phoneme1) {
|
96
96
|
# case 'ɪ': // two bytes: [201, 170]
|
97
97
|
# // vowel features: {"F1":300,"F2":2100,"rounded":false}
|
98
|
-
#
|
98
|
+
#
|
99
99
|
# switch(phoneme2) {
|
100
100
|
# 'i': // one byte: [105]
|
101
101
|
# // vowel features: {"F1":240,"F2":2400,"rounded":false}
|
@@ -109,7 +109,12 @@ module Phonetics
|
|
109
109
|
|
110
110
|
// This is compiled from Ruby, in #{ruby_source}
|
111
111
|
#include <stdint.h>
|
112
|
+
#include <stdio.h>
|
113
|
+
#include <inttypes.h>
|
112
114
|
float phonetic_cost(int64_t phoneme1, int64_t phoneme2) {
|
115
|
+
if (phoneme1 == phoneme2) {
|
116
|
+
return (float) 0.0;
|
117
|
+
}
|
113
118
|
|
114
119
|
HEADER
|
115
120
|
|
@@ -117,14 +122,14 @@ module Phonetics
|
|
117
122
|
Phonetics.phonemes.each do |phoneme1|
|
118
123
|
write " case #{binary(phoneme1)}:"
|
119
124
|
describe(phoneme1, 2)
|
120
|
-
write
|
125
|
+
write ' switch(phoneme2) {'
|
121
126
|
Phonetics.distance_map[phoneme1].each do |phoneme2, distance|
|
122
127
|
write " case #{binary(phoneme2)}:"
|
123
128
|
describe(phoneme2, 6)
|
124
129
|
write " return (float) #{distance};"
|
125
130
|
write ' break;'
|
126
131
|
end
|
127
|
-
write
|
132
|
+
write ' }'
|
128
133
|
write ' break;'
|
129
134
|
end
|
130
135
|
write ' }'
|
@@ -156,6 +161,7 @@ module Phonetics
|
|
156
161
|
def generate
|
157
162
|
write(<<-HEADER.gsub(/^ {6}/, ''))
|
158
163
|
// This is compiled from Ruby, in #{ruby_source}
|
164
|
+
#include <stdio.h>
|
159
165
|
int next_phoneme_length(int *string, int cursor, int length) {
|
160
166
|
|
161
167
|
int max_length;
|
data/phonetics.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: phonetics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jack Danger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: ruby-prof
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
97
|
description: tools for linguistic code using the International Phonetic Alphabet
|
112
98
|
email:
|
113
99
|
- github@jackcanty.com
|
@@ -117,6 +103,7 @@ extensions:
|
|
117
103
|
extra_rdoc_files: []
|
118
104
|
files:
|
119
105
|
- ".github/workflows/gempush.yml"
|
106
|
+
- ".github/workflows/test.yml"
|
120
107
|
- ".gitignore"
|
121
108
|
- ".rspec"
|
122
109
|
- ".rubocop.yml"
|
@@ -133,6 +120,7 @@ files:
|
|
133
120
|
- _site/phonetic_levenshtein_example.png
|
134
121
|
- _site/vowel_chart_b_words.jpg
|
135
122
|
- bin/console
|
123
|
+
- bin/gempush-if-changed
|
136
124
|
- ext/c_levenshtein/extconf.rb
|
137
125
|
- ext/c_levenshtein/levenshtein.c
|
138
126
|
- ext/c_levenshtein/next_phoneme_length.c
|