phonetics 3.0.9 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -2
- data/CHANGELOG +4 -0
- data/Cargo.toml +27 -0
- data/Rakefile +58 -26
- data/VERSION +1 -1
- data/bin/phonetics +89 -0
- data/ext/phonetics_ruby/Cargo.toml +36 -0
- data/ext/phonetics_ruby/build.rs +24 -0
- data/ext/phonetics_ruby/extconf.rb +17 -0
- data/ext/phonetics_ruby/src/lib.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
- data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
- data/lib/phonetics.rb +77 -2
- data/phonetics.gemspec +33 -9
- metadata +46 -34
- data/.github/workflows/gempush.yml +0 -28
- data/.github/workflows/test.yml +0 -20
- data/Makefile +0 -6
- data/ext/c_levenshtein/extconf.rb +0 -10
- data/ext/c_levenshtein/levenshtein.c +0 -223
- data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
- data/ext/c_levenshtein/next_phoneme_length.h +0 -1
- data/ext/c_levenshtein/phonemes.c +0 -53
- data/ext/c_levenshtein/phonemes.h +0 -3
- data/ext/c_levenshtein/phonetic_cost.c +0 -88593
- data/ext/c_levenshtein/phonetic_cost.h +0 -1
- data/lib/phonetics/code_generator.rb +0 -228
- data/lib/phonetics/distances.rb +0 -245
- data/lib/phonetics/levenshtein.rb +0 -27
- data/lib/phonetics/ruby_levenshtein.rb +0 -162
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
name: Ruby Gem
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches:
|
|
6
|
-
- main
|
|
7
|
-
|
|
8
|
-
jobs:
|
|
9
|
-
build:
|
|
10
|
-
name: gem publishing
|
|
11
|
-
runs-on: ubuntu-latest
|
|
12
|
-
|
|
13
|
-
steps:
|
|
14
|
-
- uses: actions/checkout@master
|
|
15
|
-
- name: Set up Ruby 3.1
|
|
16
|
-
uses: ruby/setup-ruby@v1
|
|
17
|
-
with:
|
|
18
|
-
ruby-version: 3.1.1
|
|
19
|
-
|
|
20
|
-
- name: Publish to RubyGems
|
|
21
|
-
run: |
|
|
22
|
-
mkdir -p $HOME/.gem
|
|
23
|
-
touch $HOME/.gem/credentials
|
|
24
|
-
chmod 0600 $HOME/.gem/credentials
|
|
25
|
-
printf -- "---\n:rubygems_api_key: ${GEM_HOST_API_KEY}\n" > $HOME/.gem/credentials
|
|
26
|
-
bin/gempush-if-changed
|
|
27
|
-
env:
|
|
28
|
-
GEM_HOST_API_KEY: ${{secrets.RUBYGEMS_AUTH_TOKEN}}
|
data/.github/workflows/test.yml
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
name: Tests
|
|
2
|
-
|
|
3
|
-
on: push
|
|
4
|
-
|
|
5
|
-
jobs:
|
|
6
|
-
ci:
|
|
7
|
-
name: CI
|
|
8
|
-
runs-on: ubuntu-latest
|
|
9
|
-
strategy:
|
|
10
|
-
matrix:
|
|
11
|
-
ruby:
|
|
12
|
-
- '3.1.5'
|
|
13
|
-
- '3.3.1'
|
|
14
|
-
steps:
|
|
15
|
-
- uses: actions/checkout@v4
|
|
16
|
-
- uses: ruby/setup-ruby@v1
|
|
17
|
-
with:
|
|
18
|
-
ruby-version: ${{ matrix.ruby }}
|
|
19
|
-
architecture: 'x64'
|
|
20
|
-
- run: gem install bundler && bundle && bundle exec rake
|
data/Makefile
DELETED
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
#include <stdbool.h>
|
|
2
|
-
#include <stdint.h>
|
|
3
|
-
#include "ruby.h"
|
|
4
|
-
#include "ruby/encoding.h"
|
|
5
|
-
#include "ruby/re.h"
|
|
6
|
-
#include "./phonemes.h"
|
|
7
|
-
#include "./next_phoneme_length.h"
|
|
8
|
-
#include "./phonetic_cost.h"
|
|
9
|
-
|
|
10
|
-
// #define DEBUG
|
|
11
|
-
|
|
12
|
-
// Debug tracing helper.
//
// With DEBUG defined, debug(...) forwards its arguments to printf, but only
// when a variable named `verbose` in the calling scope is truthy. Without
// DEBUG it expands to an empty statement and its arguments are never
// evaluated.
//
// Both forms are wrapped in do { } while (0) so that `debug(...);` is always
// exactly one statement. The previous DEBUG form expanded to a bare
// `if (verbose) printf(...)`, which silently captured a following `else`
// when used inside an unbraced if/else.
#ifdef DEBUG
#define debug(M, ...) do { if (verbose) printf(M, ##__VA_ARGS__); } while (0)
#else
#define debug(M, ...) do { } while (0)
#endif
|
|
17
|
-
|
|
18
|
-
/* Ruby module object for this binding; starts as nil and is assigned in
 * Init_c_levenshtein. */
VALUE Binding = Qnil;

/* Forward declarations */

void Init_c_levenshtein();

void set_initial(
  float *d,
  int string1_phoneme_count,
  uint64_t *string1_phonemes,
  int string2_phoneme_count,
  uint64_t *string2_phonemes,
  bool verbose
);

void print_matrix(
  float *d,
  int *string1,
  int string1_phoneme_count,
  int *string1_phoneme_sizes,
  int *string2,
  int string2_phoneme_count,
  int *string2_phoneme_sizes,
  bool verbose
);

VALUE method_internal_phonetic_distance(
  VALUE self,
  VALUE _string1,
  VALUE _string2,
  VALUE _verbose
);
|
|
27
|
-
|
|
28
|
-
/* Function implementations */
|
|
29
|
-
|
|
30
|
-
void Init_c_levenshtein() {
|
|
31
|
-
Binding = rb_define_module("PhoneticsLevenshteinCBinding");
|
|
32
|
-
rb_define_method(Binding, "internal_phonetic_distance", method_internal_phonetic_distance, 3);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
// Ruby-callable implementation of internal_phonetic_distance.
//
// Computes a Levenshtein-style edit distance between two strings, using
// phonetic_cost() between phonemes as the edit cost instead of a constant 1.
//
// Parameters:
//   self      - receiver (unused)
//   _string1  - first string; must be a Ruby String
//   _string2  - second string; must be a Ruby String
//   _verbose  - enables matrix printing / debug tracing when truthy
//
// Returns the distance as a Ruby Float (0.0 when neither string yields any
// phonemes).
//
// Raises ArgumentError when either argument is not a String, or when the
// distance matrix for inputs of this size cannot be allocated.
VALUE method_internal_phonetic_distance(VALUE self, VALUE _string1, VALUE _string2, VALUE _verbose){
  // NOTE(review): this is a raw VALUE -> bool conversion, so any non-zero
  // VALUE switches verbose output on. Confirm callers only pass true/false.
  bool verbose = _verbose;

  float *d;                 // The (flattened) 2-dimensional matrix
                            // underlying this algorithm

  float distance;           // Return value of this function
  float min, delete,        // Reusable cost calculations
        insert, replace,
        cost;
  int i, j;                 // Frequently overwritten loop vars

  // Validate the argument types BEFORE reading any string internals:
  // RSTRING_LEN / RSTRING_PTR are only meaningful on String objects.
  if (!RB_TYPE_P(_string1, T_STRING)) {
    rb_raise(rb_eArgError, "must pass string as first argument");
  }
  if (!RB_TYPE_P(_string2, T_STRING)) {
    rb_raise(rb_eArgError, "must pass string as second argument");
  }

  int string1_length = (int) RSTRING_LEN(_string1);
  int string2_length = (int) RSTRING_LEN(_string2);

  // Given the input strings, we count the phonemes in each and store both the
  // total and, in a phoneme_sizes array, the length of each.
  int string1_phoneme_count = 0;
  int string2_phoneme_count = 0;
  int string1_phoneme_sizes[string1_length + 1];
  int string2_phoneme_sizes[string2_length + 1];
  int string1[string1_length + 1];
  int string2[string2_length + 1];

  // Widen each byte of the Ruby strings to an int (masked to 0..255).
  for (i = 0; i < string1_length; i++) {
    string1[i] = (RSTRING_PTR(_string1)[i] & 0xff);
  }
  for (i = 0; i < string2_length; i++) {
    string2[i] = RSTRING_PTR(_string2)[i] & 0xff;
  }

  // The "+ 1" keeps the variable-length arrays non-empty: a zero-length VLA
  // (which an empty string would otherwise produce) is undefined behaviour.
  find_phonemes(string1, string1_length, &string1_phoneme_count, string1_phoneme_sizes);
  uint64_t string1_phonemes[string1_phoneme_count + 1];
  set_phonemes(string1_phonemes, string1, string1_phoneme_count, string1_phoneme_sizes);

  find_phonemes(string2, string2_length, &string2_phoneme_count, string2_phoneme_sizes);
  uint64_t string2_phonemes[string2_phoneme_count + 1];
  set_phonemes(string2_phonemes, string2, string2_phoneme_count, string2_phoneme_sizes);

  // Guard clauses for empty strings
  if (string1_phoneme_count == 0 && string2_phoneme_count == 0)
    return DBL2NUM(0.0);

  debug("\n");
  // Only trace the first pair when there really are two phonemes to read.
  if (string1_phoneme_count >= 2) {
    debug("distance between 0 and 1 of phoneme1: %f\n", phonetic_cost(string1_phonemes[0], string1_phonemes[1]));
  }

  // one-dimensional representation of 2 dimensional array.
  // Dimensions and offsets are computed in size_t so that large inputs cannot
  // overflow int arithmetic.
  const size_t row_stride = (size_t) string1_phoneme_count + 1;
  const size_t row_count = (size_t) string2_phoneme_count + 1;

  if (row_count > SIZE_MAX / row_stride) {
    rb_raise(rb_eArgError, "strings are too long to compare");
  }

  d = calloc(row_stride * row_count, sizeof *d);
  if (d == NULL) {
    // The matrix size is driven entirely by the argument lengths, so report
    // the failure as an argument error rather than dereferencing NULL.
    rb_raise(rb_eArgError, "could not allocate distance matrix for strings this long");
  }

  // First, set the top row and left column of the matrix using the sequential
  // phonetic edit distance of string1 and string2, respectively
  set_initial(d, string1_phoneme_count, string1_phonemes, string2_phoneme_count, string2_phonemes, verbose);

  print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);

  // Then Fill in the (flattened) matrix using the Levenshtein algorithm so we can
  // pluck the lowest-cost edit distance (stored in the lower-right corner, in
  // this case the last spot in the array).
  // We'll use phonetic distance instead of '1' as the edit cost.
  //
  // (Skipping i=0 and j=0 because set_initial filled in all cells where i
  // or j are zero-valued)
  for (j = 1; j <= string2_phoneme_count; j++){
    const size_t row = (size_t) j * row_stride;   // offset of row j
    const size_t prev_row = row - row_stride;     // offset of row j-1

    for (i = 1; i <= string1_phoneme_count; i++){
      const size_t col = (size_t) i;

      // The cost of deletion or addition is the Levenshtein distance
      // calculation (the value in the cell to the left, upper-left, or above)
      // plus the phonetic distance between the sound we're moving from to the
      // new one.

      debug("------- %d/%d (%zu) \n", i, j, row + col);

      cost = phonetic_cost(string1_phonemes[i-1], string2_phonemes[j-1]);

      insert = d[row + col - 1];
      debug("insert proposes cell %d,%d - %f\n", i-1, j, insert);
      min = insert;
      debug("min (insert): %f\n", min);

      delete = d[prev_row + col];
      debug("delete proposes cell %d,%d - %f\n", i, j-1, delete);
      if (delete < min) {
        debug("delete is %f, better than %f for %d/%d\n", delete, min, i, j);
        min = delete;
      }

      replace = d[prev_row + col - 1];
      debug("replace proposes cell %d,%d - %f\n", i-1, j-1, replace);
      if (replace < min) {
        debug("replace is %f, better than %f for %d/%d\n", replace, min, i, j);
        min = replace;
      }

      d[row + col] = min + cost;
      debug("\n");
      if (verbose) {
        print_matrix(d, string1, string1_phoneme_count, string1_phoneme_sizes, string2, string2_phoneme_count, string2_phoneme_sizes, verbose);
      }

    }
  }

  // The final element in the `d` array is the value of the shortest path from
  // the top-left to the bottom-right of the matrix.
  distance = d[row_stride * row_count - 1];

  free(d);
  debug("distance: %f\n", distance);

  return DBL2NUM(distance);
}
|
|
153
|
-
|
|
154
|
-
// Seed the top row and left column of the flattened distance matrix `d`,
// which has (string1_phoneme_count + 1) cells per row.
//
//   - d[0] (top-left) is always zero.
//   - The first cell to the right and the first cell below are 1.0 when both
//     strings have phonemes, and 0.0 when either string has none.
//   - Every later cell in the top row / left column adds the phonetic cost
//     between the previous and the current phoneme of that same string.
//
// "aek" -> [0.0, 1.0, 1.61, 2.61]
//
// `verbose` is accepted for signature compatibility but not used here.
// Offsets are computed in size_t so that large matrices cannot overflow int.
void set_initial(float *d, int string1_phoneme_count, uint64_t *string1_phonemes, int string2_phoneme_count, uint64_t *string2_phonemes, bool verbose) {
  const size_t row_stride = (size_t) string1_phoneme_count + 1;
  float initial_distance;

  (void) verbose;

  if (string1_phoneme_count == 0 || string2_phoneme_count == 0) {
    initial_distance = 0.0;
  } else {
    initial_distance = 1.0;
  }

  // The top-left is 0, the cell to the right and down are each 1 to start
  d[0] = (float) 0.0;
  if (string1_phoneme_count > 0) {
    d[1] = initial_distance;
  }
  if (string2_phoneme_count > 0) {
    d[row_stride] = initial_distance;
  }

  for (int i = 2; i <= string1_phoneme_count; i++) {
    // The cost of adding the next phoneme is the cost so far plus the phonetic
    // distance between the previous one and the current one.
    d[i] = d[i-1] + phonetic_cost(string1_phonemes[i-2], string1_phonemes[i-1]);
  }

  for (int j = 2; j <= string2_phoneme_count; j++) {
    // The same exact pattern down the left side of the matrix
    const size_t row = (size_t) j * row_stride;
    d[row] = d[row - row_stride] + phonetic_cost(string2_phonemes[j-2], string2_phonemes[j-1]);
  }
}
|
|
194
|
-
|
|
195
|
-
// Developer aid: dumps the flattened distance matrix `d` to stdout, with the
// phonemes of string1 as a header line and each phoneme of string2 at the
// start of its row. Does nothing unless `verbose` is set.
void print_matrix(float *d, int *string1, int string1_phoneme_count, int *string1_phoneme_sizes, int *string2, int string2_phoneme_count, int *string2_phoneme_sizes, bool verbose) {
  if (!verbose) {
    return;
  }

  int col, row;
  int header_offset = 0;   // position of the next string1 phoneme
  int label_offset = 0;    // position of the next string2 phoneme

  // Header line: one label per string1 phoneme.
  printf(" ");
  for (col = 0; col < string1_phoneme_count; col++) {
    print_phoneme(string1, header_offset, string1_phoneme_sizes[col], 9);
    header_offset += string1_phoneme_sizes[col];
  }
  printf("\n");

  // One line per matrix row; row 0 has no string2 phoneme to label it.
  for (row = 0; row <= string2_phoneme_count; row++) {
    if (row > 0) {
      int label_size = string2_phoneme_sizes[row-1];
      print_phoneme(string2, label_offset, label_size, 2);
      label_offset += label_size;
    } else {
      printf(" ");
    }

    for (col = 0; col <= string1_phoneme_count; col++) {
      printf("%f ", d[row * (string1_phoneme_count+1) + col]) ;
    }
    printf("\n");
  }
}
|