phonetics 3.0.9 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -2
- data/CHANGELOG +4 -0
- data/Cargo.toml +27 -0
- data/Rakefile +58 -26
- data/VERSION +1 -1
- data/bin/phonetics +89 -0
- data/ext/phonetics_ruby/Cargo.toml +36 -0
- data/ext/phonetics_ruby/build.rs +24 -0
- data/ext/phonetics_ruby/extconf.rb +17 -0
- data/ext/phonetics_ruby/src/lib.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
- data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
- data/lib/phonetics.rb +77 -2
- data/phonetics.gemspec +33 -9
- metadata +46 -34
- data/.github/workflows/gempush.yml +0 -28
- data/.github/workflows/test.yml +0 -20
- data/Makefile +0 -6
- data/ext/c_levenshtein/extconf.rb +0 -10
- data/ext/c_levenshtein/levenshtein.c +0 -223
- data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
- data/ext/c_levenshtein/next_phoneme_length.h +0 -1
- data/ext/c_levenshtein/phonemes.c +0 -53
- data/ext/c_levenshtein/phonemes.h +0 -3
- data/ext/c_levenshtein/phonetic_cost.c +0 -88593
- data/ext/c_levenshtein/phonetic_cost.h +0 -1
- data/lib/phonetics/code_generator.rb +0 -228
- data/lib/phonetics/distances.rb +0 -245
- data/lib/phonetics/levenshtein.rb +0 -27
- data/lib/phonetics/ruby_levenshtein.rb +0 -162
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
//! IPA phoneme tokenizer.
|
|
2
|
+
//!
|
|
3
|
+
//! Walks an input string and emits a sequence of phoneme tokens. The
|
|
4
|
+
//! recognition is longest-prefix: multi-character atoms like /tʃ/,
|
|
5
|
+
//! /aɪ/, and /ɝ/ win over their single-character constituents.
|
|
6
|
+
//!
|
|
7
|
+
//! Diacritics absorb into the segment they modify — trailing
|
|
8
|
+
//! modifiers attach to the preceding base phoneme, stress marks
|
|
9
|
+
//! attach to the following one. Whitespace is skipped by default; in
|
|
10
|
+
//! boundary mode (used by the Confusion metric) each boundary character
|
|
11
|
+
//! (space, tab, `_`, or `|`) emits the `#` boundary token.
|
|
12
|
+
|
|
13
|
+
use std::collections::HashSet;
|
|
14
|
+
use std::sync::LazyLock;
|
|
15
|
+
|
|
16
|
+
use crate::{compounds, consonants, diacritics::Diacritic, symbols, vowels};
|
|
17
|
+
|
|
18
|
+
/// Set of every recognised phoneme symbol. Includes the boundary token
|
|
19
|
+
/// so longest-prefix matching can pick it up on raw `#` input.
|
|
20
|
+
pub static PHONEME_SET: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
|
|
21
|
+
let mut s: HashSet<&'static str> = HashSet::new();
|
|
22
|
+
for &p in vowels::INVENTORY {
|
|
23
|
+
s.insert(p);
|
|
24
|
+
}
|
|
25
|
+
for &p in consonants::INVENTORY {
|
|
26
|
+
s.insert(p);
|
|
27
|
+
}
|
|
28
|
+
for &p in compounds::INVENTORY {
|
|
29
|
+
s.insert(p);
|
|
30
|
+
}
|
|
31
|
+
s.insert(symbols::BOUNDARY_TOKEN);
|
|
32
|
+
s
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
/// Largest phoneme-symbol size in characters (not bytes). Used as the
|
|
36
|
+
/// upper bound for longest-prefix matching.
|
|
37
|
+
pub static MAX_PHONEME_CHARS: LazyLock<usize> = LazyLock::new(|| {
|
|
38
|
+
PHONEME_SET
|
|
39
|
+
.iter()
|
|
40
|
+
.map(|s| s.chars().count())
|
|
41
|
+
.max()
|
|
42
|
+
.unwrap_or(1)
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
/// True if `s` is a recognised phoneme symbol.
|
|
46
|
+
pub fn is_phoneme(s: &str) -> bool {
|
|
47
|
+
PHONEME_SET.contains(s)
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Characters that represent a word boundary in raw IPA input.
|
|
51
|
+
const BOUNDARY_CHARS: &[char] = &[' ', '\t', '_', '|'];
|
|
52
|
+
|
|
53
|
+
/// Tokenise an IPA string into a sequence of phoneme tokens.
|
|
54
|
+
///
|
|
55
|
+
/// When `boundaries` is true, each whitespace / boundary character
|
|
56
|
+
/// in the input emits the `#` token; otherwise they're skipped.
|
|
57
|
+
pub fn tokens(input: &str, boundaries: bool) -> Vec<String> {
|
|
58
|
+
let chars: Vec<char> = input.chars().collect();
|
|
59
|
+
let max_phoneme_size = *MAX_PHONEME_CHARS;
|
|
60
|
+
let mut out: Vec<String> = Vec::new();
|
|
61
|
+
let mut pending_prefix = String::new();
|
|
62
|
+
let mut idx = 0;
|
|
63
|
+
|
|
64
|
+
while idx < chars.len() {
|
|
65
|
+
let ch = chars[idx];
|
|
66
|
+
|
|
67
|
+
if BOUNDARY_CHARS.contains(&ch) {
|
|
68
|
+
if boundaries {
|
|
69
|
+
out.push(symbols::BOUNDARY_TOKEN.to_string());
|
|
70
|
+
}
|
|
71
|
+
idx += 1;
|
|
72
|
+
continue;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Stress marks bind forward; carry them onto the next emitted token.
|
|
76
|
+
if let Some(d) = Diacritic::from_char(ch) {
|
|
77
|
+
if d.is_leading() {
|
|
78
|
+
pending_prefix.push(ch);
|
|
79
|
+
idx += 1;
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Try longest-prefix match against the recognized inventory.
|
|
85
|
+
let mut matched: Option<String> = None;
|
|
86
|
+
let max = max_phoneme_size.min(chars.len() - idx);
|
|
87
|
+
for size in (1..=max).rev() {
|
|
88
|
+
let candidate: String = chars[idx..idx + size].iter().collect();
|
|
89
|
+
if is_phoneme(&candidate) {
|
|
90
|
+
matched = Some(candidate);
|
|
91
|
+
idx += size;
|
|
92
|
+
break;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if let Some(base) = matched {
|
|
97
|
+
let mut token = std::mem::take(&mut pending_prefix);
|
|
98
|
+
token.push_str(&base);
|
|
99
|
+
|
|
100
|
+
// Absorb any trailing diacritics that modify this phoneme.
|
|
101
|
+
while idx < chars.len() {
|
|
102
|
+
let next = chars[idx];
|
|
103
|
+
match Diacritic::from_char(next) {
|
|
104
|
+
Some(d) if !d.is_leading() => {
|
|
105
|
+
token.push(next);
|
|
106
|
+
idx += 1;
|
|
107
|
+
}
|
|
108
|
+
_ => break,
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
out.push(token);
|
|
112
|
+
} else {
|
|
113
|
+
// No recognised phoneme starts here; skip one character.
|
|
114
|
+
idx += 1;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
out
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a slice of literals into the owned vector `tokens` returns.
    fn owned(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| part.to_string()).collect()
    }

    #[test]
    fn matches_ruby_reference_tokenisations() {
        // Each row: input, expected tokens without boundaries, expected
        // tokens with boundaries. Outputs come from the Ruby implementation.
        let cases: &[(&str, &[&str], &[&str])] = &[
            ("kæt", &["k","æ","t"], &["k","æ","t"]),
            ("wətɛvɝ", &["w","ə","t","ɛ","v","ɝ"], &["w","ə","t","ɛ","v","ɝ"]),
            ("kuɹzlɑɪt", &["k","u","ɹ","z","l","ɑɪ","t"], &["k","u","ɹ","z","l","ɑɪ","t"]),
            ("dʒʌstɪs", &["dʒ","ʌ","s","t","ɪ","s"], &["dʒ","ʌ","s","t","ɪ","s"]),
            ("tʃɝtʃ", &["tʃ","ɝ","tʃ"], &["tʃ","ɝ","tʃ"]),
            ("stupɪdgeɪm", &["s","t","u","p","ɪ","d","g","eɪ","m"], &["s","t","u","p","ɪ","d","g","eɪ","m"]),
            ("wə t 9 ɛvɝ", &["w","ə","t","ɛ","v","ɝ"], &["w","ə","#","t","#","#","ɛ","v","ɝ"]),
            ("pʰɪt", &["pʰ","ɪ","t"], &["pʰ","ɪ","t"]),
            ("kʰæt̃", &["kʰ","æ","t̃"], &["kʰ","æ","t̃"]),
            ("ˈstop", &["ˈs","t","o","p"], &["ˈs","t","o","p"]),
            ("ˌɪntɝˈnæʃənl", &["ˌɪ","n","t","ɝ","ˈn","æ","ʃ","ə","n","l"], &["ˌɪ","n","t","ɝ","ˈn","æ","ʃ","ə","n","l"]),
            ("stuːpɪd", &["s","t","uː","p","ɪ","d"], &["s","t","uː","p","ɪ","d"]),
            ("aɪlʌvju", &["aɪ","l","ʌ","v","j","u"], &["aɪ","l","ʌ","v","j","u"]),
        ];

        for &(input, bare, with_bounds) in cases {
            let got_bare = tokens(input, false);
            let got_bnds = tokens(input, true);
            assert_eq!(got_bare, owned(bare), "bare tokenisation diverged for {input:?}");
            assert_eq!(got_bnds, owned(with_bounds), "boundary tokenisation diverged for {input:?}");
        }
    }

    #[test]
    fn skips_unknown_characters() {
        assert_eq!(tokens("k9æt", false), owned(&["k", "æ", "t"]));
    }

    #[test]
    fn empty_input_yields_empty_output() {
        assert!(tokens("", false).is_empty());
        assert!(tokens("", true).is_empty());
    }
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
//! Vowel distance in Bark-Euclidean space.
|
|
2
|
+
//!
|
|
3
|
+
//! F1 and F2 are stored in Hz but compared in Bark via the Zwicker &
|
|
4
|
+
//! Terhardt (1980) approximation, because pitch perception is logarithmic and a
|
|
5
|
+
//! 200 Hz shift at F1=300 is enormous while the same shift at F2=2200
|
|
6
|
+
//! is barely audible. Roundedness and rhoticity are additive penalties
|
|
7
|
+
//! on top of the formant distance.
|
|
8
|
+
|
|
9
|
+
use std::sync::LazyLock;
|
|
10
|
+
|
|
11
|
+
/// Formant and articulation data for a single vowel of the inventory.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Vowel {
    /// F1, the first formant, in Hz.
    pub f1: f64,
    /// F2, the second formant, in Hz.
    pub f2: f64,
    /// Whether the vowel is produced with rounded lips.
    pub rounded: bool,
    /// Whether the vowel is rhotic (for /ɝ/).
    pub rhotic: bool,
}
|
|
23
|
+
|
|
24
|
+
/// Tunable weight applied to the normalised formant distance. Vowels
/// occupy a perceptually narrower space than consonants, so the formant
/// share of a vowel distance stays well below 1.0.
pub const VOWEL_SCALE: f64 = 0.60;

/// Added to the distance when exactly one of the two vowels is rounded.
pub const ROUNDING_PENALTY: f64 = 0.05;

/// Added to the distance when exactly one of the two vowels is rhotic.
pub const RHOTICITY_PENALTY: f64 = 0.20;
|
|
33
|
+
|
|
34
|
+
/// Hz → Bark, using the Zwicker & Terhardt (1980) approximation
/// `13·atan(0.00076·f) + 3.5·atan((f / 7500)²)`.
///
/// Zero and negative frequencies map to 0.0.
pub fn bark(hz: f64) -> f64 {
    if hz <= 0.0 {
        return 0.0;
    }
    13.0 * (0.000_76 * hz).atan() + 3.5 * (hz / 7500.0).powi(2).atan()
}
|
|
41
|
+
|
|
42
|
+
/// The vowel symbols this module knows about, in canonical order.
/// Each entry has a matching arm in `lookup`.
pub const INVENTORY: &[&str] = &[
    "i", "y", "ɪ", "e", "ø", "ɛ", "œ",
    "a", "ɶ", "ɑ", "ɒ", "ʌ", "ə", "ɝ",
    "ɔ", "ɤ", "o", "ɯ", "æ", "u", "ʊ",
];
|
|
47
|
+
|
|
48
|
+
/// Look up the formant data for an IPA vowel symbol.
|
|
49
|
+
///
|
|
50
|
+
/// Values from the cardinal-vowel measurements on Wikipedia (Daniel
|
|
51
|
+
/// Jones tradition), with the typo on /y/'s rounding flag corrected
|
|
52
|
+
/// from the original Ruby table and /ə/ no longer duplicating /ʌ/.
|
|
53
|
+
pub fn lookup(symbol: &str) -> Option<Vowel> {
|
|
54
|
+
let v = |f1, f2, rounded, rhotic| Vowel { f1, f2, rounded, rhotic };
|
|
55
|
+
Some(match symbol {
|
|
56
|
+
"i" => v(240.0, 2400.0, false, false),
|
|
57
|
+
"y" => v(235.0, 2100.0, true, false),
|
|
58
|
+
"ɪ" => v(300.0, 2100.0, false, false),
|
|
59
|
+
"e" => v(390.0, 2300.0, false, false),
|
|
60
|
+
"ø" => v(370.0, 1900.0, true, false),
|
|
61
|
+
"ɛ" => v(610.0, 1900.0, false, false),
|
|
62
|
+
"œ" => v(585.0, 1710.0, true, false),
|
|
63
|
+
"a" => v(850.0, 1610.0, false, false),
|
|
64
|
+
"ɶ" => v(820.0, 1530.0, true, false),
|
|
65
|
+
"ɑ" => v(750.0, 940.0, false, false),
|
|
66
|
+
"ɒ" => v(700.0, 760.0, true, false),
|
|
67
|
+
"ʌ" => v(600.0, 1170.0, false, false),
|
|
68
|
+
"ə" => v(500.0, 1500.0, false, false),
|
|
69
|
+
"ɝ" => v(500.0, 1350.0, false, true),
|
|
70
|
+
"ɔ" => v(500.0, 700.0, true, false),
|
|
71
|
+
"ɤ" => v(460.0, 1310.0, false, false),
|
|
72
|
+
"o" => v(360.0, 640.0, true, false),
|
|
73
|
+
"ɯ" => v(300.0, 1390.0, false, false),
|
|
74
|
+
"æ" => v(690.0, 1660.0, false, false),
|
|
75
|
+
"u" => v(250.0, 595.0, true, false),
|
|
76
|
+
"ʊ" => v(380.0, 950.0, true, false),
|
|
77
|
+
_ => return None,
|
|
78
|
+
})
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Largest Bark-Euclidean distance achievable within the inventory.
|
|
82
|
+
/// Memoised; computed once on first access.
|
|
83
|
+
static BARK_SPAN: LazyLock<f64> = LazyLock::new(|| {
|
|
84
|
+
let coords: Vec<(f64, f64)> = INVENTORY
|
|
85
|
+
.iter()
|
|
86
|
+
.map(|s| {
|
|
87
|
+
let v = lookup(s).expect("INVENTORY entries must be in lookup()");
|
|
88
|
+
(bark(v.f1), bark(v.f2))
|
|
89
|
+
})
|
|
90
|
+
.collect();
|
|
91
|
+
let f1_min = coords.iter().map(|c| c.0).fold(f64::INFINITY, f64::min);
|
|
92
|
+
let f1_max = coords.iter().map(|c| c.0).fold(f64::NEG_INFINITY, f64::max);
|
|
93
|
+
let f2_min = coords.iter().map(|c| c.1).fold(f64::INFINITY, f64::min);
|
|
94
|
+
let f2_max = coords.iter().map(|c| c.1).fold(f64::NEG_INFINITY, f64::max);
|
|
95
|
+
((f1_max - f1_min).powi(2) + (f2_max - f2_min).powi(2)).sqrt()
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
/// Returns the cached Bark-span normaliser.
|
|
99
|
+
pub fn bark_span() -> f64 {
|
|
100
|
+
*BARK_SPAN
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/// Distance between two vowels, scaled into [0, 1].
|
|
104
|
+
///
|
|
105
|
+
/// Returns `None` if either symbol is not in the inventory.
|
|
106
|
+
pub fn distance(p1: &str, p2: &str) -> Option<f64> {
|
|
107
|
+
if p1 == p2 {
|
|
108
|
+
return Some(0.0);
|
|
109
|
+
}
|
|
110
|
+
let v1 = lookup(p1)?;
|
|
111
|
+
let v2 = lookup(p2)?;
|
|
112
|
+
let (a1, b1) = (bark(v1.f1), bark(v1.f2));
|
|
113
|
+
let (a2, b2) = (bark(v2.f1), bark(v2.f2));
|
|
114
|
+
let formant_dist = ((a1 - a2).powi(2) + (b1 - b2).powi(2)).sqrt() / bark_span();
|
|
115
|
+
let mut penalty = formant_dist * VOWEL_SCALE;
|
|
116
|
+
if v1.rounded != v2.rounded {
|
|
117
|
+
penalty += ROUNDING_PENALTY;
|
|
118
|
+
}
|
|
119
|
+
if v1.rhotic != v2.rhotic {
|
|
120
|
+
penalty += RHOTICITY_PENALTY;
|
|
121
|
+
}
|
|
122
|
+
Some(penalty.min(1.0))
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance: the Ruby reference uses f64 throughout; we should match
    /// to at least 12 decimals.
    const EPS: f64 = 1e-12;

    /// Reference values produced by the Ruby implementation.
    ///
    /// NOTE(review): that implementation lived in
    /// lib/phonetics/distances.rb, which the 4.0.0 gem no longer ships, so
    /// there is no Ruby table left to update in step. Changing a constant
    /// in this module means these expected values must be re-derived on
    /// purpose.
    #[test]
    fn matches_ruby_vowel_distances() {
        let cases: &[(&str, &str, f64)] = &[
            ("i", "y", 0.099_760_210_846_103_59),
            ("i", "ɪ", 0.060_056_384_465_816_57),
            ("i", "u", 0.565_279_341_709_588),
            ("a", "ɑ", 0.214_576_497_544_325_4),
            ("æ", "ɛ", 0.064_974_568_637_334_88),
            ("ə", "ɝ", 0.241_916_659_928_285_43),
            ("o", "ə", 0.371_374_251_614_846_3),
            ("u", "y", 0.465_646_551_803_915_9),
            ("ʊ", "u", 0.172_060_682_790_273_34),
        ];

        for (a, b, expected) in cases {
            let got = distance(a, b).expect("inventory pair");
            assert!(
                (got - expected).abs() < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn bark_span_matches_ruby() {
        assert!((bark_span() - 10.148_711_232_912_262).abs() < EPS);
    }

    #[test]
    fn bark_for_known_frequencies() {
        // /i/'s F1 = 240 Hz → 2.349… Bark
        assert!((bark(240.0) - 2.349_000_345_620_559).abs() < EPS);
        // /a/'s F1 = 850 Hz → 7.501… Bark
        assert!((bark(850.0) - 7.501_208_750_766_951).abs() < EPS);
        // Edge: 0 Hz returns 0.
        assert_eq!(bark(0.0), 0.0);
    }

    #[test]
    fn identity_is_zero() {
        for s in INVENTORY {
            assert_eq!(distance(s, s), Some(0.0));
        }
    }

    #[test]
    fn symmetric() {
        for a in INVENTORY {
            for b in INVENTORY {
                let d_ab = distance(a, b).unwrap();
                let d_ba = distance(b, a).unwrap();
                assert!((d_ab - d_ba).abs() < EPS, "asymmetric: {a}/{b}");
            }
        }
    }

    #[test]
    fn unknown_symbol_returns_none() {
        assert!(distance("Z", "i").is_none());
        assert!(distance("i", "Z").is_none());
    }
}
|
data/lib/phonetics.rb
CHANGED
|
@@ -1,4 +1,79 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
# Phonetics — IPA-based phonetic distance.
|
|
4
|
+
#
|
|
5
|
+
# The entire algorithmic core is written in Rust (see <repo>/rust/
|
|
6
|
+
# phonetics) and loaded as a native extension via Magnus. This file
|
|
7
|
+
# layers ergonomic Ruby idioms on top of the bare module functions
|
|
8
|
+
# that the extension exports.
|
|
9
|
+
#
|
|
10
|
+
# Two-tier distance API:
|
|
11
|
+
#
|
|
12
|
+
# Phonetics.distance(p1, p2) acoustic per-phoneme, 0..1
|
|
13
|
+
# Phonetics.levenshtein(s1, s2) strict edit distance
|
|
14
|
+
# Phonetics.confusion(s1, s2) listener-confusion distance
|
|
15
|
+
# Phonetics.similarity(s1, s2) normalised 0..1
|
|
16
|
+
# Phonetics.sub_cost(p1, p2) perceptual per-phoneme
|
|
17
|
+
# Phonetics.tokenize(ipa, boundaries:) phoneme stream
|
|
18
|
+
require 'delegate'
|
|
19
|
+
|
|
20
|
+
require_relative 'phonetics/phonetics_ruby'
|
|
21
|
+
require_relative 'phonetics/transcriptions'
|
|
22
|
+
|
|
23
|
+
module Phonetics
  # Tokenise an IPA string into phoneme tokens.
  #
  # The native binding exposes the tokenizer as `_tokenize(input,
  # boundaries)`. Magnus's `function!` macro doesn't bridge Ruby
  # keyword arguments through to Rust, so we wrap it in a Ruby method
  # that does accept the kwarg.
  def self.tokenize(input, boundaries: false)
    _tokenize(input, boundaries)
  end

  # ------------------------------------------------------------------
  # Phonetics::String — iterator over phonemes in an IPA string.
  # ------------------------------------------------------------------
  class String < SimpleDelegator
    # Iterates over the phoneme tokens of the wrapped string.
    #
    # With a block, yields each token in order. Without one, returns an
    # Enumerator over the tokens. The block is forwarded explicitly: a
    # bare `.each` here would return an Enumerator and silently drop
    # the caller's block.
    def each_phoneme(boundaries: false, &block)
      Phonetics.tokenize(to_s, boundaries: boundaries).each(&block)
    end
  end

  # ------------------------------------------------------------------
  # Backwards-compatible namespaced API.
  #
  # The previous Ruby+C implementation exposed these under sub-modules.
  # Keep them as thin delegators so existing callers don't break —
  # there's nothing interesting happening here, just forwarding.
  # ------------------------------------------------------------------

  module Levenshtein
    # Not read by the forwarding method below.
    # NOTE(review): presumably these mirror the costs used by the native
    # implementation; confirm against the Rust source.
    INDEL_COST = 1.0
    TRANSPOSE_COST = 0.8

    # Forwards to Phonetics.levenshtein. Returns nil when either input
    # is nil. The third positional argument is accepted and ignored.
    def self.distance(s1, s2, _verbose = false)
      return if s1.nil? || s2.nil?

      Phonetics.levenshtein(s1, s2)
    end
  end

  module Confusion
    # Not read by the forwarding methods below.
    # NOTE(review): presumably these mirror the costs used by the native
    # implementation; confirm against the Rust source.
    GAP_OPEN = 0.60
    GAP_EXTEND = 0.25
    WEAK_INDEL_COST = 0.15
    BOUNDARY_INDEL_COST = 0.02

    # Forwards to Phonetics.confusion. `verbose:` is accepted and
    # ignored.
    def self.distance(s1, s2, verbose: false)
      _ = verbose
      Phonetics.confusion(s1, s2)
    end

    # Forwards to Phonetics.similarity.
    def self.similarity(s1, s2)
      Phonetics.similarity(s1, s2)
    end

    # Forwards to Phonetics.sub_cost.
    def self.sub_cost(a, b)
      Phonetics.sub_cost(a, b)
    end
  end
end
|
data/phonetics.gemspec
CHANGED
|
@@ -1,29 +1,53 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative 'lib/phonetics/version'
|
|
4
|
+
|
|
3
5
|
Gem::Specification.new do |spec|
|
|
4
6
|
spec.name = 'phonetics'
|
|
5
|
-
spec.version =
|
|
7
|
+
spec.version = Phonetics::VERSION
|
|
6
8
|
spec.authors = ['Jack Danger']
|
|
7
9
|
spec.email = ['github@jackcanty.com']
|
|
8
10
|
|
|
9
|
-
spec.summary = '
|
|
10
|
-
spec.description =
|
|
11
|
+
spec.summary = 'IPA-based phonetic distance: strict edit distance, listener-confusion distance, and per-phoneme acoustic and perceptual scoring.'
|
|
12
|
+
spec.description = <<~DESC
|
|
13
|
+
Tools for working with the International Phonetic Alphabet. Two-tier
|
|
14
|
+
distance API — strict acoustic and listener-perception — backed by a
|
|
15
|
+
Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
|
|
16
|
+
data and English speech-perception literature.
|
|
17
|
+
DESC
|
|
11
18
|
spec.homepage = 'https://github.com/JackDanger/phonetics'
|
|
12
19
|
spec.license = 'MIT'
|
|
13
20
|
|
|
14
|
-
spec.required_ruby_version = '>=
|
|
21
|
+
spec.required_ruby_version = '>= 3.0'
|
|
22
|
+
spec.required_rubygems_version = '>= 3.3.11'
|
|
23
|
+
|
|
24
|
+
spec.metadata['homepage_uri'] = spec.homepage
|
|
25
|
+
spec.metadata['source_code_uri'] = spec.homepage
|
|
15
26
|
|
|
16
|
-
spec.extensions = ['ext/
|
|
27
|
+
spec.extensions = ['ext/phonetics_ruby/extconf.rb']
|
|
17
28
|
|
|
18
|
-
# Specify which files should be added to the gem when it is released.
|
|
19
|
-
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
20
29
|
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
|
21
|
-
`git ls-files -z`.split("\x0").reject
|
|
30
|
+
tracked = `git ls-files -z`.split("\x0").reject do |f|
|
|
31
|
+
f.match(%r{\A(test|spec|features)/}) ||
|
|
32
|
+
f.match(%r{\Aext/phonetics_ruby/(target|Cargo.lock|Makefile)})
|
|
33
|
+
end
|
|
34
|
+
# The vendored Rust core isn't tracked in git (it's a build
|
|
35
|
+
# artifact populated by `rake vendor_rust`), but it IS shipped
|
|
36
|
+
# in the .gem tarball so end users don't need the source
|
|
37
|
+
# workspace to compile the extension.
|
|
38
|
+
vendor = Dir.glob('ext/phonetics_ruby/vendor/**/*', File::FNM_DOTMATCH).reject do |p|
|
|
39
|
+
File.directory?(p) ||
|
|
40
|
+
p.include?('/target/') ||
|
|
41
|
+
p.end_with?('Cargo.lock', '/.', '/..')
|
|
42
|
+
end
|
|
43
|
+
(tracked + vendor).uniq.sort
|
|
22
44
|
end
|
|
45
|
+
|
|
23
46
|
spec.require_paths = ['lib']
|
|
24
47
|
|
|
48
|
+
spec.add_dependency 'rb_sys', '~> 0.9'
|
|
49
|
+
|
|
25
50
|
spec.add_development_dependency 'bundler'
|
|
26
|
-
spec.add_development_dependency 'pry-byebug'
|
|
27
51
|
spec.add_development_dependency 'rake'
|
|
28
52
|
spec.add_development_dependency 'rake-compiler'
|
|
29
53
|
spec.add_development_dependency 'rspec'
|
metadata
CHANGED
|
@@ -1,31 +1,31 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: phonetics
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 4.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jack Danger
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-05-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '0'
|
|
20
|
-
type: :
|
|
19
|
+
version: '0.9'
|
|
20
|
+
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - "
|
|
24
|
+
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '0'
|
|
26
|
+
version: '0.9'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
28
|
+
name: bundler
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
31
|
- - ">="
|
|
@@ -94,24 +94,27 @@ dependencies:
|
|
|
94
94
|
- - ">="
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
96
|
version: '0'
|
|
97
|
-
description:
|
|
97
|
+
description: |
|
|
98
|
+
Tools for working with the International Phonetic Alphabet. Two-tier
|
|
99
|
+
distance API — strict acoustic and listener-perception — backed by a
|
|
100
|
+
Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
|
|
101
|
+
data and English speech-perception literature.
|
|
98
102
|
email:
|
|
99
103
|
- github@jackcanty.com
|
|
100
104
|
executables: []
|
|
101
105
|
extensions:
|
|
102
|
-
- ext/
|
|
106
|
+
- ext/phonetics_ruby/extconf.rb
|
|
103
107
|
extra_rdoc_files: []
|
|
104
108
|
files:
|
|
105
|
-
- ".github/workflows/gempush.yml"
|
|
106
|
-
- ".github/workflows/test.yml"
|
|
107
109
|
- ".gitignore"
|
|
108
110
|
- ".rspec"
|
|
109
111
|
- ".rubocop.yml"
|
|
112
|
+
- CHANGELOG
|
|
110
113
|
- CODE_OF_CONDUCT.md
|
|
114
|
+
- Cargo.toml
|
|
111
115
|
- Dockerfile
|
|
112
116
|
- Gemfile
|
|
113
117
|
- LICENSE.txt
|
|
114
|
-
- Makefile
|
|
115
118
|
- README.md
|
|
116
119
|
- Rakefile
|
|
117
120
|
- VERSION
|
|
@@ -120,28 +123,36 @@ files:
|
|
|
120
123
|
- _site/vowel_chart_b_words.jpg
|
|
121
124
|
- bin/console
|
|
122
125
|
- bin/gempush-if-changed
|
|
123
|
-
-
|
|
124
|
-
- ext/
|
|
125
|
-
- ext/
|
|
126
|
-
- ext/
|
|
127
|
-
- ext/
|
|
128
|
-
- ext/
|
|
129
|
-
- ext/
|
|
130
|
-
- ext/
|
|
126
|
+
- bin/phonetics
|
|
127
|
+
- ext/phonetics_ruby/Cargo.toml
|
|
128
|
+
- ext/phonetics_ruby/build.rs
|
|
129
|
+
- ext/phonetics_ruby/extconf.rb
|
|
130
|
+
- ext/phonetics_ruby/src/lib.rs
|
|
131
|
+
- ext/phonetics_ruby/vendor/phonetics/Cargo.toml
|
|
132
|
+
- ext/phonetics_ruby/vendor/phonetics/README.md
|
|
133
|
+
- ext/phonetics_ruby/vendor/phonetics/src/compounds.rs
|
|
134
|
+
- ext/phonetics_ruby/vendor/phonetics/src/confusion.rs
|
|
135
|
+
- ext/phonetics_ruby/vendor/phonetics/src/consonants.rs
|
|
136
|
+
- ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs
|
|
137
|
+
- ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs
|
|
138
|
+
- ext/phonetics_ruby/vendor/phonetics/src/distance.rs
|
|
139
|
+
- ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs
|
|
140
|
+
- ext/phonetics_ruby/vendor/phonetics/src/lib.rs
|
|
141
|
+
- ext/phonetics_ruby/vendor/phonetics/src/symbols.rs
|
|
142
|
+
- ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs
|
|
143
|
+
- ext/phonetics_ruby/vendor/phonetics/src/vowels.rs
|
|
131
144
|
- lib/common_ipa_transcriptions.json
|
|
132
145
|
- lib/phonetics.rb
|
|
133
|
-
- lib/phonetics/code_generator.rb
|
|
134
|
-
- lib/phonetics/distances.rb
|
|
135
|
-
- lib/phonetics/levenshtein.rb
|
|
136
|
-
- lib/phonetics/ruby_levenshtein.rb
|
|
137
146
|
- lib/phonetics/transcriptions.rb
|
|
138
147
|
- lib/phonetics/version.rb
|
|
139
148
|
- phonetics.gemspec
|
|
140
149
|
homepage: https://github.com/JackDanger/phonetics
|
|
141
150
|
licenses:
|
|
142
151
|
- MIT
|
|
143
|
-
metadata:
|
|
144
|
-
|
|
152
|
+
metadata:
|
|
153
|
+
homepage_uri: https://github.com/JackDanger/phonetics
|
|
154
|
+
source_code_uri: https://github.com/JackDanger/phonetics
|
|
155
|
+
post_install_message:
|
|
145
156
|
rdoc_options: []
|
|
146
157
|
require_paths:
|
|
147
158
|
- lib
|
|
@@ -149,15 +160,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
149
160
|
requirements:
|
|
150
161
|
- - ">="
|
|
151
162
|
- !ruby/object:Gem::Version
|
|
152
|
-
version: '
|
|
163
|
+
version: '3.0'
|
|
153
164
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
165
|
requirements:
|
|
155
166
|
- - ">="
|
|
156
167
|
- !ruby/object:Gem::Version
|
|
157
|
-
version:
|
|
168
|
+
version: 3.3.11
|
|
158
169
|
requirements: []
|
|
159
|
-
rubygems_version: 3.
|
|
160
|
-
signing_key:
|
|
170
|
+
rubygems_version: 3.5.22
|
|
171
|
+
signing_key:
|
|
161
172
|
specification_version: 4
|
|
162
|
-
summary:
|
|
173
|
+
summary: 'IPA-based phonetic distance: strict edit distance, listener-confusion distance,
|
|
174
|
+
and per-phoneme acoustic and perceptual scoring.'
|
|
163
175
|
test_files: []
|