phonetics 3.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -2
- data/Cargo.toml +27 -0
- data/Rakefile +58 -26
- data/VERSION +1 -1
- data/bin/phonetics +89 -0
- data/ext/phonetics_ruby/Cargo.toml +36 -0
- data/ext/phonetics_ruby/build.rs +24 -0
- data/ext/phonetics_ruby/extconf.rb +17 -0
- data/ext/phonetics_ruby/src/lib.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
- data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
- data/lib/phonetics.rb +77 -2
- data/phonetics.gemspec +33 -9
- metadata +45 -34
- data/.github/workflows/gempush.yml +0 -28
- data/.github/workflows/test.yml +0 -20
- data/Makefile +0 -9
- data/ext/c_levenshtein/extconf.rb +0 -10
- data/ext/c_levenshtein/levenshtein.c +0 -223
- data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
- data/ext/c_levenshtein/next_phoneme_length.h +0 -1
- data/ext/c_levenshtein/phonemes.c +0 -53
- data/ext/c_levenshtein/phonemes.h +0 -3
- data/ext/c_levenshtein/phonetic_cost.c +0 -88593
- data/ext/c_levenshtein/phonetic_cost.h +0 -1
- data/lib/phonetics/code_generator.rb +0 -228
- data/lib/phonetics/distances.rb +0 -249
- data/lib/phonetics/levenshtein.rb +0 -27
- data/lib/phonetics/ruby_levenshtein.rb +0 -162
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
//! Cross-class bridge: consonant ↔ vowel distance.
|
|
2
|
+
//!
|
|
3
|
+
//! English perception treats /j/, /w/, /ɹ/, /ɰ/ as non-syllabic versions
|
|
4
|
+
//! of /i/, /u/, /ɝ/, /ɯ/ respectively — Mad Gab's "yes" ≈ "Es" depends
|
|
5
|
+
//! on this bridge. Glottals are mostly an air pulse, also nearer to
|
|
6
|
+
//! vowels than to a true stop or fricative. Everything else uses
|
|
7
|
+
//! [`CROSS_CLASS_DEFAULT`](crate::symbols::CROSS_CLASS_DEFAULT).
|
|
8
|
+
|
|
9
|
+
use crate::symbols::{CROSS_CLASS_DEFAULT, CROSS_CLASS_NEAR_BRIDGE};
|
|
10
|
+
|
|
11
|
+
/// Returns the bridge cost when `consonant` is an approximant with
|
|
12
|
+
/// listed vowel-distance entries. Vowels not specifically listed under
|
|
13
|
+
/// a bridge consonant get `CROSS_CLASS_NEAR_BRIDGE`. Non-bridge
|
|
14
|
+
/// consonants return `None`.
|
|
15
|
+
fn approximant_bridge(consonant: &str, vowel: &str) -> Option<f64> {
|
|
16
|
+
let cost = match (consonant, vowel) {
|
|
17
|
+
("j", "i") => 0.10,
|
|
18
|
+
("j", "ɪ") => 0.14,
|
|
19
|
+
("j", "y") => 0.18,
|
|
20
|
+
("j", "e") => 0.22,
|
|
21
|
+
("w", "u") => 0.10,
|
|
22
|
+
("w", "ʊ") => 0.14,
|
|
23
|
+
("w", "o") => 0.22,
|
|
24
|
+
("w", "ɔ") => 0.30,
|
|
25
|
+
("w", "ɯ") => 0.20,
|
|
26
|
+
("ɹ", "ɝ") => 0.08,
|
|
27
|
+
("ɹ", "ə") => 0.25,
|
|
28
|
+
("ɰ", "ɯ") => 0.10,
|
|
29
|
+
("ɰ", "u") => 0.20,
|
|
30
|
+
// Bridge consonants without a specific vowel entry.
|
|
31
|
+
("j" | "w" | "ɹ" | "ɰ", _) => CROSS_CLASS_NEAR_BRIDGE,
|
|
32
|
+
_ => return None,
|
|
33
|
+
};
|
|
34
|
+
Some(cost)
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Cost of pairing a glottal consonant with any vowel.
///
/// Returns `Some(0.50)` for `h` and `ɦ`, `Some(0.55)` for `ʔ`, and `None`
/// for every other consonant. The vowel is not consulted: the cost is
/// the same whichever vowel sits on the other side.
fn glottal_bridge(consonant: &str) -> Option<f64> {
    match consonant {
        "h" | "ɦ" => Some(0.50),
        "ʔ" => Some(0.55),
        _ => None,
    }
}
|
|
45
|
+
|
|
46
|
+
/// Look up the cross-class distance for a (consonant, vowel) pair.
|
|
47
|
+
/// Falls back to `CROSS_CLASS_DEFAULT` when neither bridge fires.
|
|
48
|
+
pub fn distance(consonant: &str, vowel: &str) -> f64 {
|
|
49
|
+
if let Some(c) = approximant_bridge(consonant, vowel) {
|
|
50
|
+
return c;
|
|
51
|
+
}
|
|
52
|
+
if let Some(c) = glottal_bridge(consonant) {
|
|
53
|
+
return c;
|
|
54
|
+
}
|
|
55
|
+
CROSS_CLASS_DEFAULT
|
|
56
|
+
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
//! Suprasegmental and modifier diacritics.
|
|
2
|
+
//!
|
|
3
|
+
//! Each diacritic character either attaches to the preceding base
|
|
4
|
+
//! phoneme (length, aspiration, palatalization, etc.) or to the
|
|
5
|
+
//! following base phoneme (stress marks, which in IPA precede the
|
|
6
|
+
//! stressed syllable). The tokenizer absorbs them into the same token;
|
|
7
|
+
//! the distance metric splits them back out via [`decompose`] and
|
|
8
|
+
//! charges a small additive cost when modifier sets differ.
|
|
9
|
+
|
|
10
|
+
/// The suprasegmental / modifier marks the metric knows how to price.
///
/// Each variant corresponds to exactly one Unicode scalar, as mapped by
/// `Diacritic::from_char`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Diacritic {
    /// Full length mark `ː`.
    Long,
    /// Half-length mark `ˑ`.
    HalfLong,
    /// Aspiration mark `ʰ`.
    Aspirated,
    /// Palatalization mark `ʲ`.
    Palatalized,
    /// Pharyngealization mark `ˤ`.
    Pharyngealized,
    /// Velarization mark `ˠ`.
    Velarized,
    /// Nasalization — the combining tilde above, U+0303.
    Nasalized,
    /// Primary stress mark `ˈ`.
    PrimaryStress,
    /// Secondary stress mark `ˌ`.
    SecondaryStress,
}
|
|
32
|
+
|
|
33
|
+
impl Diacritic {
|
|
34
|
+
/// Additive cost when this modifier is present on one phoneme but
|
|
35
|
+
/// not the other.
|
|
36
|
+
pub fn penalty(self) -> f64 {
|
|
37
|
+
match self {
|
|
38
|
+
Self::Long => 0.05,
|
|
39
|
+
Self::HalfLong => 0.025,
|
|
40
|
+
Self::Aspirated => 0.04,
|
|
41
|
+
Self::Palatalized => 0.06,
|
|
42
|
+
Self::Pharyngealized => 0.07,
|
|
43
|
+
Self::Velarized => 0.07,
|
|
44
|
+
Self::Nasalized => 0.06,
|
|
45
|
+
Self::PrimaryStress => 0.05,
|
|
46
|
+
Self::SecondaryStress => 0.03,
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Classify a single Unicode character as a diacritic. The combining
|
|
51
|
+
/// tilde is a multi-byte sequence in UTF-8 but one Unicode scalar.
|
|
52
|
+
pub fn from_char(c: char) -> Option<Self> {
|
|
53
|
+
Some(match c {
|
|
54
|
+
'ː' => Self::Long,
|
|
55
|
+
'ˑ' => Self::HalfLong,
|
|
56
|
+
'ʰ' => Self::Aspirated,
|
|
57
|
+
'ʲ' => Self::Palatalized,
|
|
58
|
+
'ˤ' => Self::Pharyngealized,
|
|
59
|
+
'ˠ' => Self::Velarized,
|
|
60
|
+
'\u{0303}' => Self::Nasalized,
|
|
61
|
+
'ˈ' => Self::PrimaryStress,
|
|
62
|
+
'ˌ' => Self::SecondaryStress,
|
|
63
|
+
_ => return None,
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/// True if this diacritic attaches to the *following* segment
|
|
68
|
+
/// (stress marks); false for the preceding-attached ones.
|
|
69
|
+
pub fn is_leading(self) -> bool {
|
|
70
|
+
matches!(self, Self::PrimaryStress | Self::SecondaryStress)
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/// Fallback mismatch cost intended for a diacritic-like character that
/// lands in a modifier set without having a dedicated
/// `Diacritic::penalty` entry.
pub const DEFAULT_PENALTY: f64 = 0.03;
|
|
77
|
+
|
|
78
|
+
/// Split a phoneme token into its base symbol and the set of diacritic
|
|
79
|
+
/// kinds it carries. Unrecognised characters stay in the base so a
|
|
80
|
+
/// misspelt input doesn't silently lose information.
|
|
81
|
+
pub fn decompose(token: &str) -> (String, Vec<Diacritic>) {
|
|
82
|
+
let mut base = String::new();
|
|
83
|
+
let mut mods: Vec<Diacritic> = Vec::new();
|
|
84
|
+
for c in token.chars() {
|
|
85
|
+
if let Some(d) = Diacritic::from_char(c) {
|
|
86
|
+
if !mods.contains(&d) {
|
|
87
|
+
mods.push(d);
|
|
88
|
+
}
|
|
89
|
+
} else {
|
|
90
|
+
base.push(c);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
(base, mods)
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/// Symmetric-difference cost between two diacritic sets.
|
|
97
|
+
pub fn distance(mods1: &[Diacritic], mods2: &[Diacritic]) -> f64 {
|
|
98
|
+
if mods1.is_empty() && mods2.is_empty() {
|
|
99
|
+
return 0.0;
|
|
100
|
+
}
|
|
101
|
+
let mut total = 0.0;
|
|
102
|
+
for d in mods1 {
|
|
103
|
+
if !mods2.contains(d) {
|
|
104
|
+
total += d.penalty();
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
for d in mods2 {
|
|
108
|
+
if !mods1.contains(d) {
|
|
109
|
+
total += d.penalty();
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
total
|
|
113
|
+
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
//! Top-level distance dispatch.
|
|
2
|
+
//!
|
|
3
|
+
//! `distance(p1, p2)` returns the acoustic distance between two phoneme
|
|
4
|
+
//! tokens, scaled to [0, 1]. The dispatch order matters:
|
|
5
|
+
//!
|
|
6
|
+
//! 1. Identity → 0.
|
|
7
|
+
//! 2. Boundary token (`#`) → 0 vs boundary, else `BOUNDARY_VS_PHONEME`.
|
|
8
|
+
//! 3. Diacritic decomposition: split the token into base + modifier
|
|
9
|
+
//! set; recurse on the base; add a per-modifier mismatch cost.
|
|
10
|
+
//! 4. Compound expansion: if either side is a diphthong or affricate,
|
|
11
|
+
//! compare component-by-component (padded by repeating the last
|
|
12
|
+
//! segment), averaging.
|
|
13
|
+
//! 5. Class-based dispatch: vowel-vowel, consonant-consonant, or
|
|
14
|
+
//! cross-class bridge.
|
|
15
|
+
|
|
16
|
+
use crate::{compounds, consonants, cross_class, diacritics, symbols, vowels};
|
|
17
|
+
|
|
18
|
+
/// Distance between two phoneme tokens, scaled to [0, 1].
|
|
19
|
+
pub fn distance(p1: &str, p2: &str) -> f64 {
|
|
20
|
+
if p1 == p2 {
|
|
21
|
+
return 0.0;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
if p1 == symbols::BOUNDARY_TOKEN || p2 == symbols::BOUNDARY_TOKEN {
|
|
25
|
+
return symbols::BOUNDARY_VS_PHONEME;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
let (base1, mods1) = diacritics::decompose(p1);
|
|
29
|
+
let (base2, mods2) = diacritics::decompose(p2);
|
|
30
|
+
|
|
31
|
+
// If either side carried any diacritics, strip them and recompute
|
|
32
|
+
// on the bases, then add a per-modifier mismatch cost.
|
|
33
|
+
if !mods1.is_empty() || !mods2.is_empty() {
|
|
34
|
+
let base_dist = base_pair_distance(&base1, &base2);
|
|
35
|
+
return (base_dist + diacritics::distance(&mods1, &mods2)).min(1.0);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
base_pair_distance(p1, p2)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Distance between two bare base phonemes (no diacritics on either
|
|
42
|
+
/// side). Handles compounds, class dispatch, and cross-class bridges.
|
|
43
|
+
fn base_pair_distance(p1: &str, p2: &str) -> f64 {
|
|
44
|
+
if p1 == p2 {
|
|
45
|
+
return 0.0;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
let comp1 = compounds::components(p1);
|
|
49
|
+
let comp2 = compounds::components(p2);
|
|
50
|
+
if comp1.is_some() || comp2.is_some() {
|
|
51
|
+
let a: Vec<&str> = comp1.map_or_else(|| vec![p1], <[&str]>::to_vec);
|
|
52
|
+
let b: Vec<&str> = comp2.map_or_else(|| vec![p2], <[&str]>::to_vec);
|
|
53
|
+
return compound_distance(&a, &b);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
let is_vowel_1 = vowels::lookup(p1).is_some();
|
|
57
|
+
let is_vowel_2 = vowels::lookup(p2).is_some();
|
|
58
|
+
let is_cons_1 = consonants::lookup(p1).is_some();
|
|
59
|
+
let is_cons_2 = consonants::lookup(p2).is_some();
|
|
60
|
+
|
|
61
|
+
if is_vowel_1 && is_vowel_2 {
|
|
62
|
+
return vowels::distance(p1, p2).unwrap_or(1.0);
|
|
63
|
+
}
|
|
64
|
+
if is_cons_1 && is_cons_2 {
|
|
65
|
+
return consonants::distance(p1, p2).unwrap_or(1.0);
|
|
66
|
+
}
|
|
67
|
+
if is_cons_1 && is_vowel_2 {
|
|
68
|
+
return cross_class::distance(p1, p2);
|
|
69
|
+
}
|
|
70
|
+
if is_vowel_1 && is_cons_2 {
|
|
71
|
+
return cross_class::distance(p2, p1);
|
|
72
|
+
}
|
|
73
|
+
1.0
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Pairwise component-mean distance for compound (or
|
|
77
|
+
/// compound-and-simple) phonemes. The shorter side is padded by
|
|
78
|
+
/// repeating its last segment so /aɪ/ vs /a/ charges half a phoneme
|
|
79
|
+
/// distance rather than nothing.
|
|
80
|
+
fn compound_distance(c1: &[&str], c2: &[&str]) -> f64 {
|
|
81
|
+
let n = c1.len().max(c2.len());
|
|
82
|
+
|
|
83
|
+
// Pad the shorter side by repeating its last entry.
|
|
84
|
+
fn pad<'a>(v: &[&'a str], n: usize) -> Vec<&'a str> {
|
|
85
|
+
let mut out: Vec<&'a str> = v.to_vec();
|
|
86
|
+
if let Some(&last) = out.last() {
|
|
87
|
+
while out.len() < n {
|
|
88
|
+
out.push(last);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
out
|
|
92
|
+
}
|
|
93
|
+
let a = pad(c1, n);
|
|
94
|
+
let b = pad(c2, n);
|
|
95
|
+
|
|
96
|
+
let total: f64 = a
|
|
97
|
+
.iter()
|
|
98
|
+
.zip(b.iter())
|
|
99
|
+
.map(|(x, y)| if x == y { 0.0 } else { distance(x, y) })
|
|
100
|
+
.sum();
|
|
101
|
+
total / n as f64
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
#[cfg(test)]
mod tests {
    use super::distance;

    /// Tolerance for comparisons against reference values.
    const EPS: f64 = 1e-12;

    /// `(left, right, expected)` triples. The expected values were
    /// produced by the Ruby implementation and cover every dispatch
    /// branch: cross-class bridges, glottals, the default cross-class
    /// cost, compound diphthongs and affricates, compound-vs-simple
    /// averaging, diacritics, and the boundary token.
    const RUBY_CASES: &[(&str, &str, f64)] = &[
        // Approximant↔vowel bridge
        ("j", "i", 0.10),
        ("j", "ɪ", 0.14),
        ("w", "u", 0.10),
        ("w", "o", 0.22),
        ("ɹ", "ɝ", 0.08),
        ("ɰ", "ɯ", 0.10),
        // Glottal bridge
        ("h", "a", 0.50),
        ("ʔ", "i", 0.55),
        ("ɦ", "ɛ", 0.50),
        // Default cross-class
        ("k", "i", 0.85),
        ("s", "u", 0.85),
        ("m", "a", 0.85),
        // Diphthongs
        ("aɪ", "ɑɪ", 0.107_288_248_772_162_7),
        ("aɪ", "eɪ", 0.130_229_442_812_957_87),
        ("aɪ", "a", 0.144_904_524_019_594_53),
        ("oʊ", "o", 0.067_465_504_570_433_46),
        // Affricates
        ("tʃ", "ʃ", 0.124_358_541_225_631_43),
        ("tʃ", "dʒ", 0.15),
        ("tʃ", "t", 0.124_358_541_225_631_43),
        ("dʒ", "ʒ", 0.124_358_541_225_631_43),
        // Compound vs unrelated → averages to the default
        ("aɪ", "k", 0.85),
        // Diacritics
        ("pʰ", "p", 0.04),
        ("uː", "u", 0.05),
        ("ˈp", "p", 0.05),
        ("pʰ", "bʰ", 0.15),
        // Boundary token
        ("#", "a", 0.95),
        ("#", "#", 0.0),
    ];

    #[test]
    fn matches_ruby_dispatch_cases() {
        for &(a, b, expected) in RUBY_CASES {
            let got = distance(a, b);
            let error = (got - expected).abs();
            assert!(
                error < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn identity_is_zero_across_classes() {
        let samples = ["i", "p", "aɪ", "tʃ", "pʰ", "#"];
        samples
            .iter()
            .for_each(|s| assert_eq!(distance(s, s), 0.0));
    }

    #[test]
    fn diphthong_against_its_nucleus_is_half_a_phoneme() {
        // The first component of /aɪ/ matches /a/ exactly, so only the
        // second component contributes, and the mean over two positions
        // halves it.
        let half_direct = crate::vowels::distance("ɪ", "a").unwrap() / 2.0;
        let compound = distance("aɪ", "a");
        assert!(
            (compound - half_direct).abs() < EPS,
            "compound={compound}, direct/2={}",
            half_direct
        );
    }
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
//! Strict phonetic edit distance.
|
|
2
|
+
//!
|
|
3
|
+
//! Damerau-Levenshtein DP with INDEL_COST = 1.0 (one inserted or
|
|
4
|
+
//! deleted phoneme costs exactly one indel, regardless of its
|
|
5
|
+
//! neighbours — the operation costs what the operation costs, not what
|
|
6
|
+
//! happens to sit next to it). Adjacent transpositions cost
|
|
7
|
+
//! TRANSPOSE_COST < 2 * INDEL_COST because in casual speech swapping
|
|
8
|
+
//! adjacent phonemes is a real and cheap thing speakers do.
|
|
9
|
+
//!
|
|
10
|
+
//! Substitution cost is the per-phoneme acoustic distance returned by
|
|
11
|
+
//! [`crate::distance`], so improvements to the acoustic model land here
|
|
12
|
+
//! automatically.
|
|
13
|
+
|
|
14
|
+
use crate::tokenizer;
|
|
15
|
+
|
|
16
|
+
/// Price of one insertion or one deletion of a phoneme in the edit
/// distance.
pub const INDEL_COST: f64 = 1.0;

/// Price of swapping two adjacent phonemes (the Damerau extension).
/// Kept below `2 * INDEL_COST` so a swap is cheaper than a delete plus
/// an insert.
pub const TRANSPOSE_COST: f64 = 0.8;
|
|
21
|
+
|
|
22
|
+
/// Edit distance between two IPA strings under the strict acoustic
|
|
23
|
+
/// metric. Non-IPA characters are tokenised out before the DP runs.
|
|
24
|
+
pub fn distance(a: &str, b: &str) -> f64 {
|
|
25
|
+
let ta = tokenizer::tokens(a, false);
|
|
26
|
+
let tb = tokenizer::tokens(b, false);
|
|
27
|
+
distance_from_tokens(&ta, &tb)
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Edit distance over pre-tokenised phoneme sequences.
|
|
31
|
+
pub fn distance_from_tokens<S: AsRef<str>>(a: &[S], b: &[S]) -> f64 {
|
|
32
|
+
let m = a.len();
|
|
33
|
+
let n = b.len();
|
|
34
|
+
if m == 0 && n == 0 {
|
|
35
|
+
return 0.0;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// d[i][j] flattened; width = n + 1.
|
|
39
|
+
let width = n + 1;
|
|
40
|
+
let mut d = vec![0.0_f64; (m + 1) * width];
|
|
41
|
+
|
|
42
|
+
// Seed: matching the empty string against the first i phonemes of a
|
|
43
|
+
// costs i indels; symmetric for b.
|
|
44
|
+
for i in 0..=m {
|
|
45
|
+
d[i * width] = i as f64 * INDEL_COST;
|
|
46
|
+
}
|
|
47
|
+
#[allow(clippy::needless_range_loop)]
|
|
48
|
+
for j in 0..=n {
|
|
49
|
+
d[j] = j as f64 * INDEL_COST;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
for i in 1..=m {
|
|
53
|
+
for j in 1..=n {
|
|
54
|
+
let ai = a[i - 1].as_ref();
|
|
55
|
+
let bj = b[j - 1].as_ref();
|
|
56
|
+
let sub_cost = crate::distance(ai, bj);
|
|
57
|
+
|
|
58
|
+
let delete = d[(i - 1) * width + j] + INDEL_COST;
|
|
59
|
+
let insert = d[i * width + (j - 1)] + INDEL_COST;
|
|
60
|
+
let substitute = d[(i - 1) * width + (j - 1)] + sub_cost;
|
|
61
|
+
|
|
62
|
+
let mut best = delete.min(insert).min(substitute);
|
|
63
|
+
|
|
64
|
+
// Damerau adjacent-transposition.
|
|
65
|
+
if i > 1
|
|
66
|
+
&& j > 1
|
|
67
|
+
&& a[i - 1].as_ref() == b[j - 2].as_ref()
|
|
68
|
+
&& a[i - 2].as_ref() == b[j - 1].as_ref()
|
|
69
|
+
{
|
|
70
|
+
let transpose = d[(i - 2) * width + (j - 2)] + TRANSPOSE_COST;
|
|
71
|
+
if transpose < best {
|
|
72
|
+
best = transpose;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
d[i * width + j] = best;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
d[m * width + n]
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
#[cfg(test)]
mod tests {
    use super::distance;

    /// Tolerance for comparisons against reference values.
    const EPS: f64 = 1e-12;

    #[test]
    fn matches_ruby_reference_distances() {
        // Reference values produced by Ruby's Phonetics::RubyLevenshtein.
        let cases: &[(&str, &str, f64)] = &[
            ("kæt", "kæt", 0.0),
            ("dɪsug", "ɪsug", 1.0),
            ("izok", "ɪsug", 0.425_001_067_076_172_47),
            ("kæt", "", 3.0),
            ("kæt", "kæɪt", 1.0),
            ("kæt", "kʌt", 0.145_085_455_502_268_37),
            (
                "ɪtsdʒʌstəstupɪdgeɪm",
                "hɪtsdʒʌstɪsduphɪdkeɪm",
                2.469_519_814_165_789_5,
            ),
            ("mɔop", "sinkœ", 3.025_788_981_175_774),
            ("bæd", "ben", 0.510_984_626_268_258_8),
        ];

        for (a, b, expected) in cases {
            let got = distance(a, b);
            assert!(
                (got - expected).abs() < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn empty_pair_is_zero() {
        assert_eq!(distance("", ""), 0.0);
    }

    // Renamed from `one_indel_costs_INDEL_COST`: the mixed-case name
    // tripped rustc's `non_snake_case` lint.
    #[test]
    fn one_indel_costs_indel_cost() {
        // Inserting or deleting one phoneme costs exactly the indel.
        assert!((distance("kæt", "kæte") - super::INDEL_COST).abs() < 1e-6);
    }

    #[test]
    fn identity_strings_are_zero() {
        for s in ["", "kæt", "stupɪdgeɪm", "ɪtsdʒʌstəstupɪdgeɪm"] {
            assert_eq!(distance(s, s), 0.0);
        }
    }

    #[test]
    fn symmetric() {
        // Swapping the arguments must not change the result.
        let pairs = [
            ("kæt", "kʌt"),
            ("dɪsug", "ɪsug"),
            ("stupɪdgeɪm", "stupɪdli"),
        ];
        for (a, b) in pairs {
            let d_ab = distance(a, b);
            let d_ba = distance(b, a);
            assert!((d_ab - d_ba).abs() < EPS, "asymmetric: {a}/{b}");
        }
    }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
//! Phonetics: IPA-based phonetic distance.
|
|
2
|
+
//!
|
|
3
|
+
//! Two-tier API, same as the Ruby reference implementation:
|
|
4
|
+
//!
|
|
5
|
+
//! * [`distance`] — strict per-phoneme acoustic distance, fed to
|
|
6
|
+
//! [`levenshtein`] for whole-string edit distance. The right call for
|
|
7
|
+
//! accent clustering, dialect work, and ASR error analysis.
|
|
8
|
+
//!
|
|
9
|
+
//! * [`Confusion::distance`](confusion::Confusion::distance) — listener-
|
|
10
|
+
//! confusion distance, calibrated against Mad Gab puzzle data. Uses
|
|
11
|
+
//! Gotoh's affine-gap DP plus a weak-phoneme indel discount and an
|
|
12
|
+
//! empirical-confusion overlay. The right call for Mad Gab solving,
|
|
13
|
+
//! pun detection, and mishearing modelling.
|
|
14
|
+
//!
|
|
15
|
+
//! Both tiers share the same per-phoneme cost basis. Improvements to the
|
|
16
|
+
//! acoustic model propagate to both metrics automatically.
|
|
17
|
+
//!
|
|
18
|
+
//! ```
|
|
19
|
+
//! use phonetics::distance;
|
|
20
|
+
//! // Tense /i/ versus lax /ɪ/ — close in Bark space.
|
|
21
|
+
//! assert!((distance("i", "ɪ") - 0.060_056).abs() < 1e-3);
|
|
22
|
+
//! // The same vowel twice is exactly zero.
|
|
23
|
+
//! assert_eq!(distance("ə", "ə"), 0.0);
|
|
24
|
+
//! ```
|
|
25
|
+
|
|
26
|
+
#![doc(html_root_url = "https://docs.rs/phonetics/0.1.0")]
|
|
27
|
+
|
|
28
|
+
pub mod compounds;
|
|
29
|
+
pub mod confusion;
|
|
30
|
+
pub mod consonants;
|
|
31
|
+
pub mod cross_class;
|
|
32
|
+
pub mod diacritics;
|
|
33
|
+
pub mod levenshtein;
|
|
34
|
+
pub mod symbols;
|
|
35
|
+
pub mod tokenizer;
|
|
36
|
+
pub mod vowels;
|
|
37
|
+
|
|
38
|
+
mod distance;
|
|
39
|
+
|
|
40
|
+
pub use confusion::distance as confusion;
|
|
41
|
+
pub use confusion::similarity;
|
|
42
|
+
pub use distance::distance;
|
|
43
|
+
pub use levenshtein::distance as levenshtein;
|
|
44
|
+
pub use tokenizer::tokens;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
//! Shared phoneme symbol metadata.
|
|
2
|
+
//!
|
|
3
|
+
//! These constants are used by the distance dispatch and the tokenizer.
|
|
4
|
+
|
|
5
|
+
/// Pseudo-phoneme standing for a word boundary. The Confusion metric
/// uses it to model re-syllabification cheaply.
pub const BOUNDARY_TOKEN: &str = "#";

/// Substitution cost between the boundary token and any real phoneme.
/// Deliberately high so the Confusion algorithm prefers to insert or
/// delete the boundary (through its cheap boundary-indel tier) rather
/// than substitute it.
pub const BOUNDARY_VS_PHONEME: f64 = 0.95;

/// Consonant↔vowel distance used when no bridge applies. Intentionally
/// below 1.0 (the indel cost): a consonant against a vowel is treated as
/// a strong substitution, not a categorical break.
pub const CROSS_CLASS_DEFAULT: f64 = 0.85;

/// Consonant↔vowel distance used when the consonant belongs to the
/// approximant bridge but the particular vowel has no listed entry.
pub const CROSS_CLASS_NEAR_BRIDGE: f64 = 0.55;
|