phonetics 3.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -2
  3. data/Cargo.toml +27 -0
  4. data/Rakefile +58 -26
  5. data/VERSION +1 -1
  6. data/bin/phonetics +89 -0
  7. data/ext/phonetics_ruby/Cargo.toml +36 -0
  8. data/ext/phonetics_ruby/build.rs +24 -0
  9. data/ext/phonetics_ruby/extconf.rb +17 -0
  10. data/ext/phonetics_ruby/src/lib.rs +56 -0
  11. data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
  12. data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
  13. data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
  14. data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
  15. data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
  16. data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
  17. data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
  18. data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
  19. data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
  20. data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
  21. data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
  22. data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
  23. data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
  24. data/lib/phonetics.rb +77 -2
  25. data/phonetics.gemspec +33 -9
  26. metadata +45 -34
  27. data/.github/workflows/gempush.yml +0 -28
  28. data/.github/workflows/test.yml +0 -20
  29. data/Makefile +0 -9
  30. data/ext/c_levenshtein/extconf.rb +0 -10
  31. data/ext/c_levenshtein/levenshtein.c +0 -223
  32. data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
  33. data/ext/c_levenshtein/next_phoneme_length.h +0 -1
  34. data/ext/c_levenshtein/phonemes.c +0 -53
  35. data/ext/c_levenshtein/phonemes.h +0 -3
  36. data/ext/c_levenshtein/phonetic_cost.c +0 -88593
  37. data/ext/c_levenshtein/phonetic_cost.h +0 -1
  38. data/lib/phonetics/code_generator.rb +0 -228
  39. data/lib/phonetics/distances.rb +0 -249
  40. data/lib/phonetics/levenshtein.rb +0 -27
  41. data/lib/phonetics/ruby_levenshtein.rb +0 -162
@@ -0,0 +1,56 @@
1
+ //! Cross-class bridge: consonant ↔ vowel distance.
2
+ //!
3
+ //! English perception treats /j/, /w/, /ɹ/, /ɰ/ as non-syllabic versions
4
+ //! of /i/, /u/, /ɝ/, /ɯ/ respectively — Mad Gab's "yes" ≈ "Es" depends
5
+ //! on this bridge. Glottals are mostly an air pulse, also nearer to
6
+ //! vowels than to a true stop or fricative. Everything else uses
7
+ //! [`CROSS_CLASS_DEFAULT`](crate::symbols::CROSS_CLASS_DEFAULT).
8
+
9
+ use crate::symbols::{CROSS_CLASS_DEFAULT, CROSS_CLASS_NEAR_BRIDGE};
10
+
11
+ /// Returns the bridge cost when `consonant` is an approximant with
12
+ /// listed vowel-distance entries. Vowels not specifically listed under
13
+ /// a bridge consonant get `CROSS_CLASS_NEAR_BRIDGE`. Non-bridge
14
+ /// consonants return `None`.
15
+ fn approximant_bridge(consonant: &str, vowel: &str) -> Option<f64> {
16
+ let cost = match (consonant, vowel) {
17
+ ("j", "i") => 0.10,
18
+ ("j", "ɪ") => 0.14,
19
+ ("j", "y") => 0.18,
20
+ ("j", "e") => 0.22,
21
+ ("w", "u") => 0.10,
22
+ ("w", "ʊ") => 0.14,
23
+ ("w", "o") => 0.22,
24
+ ("w", "ɔ") => 0.30,
25
+ ("w", "ɯ") => 0.20,
26
+ ("ɹ", "ɝ") => 0.08,
27
+ ("ɹ", "ə") => 0.25,
28
+ ("ɰ", "ɯ") => 0.10,
29
+ ("ɰ", "u") => 0.20,
30
+ // Bridge consonants without a specific vowel entry.
31
+ ("j" | "w" | "ɹ" | "ɰ", _) => CROSS_CLASS_NEAR_BRIDGE,
32
+ _ => return None,
33
+ };
34
+ Some(cost)
35
+ }
36
+
37
/// Vowel-bridge cost for glottal consonants.
///
/// `h` and `ɦ` cost 0.50 and `ʔ` costs 0.55, independent of the vowel;
/// any other consonant returns `None`.
fn glottal_bridge(consonant: &str) -> Option<f64> {
    match consonant {
        "ʔ" => Some(0.55),
        "h" | "ɦ" => Some(0.50),
        _ => None,
    }
}
45
+
46
+ /// Look up the cross-class distance for a (consonant, vowel) pair.
47
+ /// Falls back to `CROSS_CLASS_DEFAULT` when neither bridge fires.
48
+ pub fn distance(consonant: &str, vowel: &str) -> f64 {
49
+ if let Some(c) = approximant_bridge(consonant, vowel) {
50
+ return c;
51
+ }
52
+ if let Some(c) = glottal_bridge(consonant) {
53
+ return c;
54
+ }
55
+ CROSS_CLASS_DEFAULT
56
+ }
@@ -0,0 +1,113 @@
1
+ //! Suprasegmental and modifier diacritics.
2
+ //!
3
+ //! Each diacritic character either attaches to the preceding base
4
+ //! phoneme (length, aspiration, palatalization, etc.) or to the
5
+ //! following base phoneme (stress marks, which in IPA precede the
6
+ //! stressed syllable). The tokenizer absorbs them into the same token;
7
+ //! the distance metric splits them back out via [`decompose`] and
8
+ //! charges a small additive cost when modifier sets differ.
9
+
10
+ /// Kinds of suprasegmental modifier recognised by the metric.
11
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
12
+ pub enum Diacritic {
13
+ /// `ː` — full length.
14
+ Long,
15
+ /// `ˑ` — half-length.
16
+ HalfLong,
17
+ /// `ʰ` — aspiration.
18
+ Aspirated,
19
+ /// `ʲ` — palatalization.
20
+ Palatalized,
21
+ /// `ˤ` — pharyngealization.
22
+ Pharyngealized,
23
+ /// `ˠ` — velarization.
24
+ Velarized,
25
+ /// Combining tilde above (U+0303) — nasalization.
26
+ Nasalized,
27
+ /// `ˈ` — primary stress.
28
+ PrimaryStress,
29
+ /// `ˌ` — secondary stress.
30
+ SecondaryStress,
31
+ }
32
+
33
+ impl Diacritic {
34
+ /// Additive cost when this modifier is present on one phoneme but
35
+ /// not the other.
36
+ pub fn penalty(self) -> f64 {
37
+ match self {
38
+ Self::Long => 0.05,
39
+ Self::HalfLong => 0.025,
40
+ Self::Aspirated => 0.04,
41
+ Self::Palatalized => 0.06,
42
+ Self::Pharyngealized => 0.07,
43
+ Self::Velarized => 0.07,
44
+ Self::Nasalized => 0.06,
45
+ Self::PrimaryStress => 0.05,
46
+ Self::SecondaryStress => 0.03,
47
+ }
48
+ }
49
+
50
+ /// Classify a single Unicode character as a diacritic. The combining
51
+ /// tilde is a multi-byte sequence in UTF-8 but one Unicode scalar.
52
+ pub fn from_char(c: char) -> Option<Self> {
53
+ Some(match c {
54
+ 'ː' => Self::Long,
55
+ 'ˑ' => Self::HalfLong,
56
+ 'ʰ' => Self::Aspirated,
57
+ 'ʲ' => Self::Palatalized,
58
+ 'ˤ' => Self::Pharyngealized,
59
+ 'ˠ' => Self::Velarized,
60
+ '\u{0303}' => Self::Nasalized,
61
+ 'ˈ' => Self::PrimaryStress,
62
+ 'ˌ' => Self::SecondaryStress,
63
+ _ => return None,
64
+ })
65
+ }
66
+
67
+ /// True if this diacritic attaches to the *following* segment
68
+ /// (stress marks); false for the preceding-attached ones.
69
+ pub fn is_leading(self) -> bool {
70
+ matches!(self, Self::PrimaryStress | Self::SecondaryStress)
71
+ }
72
+ }
73
+
74
/// Fallback penalty for a diacritic-like character that has no dedicated
/// cost of its own but still ends up in a modifier set.
pub const DEFAULT_PENALTY: f64 = 0.03;
77
+
78
+ /// Split a phoneme token into its base symbol and the set of diacritic
79
+ /// kinds it carries. Unrecognised characters stay in the base so a
80
+ /// misspelt input doesn't silently lose information.
81
+ pub fn decompose(token: &str) -> (String, Vec<Diacritic>) {
82
+ let mut base = String::new();
83
+ let mut mods: Vec<Diacritic> = Vec::new();
84
+ for c in token.chars() {
85
+ if let Some(d) = Diacritic::from_char(c) {
86
+ if !mods.contains(&d) {
87
+ mods.push(d);
88
+ }
89
+ } else {
90
+ base.push(c);
91
+ }
92
+ }
93
+ (base, mods)
94
+ }
95
+
96
+ /// Symmetric-difference cost between two diacritic sets.
97
+ pub fn distance(mods1: &[Diacritic], mods2: &[Diacritic]) -> f64 {
98
+ if mods1.is_empty() && mods2.is_empty() {
99
+ return 0.0;
100
+ }
101
+ let mut total = 0.0;
102
+ for d in mods1 {
103
+ if !mods2.contains(d) {
104
+ total += d.penalty();
105
+ }
106
+ }
107
+ for d in mods2 {
108
+ if !mods1.contains(d) {
109
+ total += d.penalty();
110
+ }
111
+ }
112
+ total
113
+ }
@@ -0,0 +1,183 @@
1
+ //! Top-level distance dispatch.
2
+ //!
3
+ //! `distance(p1, p2)` returns the acoustic distance between two phoneme
4
+ //! tokens, scaled to [0, 1]. The dispatch order matters:
5
+ //!
6
+ //! 1. Identity → 0.
7
+ //! 2. Boundary token (`#`) → 0 vs boundary, else `BOUNDARY_VS_PHONEME`.
8
+ //! 3. Diacritic decomposition: split the token into base + modifier
9
+ //! set; recurse on the base; add a per-modifier mismatch cost.
10
+ //! 4. Compound expansion: if either side is a diphthong or affricate,
11
+ //! compare component-by-component (padded by repeating the last
12
+ //! segment), averaging.
13
+ //! 5. Class-based dispatch: vowel-vowel, consonant-consonant, or
14
+ //! cross-class bridge.
15
+
16
+ use crate::{compounds, consonants, cross_class, diacritics, symbols, vowels};
17
+
18
+ /// Distance between two phoneme tokens, scaled to [0, 1].
19
+ pub fn distance(p1: &str, p2: &str) -> f64 {
20
+ if p1 == p2 {
21
+ return 0.0;
22
+ }
23
+
24
+ if p1 == symbols::BOUNDARY_TOKEN || p2 == symbols::BOUNDARY_TOKEN {
25
+ return symbols::BOUNDARY_VS_PHONEME;
26
+ }
27
+
28
+ let (base1, mods1) = diacritics::decompose(p1);
29
+ let (base2, mods2) = diacritics::decompose(p2);
30
+
31
+ // If either side carried any diacritics, strip them and recompute
32
+ // on the bases, then add a per-modifier mismatch cost.
33
+ if !mods1.is_empty() || !mods2.is_empty() {
34
+ let base_dist = base_pair_distance(&base1, &base2);
35
+ return (base_dist + diacritics::distance(&mods1, &mods2)).min(1.0);
36
+ }
37
+
38
+ base_pair_distance(p1, p2)
39
+ }
40
+
41
+ /// Distance between two bare base phonemes (no diacritics on either
42
+ /// side). Handles compounds, class dispatch, and cross-class bridges.
43
+ fn base_pair_distance(p1: &str, p2: &str) -> f64 {
44
+ if p1 == p2 {
45
+ return 0.0;
46
+ }
47
+
48
+ let comp1 = compounds::components(p1);
49
+ let comp2 = compounds::components(p2);
50
+ if comp1.is_some() || comp2.is_some() {
51
+ let a: Vec<&str> = comp1.map_or_else(|| vec![p1], <[&str]>::to_vec);
52
+ let b: Vec<&str> = comp2.map_or_else(|| vec![p2], <[&str]>::to_vec);
53
+ return compound_distance(&a, &b);
54
+ }
55
+
56
+ let is_vowel_1 = vowels::lookup(p1).is_some();
57
+ let is_vowel_2 = vowels::lookup(p2).is_some();
58
+ let is_cons_1 = consonants::lookup(p1).is_some();
59
+ let is_cons_2 = consonants::lookup(p2).is_some();
60
+
61
+ if is_vowel_1 && is_vowel_2 {
62
+ return vowels::distance(p1, p2).unwrap_or(1.0);
63
+ }
64
+ if is_cons_1 && is_cons_2 {
65
+ return consonants::distance(p1, p2).unwrap_or(1.0);
66
+ }
67
+ if is_cons_1 && is_vowel_2 {
68
+ return cross_class::distance(p1, p2);
69
+ }
70
+ if is_vowel_1 && is_cons_2 {
71
+ return cross_class::distance(p2, p1);
72
+ }
73
+ 1.0
74
+ }
75
+
76
+ /// Pairwise component-mean distance for compound (or
77
+ /// compound-and-simple) phonemes. The shorter side is padded by
78
+ /// repeating its last segment so /aɪ/ vs /a/ charges half a phoneme
79
+ /// distance rather than nothing.
80
+ fn compound_distance(c1: &[&str], c2: &[&str]) -> f64 {
81
+ let n = c1.len().max(c2.len());
82
+
83
+ // Pad the shorter side by repeating its last entry.
84
+ fn pad<'a>(v: &[&'a str], n: usize) -> Vec<&'a str> {
85
+ let mut out: Vec<&'a str> = v.to_vec();
86
+ if let Some(&last) = out.last() {
87
+ while out.len() < n {
88
+ out.push(last);
89
+ }
90
+ }
91
+ out
92
+ }
93
+ let a = pad(c1, n);
94
+ let b = pad(c2, n);
95
+
96
+ let total: f64 = a
97
+ .iter()
98
+ .zip(b.iter())
99
+ .map(|(x, y)| if x == y { 0.0 } else { distance(x, y) })
100
+ .sum();
101
+ total / n as f64
102
+ }
103
+
104
#[cfg(test)]
mod tests {
    use super::distance;

    /// Absolute tolerance for floating-point comparisons.
    const EPS: f64 = 1e-12;

    /// `(left, right, expected)` reference values produced by the Ruby
    /// implementation, covering every dispatch branch.
    const RUBY_CASES: &[(&str, &str, f64)] = &[
        // Approximant↔vowel bridge
        ("j", "i", 0.10),
        ("j", "ɪ", 0.14),
        ("w", "u", 0.10),
        ("w", "o", 0.22),
        ("ɹ", "ɝ", 0.08),
        ("ɰ", "ɯ", 0.10),
        // Glottal bridge
        ("h", "a", 0.50),
        ("ʔ", "i", 0.55),
        ("ɦ", "ɛ", 0.50),
        // Default cross-class
        ("k", "i", 0.85),
        ("s", "u", 0.85),
        ("m", "a", 0.85),
        // Diphthongs
        ("aɪ", "ɑɪ", 0.107_288_248_772_162_7),
        ("aɪ", "eɪ", 0.130_229_442_812_957_87),
        ("aɪ", "a", 0.144_904_524_019_594_53),
        ("oʊ", "o", 0.067_465_504_570_433_46),
        // Affricates
        ("tʃ", "ʃ", 0.124_358_541_225_631_43),
        ("tʃ", "dʒ", 0.15),
        ("tʃ", "t", 0.124_358_541_225_631_43),
        ("dʒ", "ʒ", 0.124_358_541_225_631_43),
        // Compound vs unrelated → averages to the default
        ("aɪ", "k", 0.85),
        // Diacritics
        ("pʰ", "p", 0.04),
        ("uː", "u", 0.05),
        ("ˈp", "p", 0.05),
        ("pʰ", "bʰ", 0.15),
        // Boundary token
        ("#", "a", 0.95),
        ("#", "#", 0.0),
    ];

    #[test]
    fn matches_ruby_dispatch_cases() {
        for &(a, b, expected) in RUBY_CASES {
            let got = distance(a, b);
            assert!(
                (got - expected).abs() < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn identity_is_zero_across_classes() {
        // One representative per token shape: vowel, consonant,
        // diphthong, affricate, diacritic-bearing token, boundary.
        let samples = ["i", "p", "aɪ", "tʃ", "pʰ", "#"];
        for token in samples {
            assert_eq!(distance(token, token), 0.0);
        }
    }

    #[test]
    fn diphthong_against_its_nucleus_is_half_a_phoneme() {
        // The first component of /aɪ/ matches /a/, so only the second
        // component contributes and the mean is distance(ɪ, a) / 2.
        let half_step = crate::vowels::distance("ɪ", "a").unwrap() / 2.0;
        let compound = distance("aɪ", "a");
        assert!(
            (compound - half_step).abs() < EPS,
            "compound={compound}, direct/2={}",
            half_step
        );
    }
}
@@ -0,0 +1,146 @@
1
+ //! Strict phonetic edit distance.
2
+ //!
3
+ //! Damerau-Levenshtein DP with INDEL_COST = 1.0 (one inserted or
4
+ //! deleted phoneme costs exactly one indel, regardless of its
5
+ //! neighbours — the operation costs what the operation costs, not what
6
+ //! happens to sit next to it). Adjacent transpositions cost
7
+ //! TRANSPOSE_COST < 2 * INDEL_COST because in casual speech swapping
8
+ //! adjacent phonemes is a real and cheap thing speakers do.
9
+ //!
10
+ //! Substitution cost is the per-phoneme acoustic distance returned by
11
+ //! [`crate::distance`], so improvements to the acoustic model land here
12
+ //! automatically.
13
+
14
+ use crate::tokenizer;
15
+
16
/// Price of a single insertion or deletion of one phoneme.
pub const INDEL_COST: f64 = 1.0;

/// Price of swapping two adjacent phonemes (the Damerau extension).
pub const TRANSPOSE_COST: f64 = 0.8;
21
+
22
+ /// Edit distance between two IPA strings under the strict acoustic
23
+ /// metric. Non-IPA characters are tokenised out before the DP runs.
24
+ pub fn distance(a: &str, b: &str) -> f64 {
25
+ let ta = tokenizer::tokens(a, false);
26
+ let tb = tokenizer::tokens(b, false);
27
+ distance_from_tokens(&ta, &tb)
28
+ }
29
+
30
+ /// Edit distance over pre-tokenised phoneme sequences.
31
+ pub fn distance_from_tokens<S: AsRef<str>>(a: &[S], b: &[S]) -> f64 {
32
+ let m = a.len();
33
+ let n = b.len();
34
+ if m == 0 && n == 0 {
35
+ return 0.0;
36
+ }
37
+
38
+ // d[i][j] flattened; width = n + 1.
39
+ let width = n + 1;
40
+ let mut d = vec![0.0_f64; (m + 1) * width];
41
+
42
+ // Seed: matching the empty string against the first i phonemes of a
43
+ // costs i indels; symmetric for b.
44
+ for i in 0..=m {
45
+ d[i * width] = i as f64 * INDEL_COST;
46
+ }
47
+ #[allow(clippy::needless_range_loop)]
48
+ for j in 0..=n {
49
+ d[j] = j as f64 * INDEL_COST;
50
+ }
51
+
52
+ for i in 1..=m {
53
+ for j in 1..=n {
54
+ let ai = a[i - 1].as_ref();
55
+ let bj = b[j - 1].as_ref();
56
+ let sub_cost = crate::distance(ai, bj);
57
+
58
+ let delete = d[(i - 1) * width + j] + INDEL_COST;
59
+ let insert = d[i * width + (j - 1)] + INDEL_COST;
60
+ let substitute = d[(i - 1) * width + (j - 1)] + sub_cost;
61
+
62
+ let mut best = delete.min(insert).min(substitute);
63
+
64
+ // Damerau adjacent-transposition.
65
+ if i > 1
66
+ && j > 1
67
+ && a[i - 1].as_ref() == b[j - 2].as_ref()
68
+ && a[i - 2].as_ref() == b[j - 1].as_ref()
69
+ {
70
+ let transpose = d[(i - 2) * width + (j - 2)] + TRANSPOSE_COST;
71
+ if transpose < best {
72
+ best = transpose;
73
+ }
74
+ }
75
+
76
+ d[i * width + j] = best;
77
+ }
78
+ }
79
+
80
+ d[m * width + n]
81
+ }
82
+
83
#[cfg(test)]
mod tests {
    use super::distance;

    /// Absolute tolerance for comparisons against reference values.
    const EPS: f64 = 1e-12;

    #[test]
    fn matches_ruby_reference_distances() {
        // Reference values produced by Ruby's Phonetics::RubyLevenshtein.
        let cases: &[(&str, &str, f64)] = &[
            ("kæt", "kæt", 0.0),
            ("dɪsug", "ɪsug", 1.0),
            ("izok", "ɪsug", 0.425_001_067_076_172_47),
            ("kæt", "", 3.0),
            ("kæt", "kæɪt", 1.0),
            ("kæt", "kʌt", 0.145_085_455_502_268_37),
            (
                "ɪtsdʒʌstəstupɪdgeɪm",
                "hɪtsdʒʌstɪsduphɪdkeɪm",
                2.469_519_814_165_789_5,
            ),
            ("mɔop", "sinkœ", 3.025_788_981_175_774),
            ("bæd", "ben", 0.510_984_626_268_258_8),
        ];

        for (a, b, expected) in cases {
            let got = distance(a, b);
            assert!(
                (got - expected).abs() < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn empty_pair_is_zero() {
        assert_eq!(distance("", ""), 0.0);
    }

    // Renamed from `one_indel_costs_INDEL_COST`: the mixed-case name
    // triggered rustc's `non_snake_case` lint.
    #[test]
    fn one_indel_costs_indel_cost() {
        // Inserting one phoneme costs exactly the indel...
        assert!((distance("kæt", "kæte") - super::INDEL_COST).abs() < 1e-6);
        // ...and so does deleting it again.
        assert!((distance("kæte", "kæt") - super::INDEL_COST).abs() < 1e-6);
    }

    #[test]
    fn identity_strings_are_zero() {
        for s in ["", "kæt", "stupɪdgeɪm", "ɪtsdʒʌstəstupɪdgeɪm"] {
            assert_eq!(distance(s, s), 0.0);
        }
    }

    #[test]
    fn symmetric() {
        // Swapping the arguments must not change the result.
        let pairs = [
            ("kæt", "kʌt"),
            ("dɪsug", "ɪsug"),
            ("stupɪdgeɪm", "stupɪdli"),
        ];
        for (a, b) in pairs {
            let d_ab = distance(a, b);
            let d_ba = distance(b, a);
            assert!((d_ab - d_ba).abs() < EPS, "asymmetric: {a}/{b}");
        }
    }
}
@@ -0,0 +1,44 @@
1
+ //! Phonetics: IPA-based phonetic distance.
2
+ //!
3
+ //! Two-tier API, same as the Ruby reference implementation:
4
+ //!
5
+ //! * [`distance`] — strict per-phoneme acoustic distance, fed to
6
+ //! [`levenshtein`] for whole-string edit distance. The right call for
7
+ //! accent clustering, dialect work, and ASR error analysis.
8
+ //!
9
+ //! * [`Confusion::distance`](confusion::Confusion::distance) — listener-
10
+ //! confusion distance, calibrated against Mad Gab puzzle data. Uses
11
+ //! Gotoh's affine-gap DP plus a weak-phoneme indel discount and an
12
+ //! empirical-confusion overlay. The right call for Mad Gab solving,
13
+ //! pun detection, and mishearing modelling.
14
+ //!
15
+ //! Both tiers share the same per-phoneme cost basis. Improvements to the
16
+ //! acoustic model propagate to both metrics automatically.
17
+ //!
18
+ //! ```
19
+ //! use phonetics::distance;
20
+ //! // Tense /i/ versus lax /ɪ/ — close in Bark space.
21
+ //! assert!((distance("i", "ɪ") - 0.060_056).abs() < 1e-3);
22
+ //! // The same vowel twice is exactly zero.
23
+ //! assert_eq!(distance("ə", "ə"), 0.0);
24
+ //! ```
25
+
26
+ #![doc(html_root_url = "https://docs.rs/phonetics/0.1.0")]
27
+
28
+ pub mod compounds;
29
+ pub mod confusion;
30
+ pub mod consonants;
31
+ pub mod cross_class;
32
+ pub mod diacritics;
33
+ pub mod levenshtein;
34
+ pub mod symbols;
35
+ pub mod tokenizer;
36
+ pub mod vowels;
37
+
38
+ mod distance;
39
+
40
+ pub use confusion::distance as confusion;
41
+ pub use confusion::similarity;
42
+ pub use distance::distance;
43
+ pub use levenshtein::distance as levenshtein;
44
+ pub use tokenizer::tokens;
@@ -0,0 +1,21 @@
1
+ //! Shared phoneme symbol metadata.
2
+ //!
3
+ //! These constants are used by the distance dispatch and the tokenizer.
4
+
5
/// Pseudo-phoneme that stands for a word boundary. The Confusion metric
/// uses it to model re-syllabification cheaply.
pub const BOUNDARY_TOKEN: &str = "#";

/// Substitution cost between a word boundary and a real phoneme. Kept
/// high so that the Confusion algorithm would rather insert or delete
/// the boundary (through its cheap boundary-indel tier) than substitute
/// it.
pub const BOUNDARY_VS_PHONEME: f64 = 0.95;

/// Consonant↔vowel distance used when no bridge applies. Deliberately
/// below 1.0 (the indel cost): a consonant against a vowel is treated as
/// a strong substitution, not a categorical break.
pub const CROSS_CLASS_DEFAULT: f64 = 0.85;

/// Consonant↔vowel distance used when the consonant belongs to the
/// approximant bridge but the particular vowel is not enumerated for it.
pub const CROSS_CLASS_NEAR_BRIDGE: f64 = 0.55;