phonetics 3.0.9 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -2
  3. data/CHANGELOG +4 -0
  4. data/Cargo.toml +27 -0
  5. data/Rakefile +58 -26
  6. data/VERSION +1 -1
  7. data/bin/phonetics +89 -0
  8. data/ext/phonetics_ruby/Cargo.toml +36 -0
  9. data/ext/phonetics_ruby/build.rs +24 -0
  10. data/ext/phonetics_ruby/extconf.rb +17 -0
  11. data/ext/phonetics_ruby/src/lib.rs +56 -0
  12. data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
  13. data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
  14. data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
  15. data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
  16. data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
  17. data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
  18. data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
  19. data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
  20. data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
  21. data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
  22. data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
  23. data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
  24. data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
  25. data/lib/phonetics.rb +77 -2
  26. data/phonetics.gemspec +33 -9
  27. metadata +46 -34
  28. data/.github/workflows/gempush.yml +0 -28
  29. data/.github/workflows/test.yml +0 -20
  30. data/Makefile +0 -6
  31. data/ext/c_levenshtein/extconf.rb +0 -10
  32. data/ext/c_levenshtein/levenshtein.c +0 -223
  33. data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
  34. data/ext/c_levenshtein/next_phoneme_length.h +0 -1
  35. data/ext/c_levenshtein/phonemes.c +0 -53
  36. data/ext/c_levenshtein/phonemes.h +0 -3
  37. data/ext/c_levenshtein/phonetic_cost.c +0 -88593
  38. data/ext/c_levenshtein/phonetic_cost.h +0 -1
  39. data/lib/phonetics/code_generator.rb +0 -228
  40. data/lib/phonetics/distances.rb +0 -245
  41. data/lib/phonetics/levenshtein.rb +0 -27
  42. data/lib/phonetics/ruby_levenshtein.rb +0 -162
@@ -0,0 +1,171 @@
1
+ //! IPA phoneme tokenizer.
2
+ //!
3
+ //! Walks an input string and emits a sequence of phoneme tokens. The
4
+ //! recognition is longest-prefix: multi-character atoms like /tʃ/,
5
+ //! /aɪ/, and /ɝ/ win over their single-character constituents.
6
+ //!
7
+ //! Diacritics absorb into the segment they modify — trailing
8
+ //! modifiers attach to the preceding base phoneme, stress marks
9
+ //! attach to the following one. Boundary characters (space, tab, `_`,
10
+ //! `|`) are skipped by default; in boundary mode (used by the Confusion
11
+ //! metric) each one emits the `#` boundary token.
12
+
13
+ use std::collections::HashSet;
14
+ use std::sync::LazyLock;
15
+
16
+ use crate::{compounds, consonants, diacritics::Diacritic, symbols, vowels};
17
+
18
+ /// Set of every recognised phoneme symbol. Includes the boundary token
19
+ /// so longest-prefix matching can pick it up on raw `#` input.
20
+ pub static PHONEME_SET: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
21
+ let mut s: HashSet<&'static str> = HashSet::new();
22
+ for &p in vowels::INVENTORY {
23
+ s.insert(p);
24
+ }
25
+ for &p in consonants::INVENTORY {
26
+ s.insert(p);
27
+ }
28
+ for &p in compounds::INVENTORY {
29
+ s.insert(p);
30
+ }
31
+ s.insert(symbols::BOUNDARY_TOKEN);
32
+ s
33
+ });
34
+
35
+ /// Largest phoneme-symbol size in characters (not bytes). Used as the
36
+ /// upper bound for longest-prefix matching.
37
+ pub static MAX_PHONEME_CHARS: LazyLock<usize> = LazyLock::new(|| {
38
+ PHONEME_SET
39
+ .iter()
40
+ .map(|s| s.chars().count())
41
+ .max()
42
+ .unwrap_or(1)
43
+ });
44
+
45
+ /// True if `s` is a recognised phoneme symbol.
46
+ pub fn is_phoneme(s: &str) -> bool {
47
+ PHONEME_SET.contains(s)
48
+ }
49
+
50
/// Characters in raw IPA input that mark a word boundary: space, tab,
/// underscore and vertical bar.
const BOUNDARY_CHARS: &[char] = &[
    ' ',
    '\t',
    '_',
    '|',
];
52
+
53
+ /// Tokenise an IPA string into a sequence of phoneme tokens.
54
+ ///
55
+ /// When `boundaries` is true, each whitespace / boundary character
56
+ /// in the input emits the `#` token; otherwise they're skipped.
57
+ pub fn tokens(input: &str, boundaries: bool) -> Vec<String> {
58
+ let chars: Vec<char> = input.chars().collect();
59
+ let max_phoneme_size = *MAX_PHONEME_CHARS;
60
+ let mut out: Vec<String> = Vec::new();
61
+ let mut pending_prefix = String::new();
62
+ let mut idx = 0;
63
+
64
+ while idx < chars.len() {
65
+ let ch = chars[idx];
66
+
67
+ if BOUNDARY_CHARS.contains(&ch) {
68
+ if boundaries {
69
+ out.push(symbols::BOUNDARY_TOKEN.to_string());
70
+ }
71
+ idx += 1;
72
+ continue;
73
+ }
74
+
75
+ // Stress marks bind forward; carry them onto the next emitted token.
76
+ if let Some(d) = Diacritic::from_char(ch) {
77
+ if d.is_leading() {
78
+ pending_prefix.push(ch);
79
+ idx += 1;
80
+ continue;
81
+ }
82
+ }
83
+
84
+ // Try longest-prefix match against the recognized inventory.
85
+ let mut matched: Option<String> = None;
86
+ let max = max_phoneme_size.min(chars.len() - idx);
87
+ for size in (1..=max).rev() {
88
+ let candidate: String = chars[idx..idx + size].iter().collect();
89
+ if is_phoneme(&candidate) {
90
+ matched = Some(candidate);
91
+ idx += size;
92
+ break;
93
+ }
94
+ }
95
+
96
+ if let Some(base) = matched {
97
+ let mut token = std::mem::take(&mut pending_prefix);
98
+ token.push_str(&base);
99
+
100
+ // Absorb any trailing diacritics that modify this phoneme.
101
+ while idx < chars.len() {
102
+ let next = chars[idx];
103
+ match Diacritic::from_char(next) {
104
+ Some(d) if !d.is_leading() => {
105
+ token.push(next);
106
+ idx += 1;
107
+ }
108
+ _ => break,
109
+ }
110
+ }
111
+ out.push(token);
112
+ } else {
113
+ // No recognised phoneme starts here; skip one character.
114
+ idx += 1;
115
+ }
116
+ }
117
+
118
+ out
119
+ }
120
+
121
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenise with boundary characters dropped.
    fn bare(ipa: &str) -> Vec<String> {
        tokens(ipa, false)
    }

    /// Tokenise with boundary characters emitted as tokens.
    fn bounded(ipa: &str) -> Vec<String> {
        tokens(ipa, true)
    }

    /// Turn a slice of literals into the owned form `tokens` returns.
    fn owned(expected: &[&str]) -> Vec<String> {
        expected.iter().map(|s| s.to_string()).collect()
    }

    #[test]
    fn matches_ruby_reference_tokenisations() {
        // Reference outputs produced by the Ruby implementation:
        // (input, expected without boundaries, expected with boundaries).
        let cases: &[(&str, &[&str], &[&str])] = &[
            ("kæt", &["k","æ","t"], &["k","æ","t"]),
            ("wətɛvɝ", &["w","ə","t","ɛ","v","ɝ"], &["w","ə","t","ɛ","v","ɝ"]),
            ("kuɹzlɑɪt", &["k","u","ɹ","z","l","ɑɪ","t"], &["k","u","ɹ","z","l","ɑɪ","t"]),
            ("dʒʌstɪs", &["dʒ","ʌ","s","t","ɪ","s"], &["dʒ","ʌ","s","t","ɪ","s"]),
            ("tʃɝtʃ", &["tʃ","ɝ","tʃ"], &["tʃ","ɝ","tʃ"]),
            ("stupɪdgeɪm", &["s","t","u","p","ɪ","d","g","eɪ","m"], &["s","t","u","p","ɪ","d","g","eɪ","m"]),
            ("wə t 9 ɛvɝ", &["w","ə","t","ɛ","v","ɝ"], &["w","ə","#","t","#","#","ɛ","v","ɝ"]),
            ("pʰɪt", &["pʰ","ɪ","t"], &["pʰ","ɪ","t"]),
            ("kʰæt̃", &["kʰ","æ","t̃"], &["kʰ","æ","t̃"]),
            ("ˈstop", &["ˈs","t","o","p"], &["ˈs","t","o","p"]),
            ("ˌɪntɝˈnæʃənl", &["ˌɪ","n","t","ɝ","ˈn","æ","ʃ","ə","n","l"], &["ˌɪ","n","t","ɝ","ˈn","æ","ʃ","ə","n","l"]),
            ("stuːpɪd", &["s","t","uː","p","ɪ","d"], &["s","t","uː","p","ɪ","d"]),
            ("aɪlʌvju", &["aɪ","l","ʌ","v","j","u"], &["aɪ","l","ʌ","v","j","u"]),
        ];

        for &(input, want_bare, want_bounded) in cases {
            assert_eq!(bare(input), owned(want_bare), "bare tokenisation diverged for {input:?}");
            assert_eq!(bounded(input), owned(want_bounded), "boundary tokenisation diverged for {input:?}");
        }
    }

    #[test]
    fn skips_unknown_characters() {
        assert_eq!(bare("k9æt"), owned(&["k", "æ", "t"]));
    }

    #[test]
    fn empty_input_yields_empty_output() {
        assert!(bare("").is_empty());
        assert!(bounded("").is_empty());
    }
}
@@ -0,0 +1,197 @@
1
+ //! Vowel distance in Bark-Euclidean space.
2
+ //!
3
+ //! F1 and F2 are stored in Hz but compared in Bark via the Zwicker &
4
+ //! Terhardt (1980) approximation, because pitch perception is logarithmic and a
5
+ //! 200 Hz shift at F1=300 is enormous while the same shift at F2=2200
6
+ //! is barely audible. Roundedness and rhoticity are additive penalties
7
+ //! on top of the formant distance.
8
+
9
+ use std::sync::LazyLock;
10
+
11
/// One vowel's acoustic description: two formant frequencies plus two
/// binary articulatory features.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Vowel {
    /// Frequency of the first formant (F1), in Hz.
    pub f1: f64,
    /// Frequency of the second formant (F2), in Hz.
    pub f2: f64,
    /// Whether the vowel is produced with rounded lips.
    pub rounded: bool,
    /// Whether the vowel is rhotic (as /ɝ/ is).
    pub rhotic: bool,
}
23
+
24
/// Tunable weight on the normalised formant distance. Vowels occupy a
/// perceptually narrower space than consonants, so the formant term is
/// kept well below 1.0.
pub const VOWEL_SCALE: f64 = 0.60;

/// Added to the distance when exactly one of the two vowels is rounded.
pub const ROUNDING_PENALTY: f64 = 0.05;

/// Added to the distance when exactly one of the two vowels is rhotic.
pub const RHOTICITY_PENALTY: f64 = 0.20;
33
+
34
/// Convert a frequency in Hz to the Bark critical-band scale.
///
/// Uses the Zwicker & Terhardt (1980) arctangent approximation,
/// `13·atan(0.00076·f) + 3.5·atan((f / 7500)²)`. This is not Traunmüller's
/// (1990) formula, which is the rational form `26.81·f / (1960 + f) − 0.53`.
///
/// Non-positive frequencies map to `0.0`.
pub fn bark(hz: f64) -> f64 {
    if hz <= 0.0 {
        return 0.0;
    }
    let linear_term = 13.0 * (0.000_76 * hz).atan();
    let quadratic_term = 3.5 * (hz / 7500.0).powi(2).atan();
    linear_term + quadratic_term
}
41
+
42
/// The vowel symbols this module knows about, in canonical order.
pub const INVENTORY: &[&str] = &[
    "i", "y", "ɪ", "e", "ø", "ɛ", "œ",
    "a", "ɶ", "ɑ", "ɒ", "ʌ", "ə", "ɝ",
    "ɔ", "ɤ", "o", "ɯ", "æ", "u", "ʊ",
];
47
+
48
+ /// Look up the formant data for an IPA vowel symbol.
49
+ ///
50
+ /// Values from the cardinal-vowel measurements on Wikipedia (Daniel
51
+ /// Jones tradition), with the typo on /y/'s rounding flag corrected
52
+ /// from the original Ruby table and /ə/ no longer duplicating /ʌ/.
53
+ pub fn lookup(symbol: &str) -> Option<Vowel> {
54
+ let v = |f1, f2, rounded, rhotic| Vowel { f1, f2, rounded, rhotic };
55
+ Some(match symbol {
56
+ "i" => v(240.0, 2400.0, false, false),
57
+ "y" => v(235.0, 2100.0, true, false),
58
+ "ɪ" => v(300.0, 2100.0, false, false),
59
+ "e" => v(390.0, 2300.0, false, false),
60
+ "ø" => v(370.0, 1900.0, true, false),
61
+ "ɛ" => v(610.0, 1900.0, false, false),
62
+ "œ" => v(585.0, 1710.0, true, false),
63
+ "a" => v(850.0, 1610.0, false, false),
64
+ "ɶ" => v(820.0, 1530.0, true, false),
65
+ "ɑ" => v(750.0, 940.0, false, false),
66
+ "ɒ" => v(700.0, 760.0, true, false),
67
+ "ʌ" => v(600.0, 1170.0, false, false),
68
+ "ə" => v(500.0, 1500.0, false, false),
69
+ "ɝ" => v(500.0, 1350.0, false, true),
70
+ "ɔ" => v(500.0, 700.0, true, false),
71
+ "ɤ" => v(460.0, 1310.0, false, false),
72
+ "o" => v(360.0, 640.0, true, false),
73
+ "ɯ" => v(300.0, 1390.0, false, false),
74
+ "æ" => v(690.0, 1660.0, false, false),
75
+ "u" => v(250.0, 595.0, true, false),
76
+ "ʊ" => v(380.0, 950.0, true, false),
77
+ _ => return None,
78
+ })
79
+ }
80
+
81
+ /// Largest Bark-Euclidean distance achievable within the inventory.
82
+ /// Memoised; computed once on first access.
83
+ static BARK_SPAN: LazyLock<f64> = LazyLock::new(|| {
84
+ let coords: Vec<(f64, f64)> = INVENTORY
85
+ .iter()
86
+ .map(|s| {
87
+ let v = lookup(s).expect("INVENTORY entries must be in lookup()");
88
+ (bark(v.f1), bark(v.f2))
89
+ })
90
+ .collect();
91
+ let f1_min = coords.iter().map(|c| c.0).fold(f64::INFINITY, f64::min);
92
+ let f1_max = coords.iter().map(|c| c.0).fold(f64::NEG_INFINITY, f64::max);
93
+ let f2_min = coords.iter().map(|c| c.1).fold(f64::INFINITY, f64::min);
94
+ let f2_max = coords.iter().map(|c| c.1).fold(f64::NEG_INFINITY, f64::max);
95
+ ((f1_max - f1_min).powi(2) + (f2_max - f2_min).powi(2)).sqrt()
96
+ });
97
+
98
+ /// Returns the cached Bark-span normaliser.
99
+ pub fn bark_span() -> f64 {
100
+ *BARK_SPAN
101
+ }
102
+
103
+ /// Distance between two vowels, scaled into [0, 1].
104
+ ///
105
+ /// Returns `None` if either symbol is not in the inventory.
106
+ pub fn distance(p1: &str, p2: &str) -> Option<f64> {
107
+ if p1 == p2 {
108
+ return Some(0.0);
109
+ }
110
+ let v1 = lookup(p1)?;
111
+ let v2 = lookup(p2)?;
112
+ let (a1, b1) = (bark(v1.f1), bark(v1.f2));
113
+ let (a2, b2) = (bark(v2.f1), bark(v2.f2));
114
+ let formant_dist = ((a1 - a2).powi(2) + (b1 - b2).powi(2)).sqrt() / bark_span();
115
+ let mut penalty = formant_dist * VOWEL_SCALE;
116
+ if v1.rounded != v2.rounded {
117
+ penalty += ROUNDING_PENALTY;
118
+ }
119
+ if v1.rhotic != v2.rhotic {
120
+ penalty += RHOTICITY_PENALTY;
121
+ }
122
+ Some(penalty.min(1.0))
123
+ }
124
+
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for comparisons against the pinned reference values. The
    /// Ruby reference uses f64 throughout, so results should agree to at
    /// least 12 decimals.
    const EPS: f64 = 1e-12;

    /// Reference values recorded from the Ruby implementation.
    ///
    /// NOTE(review): this comment used to say that changing a constant
    /// means changing it in `lib/phonetics/distances.rb` too, but the
    /// 4.0.0 gem no longer ships that file. Treat these as pinned
    /// regression values and regenerate them deliberately when a tuning
    /// constant or formant entry changes.
    #[test]
    fn matches_ruby_vowel_distances() {
        let cases: &[(&str, &str, f64)] = &[
            ("i", "y", 0.099_760_210_846_103_59),
            ("i", "ɪ", 0.060_056_384_465_816_57),
            ("i", "u", 0.565_279_341_709_588),
            ("a", "ɑ", 0.214_576_497_544_325_4),
            ("æ", "ɛ", 0.064_974_568_637_334_88),
            ("ə", "ɝ", 0.241_916_659_928_285_43),
            ("o", "ə", 0.371_374_251_614_846_3),
            ("u", "y", 0.465_646_551_803_915_9),
            ("ʊ", "u", 0.172_060_682_790_273_34),
        ];

        for (a, b, expected) in cases {
            let got = distance(a, b).expect("inventory pair");
            assert!(
                (got - expected).abs() < EPS,
                "distance({a:?}, {b:?}) = {got}, expected {expected}",
            );
        }
    }

    #[test]
    fn bark_span_matches_ruby() {
        assert!((bark_span() - 10.148_711_232_912_262).abs() < EPS);
    }

    #[test]
    fn bark_for_known_frequencies() {
        // /i/'s F1 = 240 Hz → 2.349… Bark
        assert!((bark(240.0) - 2.349_000_345_620_559).abs() < EPS);
        // /a/'s F1 = 850 Hz → 7.501… Bark
        assert!((bark(850.0) - 7.501_208_750_766_951).abs() < EPS);
        // Edge: 0 Hz returns 0.
        assert_eq!(bark(0.0), 0.0);
    }

    #[test]
    fn identity_is_zero() {
        for s in INVENTORY {
            assert_eq!(distance(s, s), Some(0.0));
        }
    }

    #[test]
    fn symmetric() {
        for a in INVENTORY {
            for b in INVENTORY {
                let d_ab = distance(a, b).unwrap();
                let d_ba = distance(b, a).unwrap();
                assert!((d_ab - d_ba).abs() < EPS, "asymmetric: {a}/{b}");
            }
        }
    }

    #[test]
    fn unknown_symbol_returns_none() {
        assert!(distance("Z", "i").is_none());
        assert!(distance("i", "Z").is_none());
    }
}
data/lib/phonetics.rb CHANGED
@@ -1,4 +1,79 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'phonetics/distances'
4
- require 'phonetics/transcriptions'
3
+ # Phonetics — IPA-based phonetic distance.
4
+ #
5
+ # The entire algorithmic core is written in Rust (see <repo>/rust/
6
+ # phonetics) and loaded as a native extension via Magnus. This file
7
+ # layers ergonomic Ruby idioms on top of the bare module functions
8
+ # that the extension exports.
9
+ #
10
+ # Two-tier distance API:
11
+ #
12
+ # Phonetics.distance(p1, p2) acoustic per-phoneme, 0..1
13
+ # Phonetics.levenshtein(s1, s2) strict edit distance
14
+ # Phonetics.confusion(s1, s2) listener-confusion distance
15
+ # Phonetics.similarity(s1, s2) normalised 0..1
16
+ # Phonetics.sub_cost(p1, p2) perceptual per-phoneme
17
+ # Phonetics.tokenize(ipa, boundaries:) phoneme stream
18
+ require 'delegate'
19
+
20
+ require_relative 'phonetics/phonetics_ruby'
21
+ require_relative 'phonetics/transcriptions'
22
+
23
module Phonetics
  # Tokenise an IPA string into its phoneme tokens.
  #
  # The native binding exposes the tokenizer as `_tokenize(input,
  # boundaries)`. Magnus's `function!` macro doesn't bridge Ruby
  # keyword arguments through to Rust, so we wrap it in a Ruby method
  # that does accept the kwarg.
  #
  # @param input [::String] IPA text
  # @param boundaries [Boolean] forwarded positionally to `_tokenize`
  # @return whatever `_tokenize` returns (the phoneme stream)
  def self.tokenize(input, boundaries: false)
    _tokenize(input, boundaries)
  end

  # ------------------------------------------------------------------
  # Phonetics::String — iterator over phonemes in an IPA string.
  # ------------------------------------------------------------------
  class String < SimpleDelegator
    # Iterate over the phonemes of the wrapped string.
    #
    # With a block, each phoneme is yielded in order. Previously a block
    # given here was silently ignored, because the bare `.each` call
    # never received it. Without a block, an Enumerator is returned, as
    # before.
    #
    # @param boundaries [Boolean] passed through to Phonetics.tokenize
    def each_phoneme(boundaries: false, &block)
      Phonetics.tokenize(to_s, boundaries: boundaries).each(&block)
    end
  end

  # ------------------------------------------------------------------
  # Backwards-compatible namespaced API.
  #
  # The previous Ruby+C implementation exposed these under sub-modules.
  # Keep them as thin delegators so existing callers don't break —
  # there's nothing interesting happening here, just forwarding.
  # ------------------------------------------------------------------

  module Levenshtein
    # NOTE(review): nothing in this file reads these constants; presumably
    # they are kept for callers of the old API — confirm they still match
    # the values used by the Rust core.
    INDEL_COST = 1.0
    TRANSPOSE_COST = 0.8

    # Forwards to Phonetics.levenshtein. Returns nil when either input is
    # nil. The third positional argument is accepted and ignored.
    def self.distance(s1, s2, _verbose = false)
      return if s1.nil? || s2.nil?

      Phonetics.levenshtein(s1, s2)
    end
  end

  module Confusion
    # NOTE(review): as above, these are not read anywhere in this file;
    # confirm they mirror the Rust core's tuning.
    GAP_OPEN = 0.60
    GAP_EXTEND = 0.25
    WEAK_INDEL_COST = 0.15
    BOUNDARY_INDEL_COST = 0.02

    # Forwards to Phonetics.confusion. The `verbose:` keyword is accepted
    # and ignored.
    def self.distance(s1, s2, verbose: false)
      _ = verbose
      Phonetics.confusion(s1, s2)
    end

    # Forwards to Phonetics.similarity.
    def self.similarity(s1, s2)
      Phonetics.similarity(s1, s2)
    end

    # Forwards to Phonetics.sub_cost.
    def self.sub_cost(a, b)
      Phonetics.sub_cost(a, b)
    end
  end
end
data/phonetics.gemspec CHANGED
@@ -1,29 +1,53 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'lib/phonetics/version'
4
+
3
5
  Gem::Specification.new do |spec|
4
6
  spec.name = 'phonetics'
5
- spec.version = File.read(File.join(File.dirname(__FILE__), './VERSION'))
7
+ spec.version = Phonetics::VERSION
6
8
  spec.authors = ['Jack Danger']
7
9
  spec.email = ['github@jackcanty.com']
8
10
 
9
- spec.summary = 'tools for linguistic code using the International Phonetic Alphabet'
10
- spec.description = 'tools for linguistic code using the International Phonetic Alphabet'
11
+ spec.summary = 'IPA-based phonetic distance: strict edit distance, listener-confusion distance, and per-phoneme acoustic and perceptual scoring.'
12
+ spec.description = <<~DESC
13
+ Tools for working with the International Phonetic Alphabet. Two-tier
14
+ distance API — strict acoustic and listener-perception — backed by a
15
+ Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
16
+ data and English speech-perception literature.
17
+ DESC
11
18
  spec.homepage = 'https://github.com/JackDanger/phonetics'
12
19
  spec.license = 'MIT'
13
20
 
14
- spec.required_ruby_version = '>= 2.5'
21
+ spec.required_ruby_version = '>= 3.0'
22
+ spec.required_rubygems_version = '>= 3.3.11'
23
+
24
+ spec.metadata['homepage_uri'] = spec.homepage
25
+ spec.metadata['source_code_uri'] = spec.homepage
15
26
 
16
- spec.extensions = ['ext/c_levenshtein/extconf.rb']
27
+ spec.extensions = ['ext/phonetics_ruby/extconf.rb']
17
28
 
18
- # Specify which files should be added to the gem when it is released.
19
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20
29
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
21
- `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
30
+ tracked = `git ls-files -z`.split("\x0").reject do |f|
31
+ f.match(%r{\A(test|spec|features)/}) ||
32
+ f.match(%r{\Aext/phonetics_ruby/(target|Cargo.lock|Makefile)})
33
+ end
34
+ # The vendored Rust core isn't tracked in git (it's a build
35
+ # artifact populated by `rake vendor_rust`), but it IS shipped
36
+ # in the .gem tarball so end users don't need the source
37
+ # workspace to compile the extension.
38
+ vendor = Dir.glob('ext/phonetics_ruby/vendor/**/*', File::FNM_DOTMATCH).reject do |p|
39
+ File.directory?(p) ||
40
+ p.include?('/target/') ||
41
+ p.end_with?('Cargo.lock', '/.', '/..')
42
+ end
43
+ (tracked + vendor).uniq.sort
22
44
  end
45
+
23
46
  spec.require_paths = ['lib']
24
47
 
48
+ spec.add_dependency 'rb_sys', '~> 0.9'
49
+
25
50
  spec.add_development_dependency 'bundler'
26
- spec.add_development_dependency 'pry-byebug'
27
51
  spec.add_development_dependency 'rake'
28
52
  spec.add_development_dependency 'rake-compiler'
29
53
  spec.add_development_dependency 'rspec'
metadata CHANGED
@@ -1,31 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.9
4
+ version: 4.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-04-30 00:00:00.000000000 Z
11
+ date: 2026-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :development
19
+ version: '0.9'
20
+ type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '0.9'
27
27
  - !ruby/object:Gem::Dependency
28
- name: pry-byebug
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -94,24 +94,27 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
- description: tools for linguistic code using the International Phonetic Alphabet
97
+ description: |
98
+ Tools for working with the International Phonetic Alphabet. Two-tier
99
+ distance API — strict acoustic and listener-perception — backed by a
100
+ Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
101
+ data and English speech-perception literature.
98
102
  email:
99
103
  - github@jackcanty.com
100
104
  executables: []
101
105
  extensions:
102
- - ext/c_levenshtein/extconf.rb
106
+ - ext/phonetics_ruby/extconf.rb
103
107
  extra_rdoc_files: []
104
108
  files:
105
- - ".github/workflows/gempush.yml"
106
- - ".github/workflows/test.yml"
107
109
  - ".gitignore"
108
110
  - ".rspec"
109
111
  - ".rubocop.yml"
112
+ - CHANGELOG
110
113
  - CODE_OF_CONDUCT.md
114
+ - Cargo.toml
111
115
  - Dockerfile
112
116
  - Gemfile
113
117
  - LICENSE.txt
114
- - Makefile
115
118
  - README.md
116
119
  - Rakefile
117
120
  - VERSION
@@ -120,28 +123,36 @@ files:
120
123
  - _site/vowel_chart_b_words.jpg
121
124
  - bin/console
122
125
  - bin/gempush-if-changed
123
- - ext/c_levenshtein/extconf.rb
124
- - ext/c_levenshtein/levenshtein.c
125
- - ext/c_levenshtein/next_phoneme_length.c
126
- - ext/c_levenshtein/next_phoneme_length.h
127
- - ext/c_levenshtein/phonemes.c
128
- - ext/c_levenshtein/phonemes.h
129
- - ext/c_levenshtein/phonetic_cost.c
130
- - ext/c_levenshtein/phonetic_cost.h
126
+ - bin/phonetics
127
+ - ext/phonetics_ruby/Cargo.toml
128
+ - ext/phonetics_ruby/build.rs
129
+ - ext/phonetics_ruby/extconf.rb
130
+ - ext/phonetics_ruby/src/lib.rs
131
+ - ext/phonetics_ruby/vendor/phonetics/Cargo.toml
132
+ - ext/phonetics_ruby/vendor/phonetics/README.md
133
+ - ext/phonetics_ruby/vendor/phonetics/src/compounds.rs
134
+ - ext/phonetics_ruby/vendor/phonetics/src/confusion.rs
135
+ - ext/phonetics_ruby/vendor/phonetics/src/consonants.rs
136
+ - ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs
137
+ - ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs
138
+ - ext/phonetics_ruby/vendor/phonetics/src/distance.rs
139
+ - ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs
140
+ - ext/phonetics_ruby/vendor/phonetics/src/lib.rs
141
+ - ext/phonetics_ruby/vendor/phonetics/src/symbols.rs
142
+ - ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs
143
+ - ext/phonetics_ruby/vendor/phonetics/src/vowels.rs
131
144
  - lib/common_ipa_transcriptions.json
132
145
  - lib/phonetics.rb
133
- - lib/phonetics/code_generator.rb
134
- - lib/phonetics/distances.rb
135
- - lib/phonetics/levenshtein.rb
136
- - lib/phonetics/ruby_levenshtein.rb
137
146
  - lib/phonetics/transcriptions.rb
138
147
  - lib/phonetics/version.rb
139
148
  - phonetics.gemspec
140
149
  homepage: https://github.com/JackDanger/phonetics
141
150
  licenses:
142
151
  - MIT
143
- metadata: {}
144
- post_install_message:
152
+ metadata:
153
+ homepage_uri: https://github.com/JackDanger/phonetics
154
+ source_code_uri: https://github.com/JackDanger/phonetics
155
+ post_install_message:
145
156
  rdoc_options: []
146
157
  require_paths:
147
158
  - lib
@@ -149,15 +160,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
149
160
  requirements:
150
161
  - - ">="
151
162
  - !ruby/object:Gem::Version
152
- version: '2.5'
163
+ version: '3.0'
153
164
  required_rubygems_version: !ruby/object:Gem::Requirement
154
165
  requirements:
155
166
  - - ">="
156
167
  - !ruby/object:Gem::Version
157
- version: '0'
168
+ version: 3.3.11
158
169
  requirements: []
159
- rubygems_version: 3.3.7
160
- signing_key:
170
+ rubygems_version: 3.5.22
171
+ signing_key:
161
172
  specification_version: 4
162
- summary: tools for linguistic code using the International Phonetic Alphabet
173
+ summary: 'IPA-based phonetic distance: strict edit distance, listener-confusion distance,
174
+ and per-phoneme acoustic and perceptual scoring.'
163
175
  test_files: []