phonetics 3.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +17 -2
  3. data/Cargo.toml +27 -0
  4. data/Rakefile +58 -26
  5. data/VERSION +1 -1
  6. data/bin/phonetics +89 -0
  7. data/ext/phonetics_ruby/Cargo.toml +36 -0
  8. data/ext/phonetics_ruby/build.rs +24 -0
  9. data/ext/phonetics_ruby/extconf.rb +17 -0
  10. data/ext/phonetics_ruby/src/lib.rs +56 -0
  11. data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
  12. data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
  13. data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
  14. data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
  15. data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
  16. data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
  17. data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
  18. data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
  19. data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
  20. data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
  21. data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
  22. data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
  23. data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
  24. data/lib/phonetics.rb +77 -2
  25. data/phonetics.gemspec +33 -9
  26. metadata +45 -34
  27. data/.github/workflows/gempush.yml +0 -28
  28. data/.github/workflows/test.yml +0 -20
  29. data/Makefile +0 -9
  30. data/ext/c_levenshtein/extconf.rb +0 -10
  31. data/ext/c_levenshtein/levenshtein.c +0 -223
  32. data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
  33. data/ext/c_levenshtein/next_phoneme_length.h +0 -1
  34. data/ext/c_levenshtein/phonemes.c +0 -53
  35. data/ext/c_levenshtein/phonemes.h +0 -3
  36. data/ext/c_levenshtein/phonetic_cost.c +0 -88593
  37. data/ext/c_levenshtein/phonetic_cost.h +0 -1
  38. data/lib/phonetics/code_generator.rb +0 -228
  39. data/lib/phonetics/distances.rb +0 -249
  40. data/lib/phonetics/levenshtein.rb +0 -27
  41. data/lib/phonetics/ruby_levenshtein.rb +0 -162
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ddee58e3870d471e1d5f03399be12dac1d1a4c9132c28000715796e80914a493
4
- data.tar.gz: 502596f055d85a9c7206b5ea8b33fcb18ac9229dd00014551428e6fee498fc01
3
+ metadata.gz: c174999459702990b87d739037c9fbd2d99d3be2bda047b9b381d9d2432e9670
4
+ data.tar.gz: 89416b41cf47c0db96186575ac9e133d49e5d81fec15ebec02e864526a0a77af
5
5
  SHA512:
6
- metadata.gz: 5aef31036df0eb866f036757f024e0c4296c1b958b69f4f1aca901a2c68a63fd84334ae18d1a44ccb8467ed16bdc5069e40d81b5bd1cba22f04da089b490b2dd
7
- data.tar.gz: 9bf842a914628aed7fbe0c8cc3ee3ddf49745f35769c11e5650645cae5a59e234edb5c760a8165c2041ff72a516fdf32b7e9230adad418860dfd54e90f4077d6
6
+ metadata.gz: 6b5bd8339e1e3aba8cbc4599f97588ff046e074832ae84657aba4988d4fe6ef1e06f56a4a8b9c73cb54c130702d459d59257cbdb967396aef9eee8bcd2b12279
7
+ data.tar.gz: ba950dfbd243de733de5230336a61b6f03a1726371c21112e9765f76d854929084a151d97d8c49d68f593ef7a787c91a82bae70a29becd0297ebc84a7b730e26
data/.gitignore CHANGED
@@ -10,8 +10,23 @@
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
12
  .ruby-version
13
- .ruby-version
14
13
  *.so
15
14
  Gemfile.lock
16
15
  *.bundle
17
- .ruby-version
16
+ *.bundle.dSYM
17
+
18
+ # Rust extension build artifacts (mkmf-generated)
19
+ ext/phonetics_ruby/Makefile
20
+ ext/phonetics_ruby/Cargo.lock
21
+ ext/phonetics_ruby/target/
22
+ ext/phonetics_ruby/mkmf.log
23
+
24
+ # Vendored Rust core. Populated by `rake vendor_rust`; shipped in
25
+ # the .gem tarball but not tracked in git (single source of truth
26
+ # lives at <repo>/rust/phonetics).
27
+ ext/phonetics_ruby/vendor/
28
+
29
+ # Cargo workspace build artifacts (the gem-local workspace at
30
+ # ruby/Cargo.toml puts its target/ at ruby/target).
31
+ target/
32
+ Cargo.lock
data/Cargo.toml ADDED
@@ -0,0 +1,27 @@
1
+ # Cargo workspace declaration for the Ruby gem. Exists so that
2
+ # `rb_sys/extensiontask` can find the native extension's manifest by
3
+ # walking up from the gem root, and so cargo resolves the vendored
4
+ # `phonetics` core path dependency.
5
+ #
6
+ # The workspace.package values mirror what the Rust workspace at
7
+ # <repo>/rust/ defines; they're declared here so the vendored copy
8
+ # of the Rust core (which inherits via `.workspace = true`) resolves
9
+ # correctly inside the gem.
10
+ [workspace]
11
+ resolver = "2"
12
+ members = [
13
+ "ext/phonetics_ruby",
14
+ "ext/phonetics_ruby/vendor/phonetics",
15
+ ]
16
+ exclude = []
17
+
18
+ [workspace.package]
19
+ version = "0.2.0"
20
+ edition = "2021"
21
+ license = "MIT"
22
+ repository = "https://github.com/JackDanger/phonetics"
23
+ homepage = "https://github.com/JackDanger/phonetics"
24
+ authors = ["Jack Danger <github@jackcanty.com>"]
25
+
26
+ [workspace.lints.rust]
27
+ unused_must_use = "deny"
data/Rakefile CHANGED
@@ -1,41 +1,73 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Rake tasks for the Rust-backed Ruby gem.
4
+ #
5
+ # rake compile build the native extension via cargo
6
+ # rake spec run rspec
7
+ # rake build assemble the source gem (vendor + package)
8
+ #
9
+ # Cross-platform native gem builds use rb-sys-dock; see
10
+ # .github/workflows/release.yml for the published recipe.
11
+
12
+ require 'fileutils'
3
13
  require 'bundler/gem_tasks'
4
- require 'rake/extensiontask'
5
- require 'rspec/core/rake_task'
6
- require 'rubocop/rake_task'
7
14
 
8
- EXT_PATH = 'ext/c_levenshtein'
15
+ # ---------------------------------------------------------------
16
+ # Vendoring of the Rust core into the gem directory.
17
+ #
18
+ # A gem can't ship files from outside its own directory (rubygems
19
+ # policy), and `cargo` can't follow a path dependency that points
20
+ # up into the parent of a `gem install`-extracted source tree. So
21
+ # we keep the canonical Rust core at <repo>/rust/phonetics and
22
+ # copy it into <repo>/ruby/ext/phonetics_ruby/vendor/ for builds
23
+ # that need to ship it (gem-build, gem-install, CI).
24
+ #
25
+ # The vendor copy is git-ignored — single source of truth is the
26
+ # /rust/ tree. It IS bundled into the .gem tarball at build time.
27
+ # ---------------------------------------------------------------
28
+
29
+ RUST_CORE_SRC = File.expand_path('../rust/phonetics', __dir__)
30
+ RUST_CORE_VENDOR = File.expand_path('ext/phonetics_ruby/vendor/phonetics', __dir__)
31
+
32
# Copy the canonical Rust core (RUST_CORE_SRC) into the gem-local
# vendor directory (RUST_CORE_VENDOR), replacing any previous copy.
#
# Does nothing when RUST_CORE_SRC is not a directory; in that case a
# notice is written to stderr only when ENV['VERBOSE'] is set.
def vendor_rust_core
  # File.directory? answers with a plain true/false (never nil), so
  # this is the one and only guard for a missing source tree.
  unless File.directory?(RUST_CORE_SRC)
    warn "(skipping vendor: #{RUST_CORE_SRC} not present)" if ENV['VERBOSE']
    return
  end

  # Start from an empty destination so files deleted upstream do not
  # linger in the vendored copy.
  FileUtils.rm_rf(RUST_CORE_VENDOR)
  FileUtils.mkdir_p(File.dirname(RUST_CORE_VENDOR))
  FileUtils.cp_r(RUST_CORE_SRC, RUST_CORE_VENDOR)
  # Strip the CLI bin and any stale build artifacts.
  FileUtils.rm_rf(File.join(RUST_CORE_VENDOR, 'src', 'bin'))
  FileUtils.rm_rf(File.join(RUST_CORE_VENDOR, 'target'))
end
14
46
 
15
- PHONETIC_COST_C_EXTENSION = File.join(EXT_PATH, 'phonetic_cost.c')
16
- NEXT_PHONEME_LENGTH_C_EXTENSION = File.join(EXT_PATH, 'next_phoneme_length.c')
47
+ # The downstream Cargo workspace at ruby/Cargo.toml lists the ext
48
+ # crate as a member, and that crate's Cargo.toml has a path
49
+ # dependency on vendor/phonetics. `cargo metadata` (which
50
+ # `rb_sys/extensiontask` runs at load time) would fail before our
51
+ # tasks even register. Vendor synchronously here, before any
52
+ # require touches cargo.
53
+ vendor_rust_core
17
54
 
18
- require_relative './lib/phonetics/code_generator'
55
+ require 'rb_sys/extensiontask'
56
+ require 'rspec/core/rake_task'
19
57
 
20
- desc 'Write phonetic_cost.c using Phonetic values'
21
- task PHONETIC_COST_C_EXTENSION do
22
- file = File.open(PHONETIC_COST_C_EXTENSION, 'w')
23
- Phonetics::CodeGenerator.new(file).generate_phonetic_cost_c_code
24
- puts "Wrote #{PHONETIC_COST_C_EXTENSION}"
25
- end
58
+ desc 'Re-copy the Rust core into ext/phonetics_ruby/vendor/'
59
+ task(:vendor_rust) { vendor_rust_core }
26
60
 
27
- desc 'Write phonemes.c using a lookup table of byte arrays'
28
- task NEXT_PHONEME_LENGTH_C_EXTENSION do
29
- file = File.open(NEXT_PHONEME_LENGTH_C_EXTENSION, 'w')
30
- Phonetics::CodeGenerator.new(file).generate_next_phoneme_length_c_code
31
- puts "Wrote #{NEXT_PHONEME_LENGTH_C_EXTENSION}"
61
+ GEMSPEC = Gem::Specification.load(File.expand_path('phonetics.gemspec', __dir__))
62
+
63
+ RbSys::ExtensionTask.new('phonetics_ruby', GEMSPEC) do |ext|
64
+ ext.lib_dir = 'lib/phonetics'
32
65
  end
33
66
 
34
- task compile: PHONETIC_COST_C_EXTENSION
35
- task compile: NEXT_PHONEME_LENGTH_C_EXTENSION
67
+ # build / compile both depend on a fresh vendor copy.
68
+ task compile: :vendor_rust
69
+ task build: :vendor_rust
36
70
 
37
71
  RSpec::Core::RakeTask.new(:spec)
38
72
 
39
- RuboCop::RakeTask.new
40
-
41
- task default: [:compile, :rubocop, :spec]
73
+ task default: %i[compile spec]
data/VERSION CHANGED
@@ -1 +1 @@
1
- 3.2.0
1
+ 4.0.0
data/bin/phonetics ADDED
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Phonetics CLI — implementation-agnostic contract.
5
+ #
6
+ # The integration tests in spec/phonetics/cli_spec.rb shell out to this
7
+ # binary (or any other one — set PHONETICS_BIN). What matters is
8
+ # stdin/stdout/exit-code behaviour, not which language implements the
9
+ # Phonetics module. With the Rust port live, this script is a thin
10
+ # layer over the Magnus binding.
11
+
12
+ $stdout.set_encoding('UTF-8')
13
+ $stderr.set_encoding('UTF-8')
14
+
15
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
16
+ require 'phonetics'
17
+
18
+ USAGE = <<~USAGE
19
+ Usage: phonetics <command> [args...]
20
+
21
+ Phrase-level distances (input: IPA strings, possibly with spaces):
22
+ distance <ipa1> <ipa2> Strict phonetic Levenshtein.
23
+ confusion <ipa1> <ipa2> Listener-confusion distance.
24
+ similarity <ipa1> <ipa2> 0..1 normalised similarity.
25
+
26
+ Phoneme-level distances (input: single phonemes):
27
+ phoneme <a> <b> Acoustic distance.
28
+ phoneme-conf <a> <b> Perceptual distance (with overlay).
29
+
30
+ Tokenisation:
31
+ tokenize [--boundaries] <ipa> Phoneme stream, one per line.
32
+
33
+ Numeric output is a single line. With --json, output is a JSON object.
34
+ USAGE
35
+
36
# Write +message+ to stderr, then terminate the process with exit
# status +code+ (2 unless the caller passes something else).
def die(message, code = 2)
  warn(message)
  exit(code)
end
40
+
41
# Render a numeric result for printing.
#
# With a truthy +json+ the result is a JSON object with a single
# "value" key; otherwise it is the value's #to_s. The JSON library
# must already be loaded by the caller when +json+ is truthy.
def format_number(value, json:)
  json ? JSON.dump(value: value) : value.to_s
end
46
+
47
# Flags may appear anywhere on the command line. Array#delete removes
# every occurrence and returns nil when the flag was not present.
json = ARGV.delete('--json') ? true : false
bounds = ARGV.delete('--boundaries') ? true : false

command, *args = ARGV
require 'json' if json

# Commands that take exactly two arguments and print a single number,
# mapped to the Phonetics module function that produces it.
numeric_commands = {
  'distance' => :levenshtein,
  'confusion' => :confusion,
  'similarity' => :similarity,
  'phoneme' => :distance,
  'phoneme-conf' => :sub_cost
}

if numeric_commands.key?(command)
  die USAGE unless args.size == 2
  result = Phonetics.public_send(numeric_commands[command], args[0], args[1])
  puts format_number(result, json: json)

elsif command == 'tokenize'
  die USAGE unless args.size == 1

  tokens = Phonetics.tokenize(args[0], boundaries: bounds)
  # Without --json, `puts` on the token list prints one token per line.
  puts(json ? JSON.dump(tokens: tokens) : tokens)

elsif [nil, '--help', '-h'].include?(command)
  # No command at all is treated the same as an explicit help request.
  puts USAGE

else
  die "phonetics: unknown command #{command.inspect}\n\n#{USAGE}"
end
@@ -0,0 +1,36 @@
1
+ # Member of the gem-local workspace at <repo>/ruby/Cargo.toml. The
2
+ # workspace exists purely so `rb_sys` can find this crate via `cargo
3
+ # metadata` from the gem root; the package is otherwise standalone
4
+ # (no shared inheritance from the Rust workspace at <repo>/rust/),
5
+ # which means it builds the same in a checkout and in a `gem
6
+ # install`-extracted source tree.
7
+ [package]
8
+ # Underscored to match the rake extension name and the cdylib that
9
+ # `Init_phonetics_ruby` expects to find. Cargo accepts both forms;
10
+ # this avoids the s/-/_/ rename Cargo otherwise applies to the
11
+ # produced artifact.
12
+ name = "phonetics_ruby"
13
+ description = "Ruby bindings for the phonetics crate, via magnus."
14
+ version = "0.2.0"
15
+ edition = "2021"
16
+ license = "MIT"
17
+ publish = false
18
+
19
+ [lib]
20
+ # cdylib so Ruby `require` can dlopen it as a native extension.
21
+ crate-type = ["cdylib"]
22
+
23
+ [dependencies]
24
+ # Vendored copy of the Rust core. Populated by `rake vendor_rust`
25
+ # from <repo>/rust/phonetics; see ruby/Rakefile. The vendor dir is
26
+ # not committed to git but IS shipped in the .gem tarball, so a
27
+ # user running `gem install phonetics` gets a self-contained build.
28
+ #
29
+ # The package on crates.io is `phonetics-rs` (the bare `phonetics`
30
+ # name is taken by an unrelated metaphone library). The `package`
31
+ # alias lets us still write `use phonetics::distance;` in our code.
32
+ phonetics = { path = "vendor/phonetics", package = "phonetics-rs" }
33
+ magnus = "0.8"
34
+
35
+ [build-dependencies]
36
+ rb-sys-env = "0.2"
@@ -0,0 +1,24 @@
1
+ fn main() {
2
+ // rb-sys-env 0.2 reads RUBY_VERSION, but the rb_sys gem's mkmf
3
+ // Makefile only exports the split form (RBCONFIG_MAJOR/MINOR/TEENY).
4
+ // Reconstruct it from RBCONFIG_* when needed so both invocation
5
+ // paths (standalone `cargo build` and gem-extconf-driven `make`)
6
+ // work without the caller having to know which one to set.
7
+ if std::env::var_os("RUBY_VERSION").is_none() {
8
+ if let (Ok(major), Ok(minor), Ok(teeny)) = (
9
+ std::env::var("RBCONFIG_MAJOR"),
10
+ std::env::var("RBCONFIG_MINOR"),
11
+ std::env::var("RBCONFIG_TEENY"),
12
+ ) {
13
+ std::env::set_var("RUBY_VERSION", format!("{major}.{minor}.{teeny}"));
14
+ }
15
+ }
16
+
17
+ let _ = rb_sys_env::activate().expect("phonetics-ruby: cannot locate a Ruby");
18
+
19
+ // macOS native-extension linker behaviour: leave Ruby's `rb_*`
20
+ // symbols unresolved until dlopen time.
21
+ if cfg!(target_os = "macos") {
22
+ println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
23
+ }
24
+ }
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Build the Rust-backed native extension that replaces the previous
4
+ # hand-written C codegen.
5
+ #
6
+ # rb_sys/mkmf generates a Makefile that invokes `cargo rustc` against
7
+ # the Cargo.toml sibling to this file. That crate is a member of the
8
+ # gem-local workspace at <repo>/ruby/Cargo.toml and declares its own
9
+ # version and edition, but the build pipeline that produces the
10
+ # Ruby-loadable cdylib lives here where conventional gem tooling
11
+ # expects it.
12
+ require 'mkmf'
13
+ require 'rb_sys/mkmf'
14
+
15
+ create_rust_makefile('phonetics/phonetics_ruby') do |r|
16
+ r.profile = :release
17
+ end
@@ -0,0 +1,56 @@
1
+ //! Magnus bindings around the `phonetics` core crate.
2
+ //!
3
+ //! Loaded into Ruby as `phonetics_ruby.bundle` / `.so`. The
4
+ //! gem-installed library calls `require 'phonetics/phonetics_ruby'`
5
+ //! and gets a populated `Phonetics` module with module functions
6
+ //! matching what the previous hand-written C extension exposed:
7
+ //!
8
+ //! Phonetics.distance(a, b) acoustic per-phoneme
9
+ //! Phonetics.confusion(a, b) listener-confusion distance
10
+ //! Phonetics.levenshtein(a, b) strict edit distance
11
+ //! Phonetics.similarity(a, b) normalised 0..1
12
+ //! Phonetics.sub_cost(a, b) perceptual per-phoneme
13
+ //! Phonetics._tokenize(input, boundaries) phoneme stream (registered with a leading underscore)
14
+ //!
15
+ //! All real work lives in the `phonetics` core crate. This file is
16
+ //! the impedance-matching layer between Rust types and Ruby's ABI.
17
+
18
+ use magnus::{function, Error, Ruby};
19
+
20
/// Extension entry point (marked `#[magnus::init]`): defines the
/// `Phonetics` module and registers the native module functions on it.
///
/// Every function is registered with an arity of 2. The tokenizer is
/// registered under the name `_tokenize`, not `tokenize`.
///
/// # Errors
///
/// Propagates the first error returned while defining the module or
/// any of its functions.
#[magnus::init]
fn init(ruby: &Ruby) -> Result<(), Error> {
    let phonetics = ruby.define_module("Phonetics")?;

    phonetics.define_module_function("distance", function!(distance, 2))?;
    phonetics.define_module_function("confusion", function!(confusion, 2))?;
    phonetics.define_module_function("levenshtein", function!(levenshtein, 2))?;
    phonetics.define_module_function("similarity", function!(similarity, 2))?;
    phonetics.define_module_function("sub_cost", function!(sub_cost, 2))?;
    // Leading underscore: this is the raw two-positional-argument form.
    phonetics.define_module_function("_tokenize", function!(tokenize, 2))?;

    Ok(())
}
33
+
34
/// Registered as `Phonetics.distance`; forwards both arguments to
/// `phonetics::distance` and returns its `f64` result unchanged.
fn distance(a: String, b: String) -> f64 {
    phonetics::distance(&a, &b)
}
37
+
38
/// Registered as `Phonetics.confusion`; forwards both arguments to
/// `phonetics::confusion` and returns its `f64` result unchanged.
fn confusion(a: String, b: String) -> f64 {
    phonetics::confusion(&a, &b)
}
41
+
42
/// Registered as `Phonetics.levenshtein`; forwards both arguments to
/// `phonetics::levenshtein` and returns its `f64` result unchanged.
fn levenshtein(a: String, b: String) -> f64 {
    phonetics::levenshtein(&a, &b)
}
45
+
46
/// Registered as `Phonetics.similarity`; forwards both arguments to
/// `phonetics::similarity` and returns its `f64` result unchanged.
fn similarity(a: String, b: String) -> f64 {
    phonetics::similarity(&a, &b)
}
49
+
50
/// Registered as `Phonetics.sub_cost`; forwards both arguments to
/// `phonetics::confusion::sub_cost` (a function inside the core
/// crate's `confusion` module) and returns its `f64` result unchanged.
fn sub_cost(a: String, b: String) -> f64 {
    phonetics::confusion::sub_cost(&a, &b)
}
53
+
54
/// Registered on the Ruby side as `Phonetics._tokenize`; forwards
/// `input` and the `boundaries` flag to `phonetics::tokens` and returns
/// the resulting list of strings unchanged.
fn tokenize(input: String, boundaries: bool) -> Vec<String> {
    phonetics::tokens(&input, boundaries)
}
@@ -0,0 +1,30 @@
1
+ [package]
2
+ # `phonetics` on crates.io is taken (different project — a metaphone/
3
+ # NYSIIS encoder). We publish under `phonetics-rs`, the conventional
4
+ # fallback when a bare project name is unavailable. The Rust API is
5
+ # still `use phonetics::distance;` because `lib.name` below stays
6
+ # `phonetics`; only the crates.io package name changes.
7
+ name = "phonetics-rs"
8
+ description = "IPA-based phonetic distance metrics: strict edit distance, listener-confusion distance, and per-phoneme acoustic and perceptual scoring. Calibrated against Mad Gab puzzle data; tunable per dialect."
9
+ version.workspace = true
10
+ edition.workspace = true
11
+ license.workspace = true
12
+ repository.workspace = true
13
+ homepage.workspace = true
14
+ authors.workspace = true
15
+ readme = "README.md"
16
+ keywords = ["phonetics", "ipa", "linguistics", "edit-distance", "speech"]
17
+ categories = ["text-processing", "science"]
18
+
19
+ [lints]
20
+ workspace = true
21
+
22
+ # Keep the Rust API name as `phonetics` even though the registry name
23
+ # is `phonetics-rs`. Downstream code stays `use phonetics::distance;`.
24
+ [lib]
25
+ name = "phonetics"
26
+
27
+ [dependencies]
28
+
29
+ [dev-dependencies]
30
+ # Empty for now; will gain criterion + insta as the surface grows.
@@ -0,0 +1,29 @@
1
+ # phonetics
2
+
3
+ IPA-based phonetic distance metrics for Rust.
4
+
5
+ Two scores live in this crate:
6
+
7
+ * **`distance(a, b)`** — strict per-phoneme acoustic distance over IPA
8
+ phonemes (Bark-space vowel distance, 2D consonant place embedding,
9
+ approximant–vowel bridge, diphthong/affricate handling).
10
+ * **`confusion(s1, s2)`** — listener-confusion distance over
11
+ whole phonemic strings, calibrated against Mad Gab puzzle data and
12
+ English speech-perception literature.
13
+
14
+ The split is the point. The first is a claim about the waveform; the
15
+ second is a claim about how a listener parses it. They give different
16
+ answers to different questions.
17
+
18
+ This is a port of the Ruby gem of the same name; see the parent
19
+ repository for an extended write-up of the metric design.
20
+
21
+ ## Status
22
+
23
+ Pre-1.0. Core data tables and the per-phoneme acoustic distance are
24
+ parity-tested against the Ruby reference. Levenshtein and Confusion DPs
25
+ are landing module-by-module.
26
+
27
+ ## License
28
+
29
+ MIT.
@@ -0,0 +1,40 @@
1
+ //! Compound phonemes — diphthongs and affricates that are recognised
2
+ //! as single perceptual units even though their IPA notation is two
3
+ //! characters.
4
+ //!
5
+ //! Distance for a compound vs another phoneme is the average of the
6
+ //! pairwise component distances; the shorter side is padded by
7
+ //! repeating its last segment so /aɪ/ vs /a/ charges half a phoneme
8
+ //! distance rather than nothing.
9
+
10
/// Every registered compound phoneme symbol.
///
/// Lists the same thirteen symbols that `components` matches on.
/// Nothing in this file derives one list from the other, so keep the
/// two in sync when editing either.
pub const INVENTORY: &[&str] = &[
    // Diphthongs
    "aɪ","ɑɪ","aʊ","ɑʊ","ɔɪ","eɪ","oʊ","əʊ","ɪə","ʊə","ɛə",
    // English affricates
    "tʃ","dʒ",
];
17
+
18
/// Split a compound phoneme symbol into its two component phonemes.
///
/// Returns `Some` with the component slice when `symbol` is one of the
/// registered diphthongs or affricates, and `None` for anything else.
pub fn components(symbol: &str) -> Option<&'static [&'static str]> {
    const TABLE: &[(&str, &[&str])] = &[
        // English-style diphthongs (both /aɪ/-form and /ɑɪ/-form).
        ("aɪ", &["a", "ɪ"]),
        ("ɑɪ", &["ɑ", "ɪ"]),
        ("aʊ", &["a", "ʊ"]),
        ("ɑʊ", &["ɑ", "ʊ"]),
        ("ɔɪ", &["ɔ", "ɪ"]),
        ("eɪ", &["e", "ɪ"]),
        ("oʊ", &["o", "ʊ"]),
        ("əʊ", &["ə", "ʊ"]),
        ("ɪə", &["ɪ", "ə"]),
        ("ʊə", &["ʊ", "ə"]),
        ("ɛə", &["ɛ", "ə"]),
        // English affricates only. /ts/, /dz/ etc. would mis-tokenise
        // English plurals (cats, kids) as single phonemes; left out.
        ("tʃ", &["t", "ʃ"]),
        ("dʒ", &["d", "ʒ"]),
    ];

    TABLE
        .iter()
        .find(|(compound, _)| *compound == symbol)
        .map(|&(_, parts)| parts)
}