phonetics 3.0.9 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -2
- data/CHANGELOG +4 -0
- data/Cargo.toml +27 -0
- data/Rakefile +58 -26
- data/VERSION +1 -1
- data/bin/phonetics +89 -0
- data/ext/phonetics_ruby/Cargo.toml +36 -0
- data/ext/phonetics_ruby/build.rs +24 -0
- data/ext/phonetics_ruby/extconf.rb +17 -0
- data/ext/phonetics_ruby/src/lib.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/Cargo.toml +30 -0
- data/ext/phonetics_ruby/vendor/phonetics/README.md +29 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/compounds.rs +40 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/confusion.rs +325 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/consonants.rs +363 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/cross_class.rs +56 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/diacritics.rs +113 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/distance.rs +183 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/levenshtein.rs +146 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/lib.rs +44 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/symbols.rs +21 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/tokenizer.rs +171 -0
- data/ext/phonetics_ruby/vendor/phonetics/src/vowels.rs +197 -0
- data/lib/phonetics.rb +77 -2
- data/phonetics.gemspec +33 -9
- metadata +46 -34
- data/.github/workflows/gempush.yml +0 -28
- data/.github/workflows/test.yml +0 -20
- data/Makefile +0 -6
- data/ext/c_levenshtein/extconf.rb +0 -10
- data/ext/c_levenshtein/levenshtein.c +0 -223
- data/ext/c_levenshtein/next_phoneme_length.c +0 -1365
- data/ext/c_levenshtein/next_phoneme_length.h +0 -1
- data/ext/c_levenshtein/phonemes.c +0 -53
- data/ext/c_levenshtein/phonemes.h +0 -3
- data/ext/c_levenshtein/phonetic_cost.c +0 -88593
- data/ext/c_levenshtein/phonetic_cost.h +0 -1
- data/lib/phonetics/code_generator.rb +0 -228
- data/lib/phonetics/distances.rb +0 -245
- data/lib/phonetics/levenshtein.rb +0 -27
- data/lib/phonetics/ruby_levenshtein.rb +0 -162
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c174999459702990b87d739037c9fbd2d99d3be2bda047b9b381d9d2432e9670
|
|
4
|
+
data.tar.gz: 89416b41cf47c0db96186575ac9e133d49e5d81fec15ebec02e864526a0a77af
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6b5bd8339e1e3aba8cbc4599f97588ff046e074832ae84657aba4988d4fe6ef1e06f56a4a8b9c73cb54c130702d459d59257cbdb967396aef9eee8bcd2b12279
|
|
7
|
+
data.tar.gz: ba950dfbd243de733de5230336a61b6f03a1726371c21112e9765f76d854929084a151d97d8c49d68f593ef7a787c91a82bae70a29becd0297ebc84a7b730e26
|
data/.gitignore
CHANGED
|
@@ -10,8 +10,23 @@
|
|
|
10
10
|
# rspec failure tracking
|
|
11
11
|
.rspec_status
|
|
12
12
|
.ruby-version
|
|
13
|
-
.ruby-version
|
|
14
13
|
*.so
|
|
15
14
|
Gemfile.lock
|
|
16
15
|
*.bundle
|
|
17
|
-
.
|
|
16
|
+
*.bundle.dSYM
|
|
17
|
+
|
|
18
|
+
# Rust extension build artifacts (mkmf-generated)
|
|
19
|
+
ext/phonetics_ruby/Makefile
|
|
20
|
+
ext/phonetics_ruby/Cargo.lock
|
|
21
|
+
ext/phonetics_ruby/target/
|
|
22
|
+
ext/phonetics_ruby/mkmf.log
|
|
23
|
+
|
|
24
|
+
# Vendored Rust core. Populated by `rake vendor_rust`; shipped in
|
|
25
|
+
# the .gem tarball but not tracked in git (single source of truth
|
|
26
|
+
# lives at <repo>/rust/phonetics).
|
|
27
|
+
ext/phonetics_ruby/vendor/
|
|
28
|
+
|
|
29
|
+
# Cargo workspace build artifacts (the gem-local workspace at
|
|
30
|
+
# ruby/Cargo.toml puts its target/ at ruby/target).
|
|
31
|
+
target/
|
|
32
|
+
Cargo.lock
|
data/CHANGELOG
ADDED
data/Cargo.toml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Cargo workspace declaration for the Ruby gem. Exists so that
|
|
2
|
+
# `rb_sys/extensiontask` can find the native extension's manifest by
|
|
3
|
+
# walking up from the gem root, and so cargo resolves the vendored
|
|
4
|
+
# `phonetics` core path dependency.
|
|
5
|
+
#
|
|
6
|
+
# The workspace.package values mirror what the Rust workspace at
|
|
7
|
+
# <repo>/rust/ defines; they're declared here so the vendored copy
|
|
8
|
+
# of the Rust core (which inherits via `.workspace = true`) resolves
|
|
9
|
+
# correctly inside the gem.
|
|
10
|
+
[workspace]
|
|
11
|
+
resolver = "2"
|
|
12
|
+
members = [
|
|
13
|
+
"ext/phonetics_ruby",
|
|
14
|
+
"ext/phonetics_ruby/vendor/phonetics",
|
|
15
|
+
]
|
|
16
|
+
exclude = []
|
|
17
|
+
|
|
18
|
+
[workspace.package]
|
|
19
|
+
version = "0.2.0"
|
|
20
|
+
edition = "2021"
|
|
21
|
+
license = "MIT"
|
|
22
|
+
repository = "https://github.com/JackDanger/phonetics"
|
|
23
|
+
homepage = "https://github.com/JackDanger/phonetics"
|
|
24
|
+
authors = ["Jack Danger <github@jackcanty.com>"]
|
|
25
|
+
|
|
26
|
+
[workspace.lints.rust]
|
|
27
|
+
unused_must_use = "deny"
|
data/Rakefile
CHANGED
|
@@ -1,41 +1,73 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Rake tasks for the Rust-backed Ruby gem.
|
|
4
|
+
#
|
|
5
|
+
# rake compile build the native extension via cargo
|
|
6
|
+
# rake spec run rspec
|
|
7
|
+
# rake build assemble the source gem (vendor + package)
|
|
8
|
+
#
|
|
9
|
+
# Cross-platform native gem builds use rb-sys-dock; see
|
|
10
|
+
# .github/workflows/release.yml for the published recipe.
|
|
11
|
+
|
|
12
|
+
require 'fileutils'
|
|
3
13
|
require 'bundler/gem_tasks'
|
|
4
|
-
require 'rake/extensiontask'
|
|
5
|
-
require 'rspec/core/rake_task'
|
|
6
|
-
require 'rubocop/rake_task'
|
|
7
14
|
|
|
8
|
-
|
|
15
|
+
# ---------------------------------------------------------------
|
|
16
|
+
# Vendoring of the Rust core into the gem directory.
|
|
17
|
+
#
|
|
18
|
+
# A gem can't ship files from outside its own directory (rubygems
|
|
19
|
+
# policy), and `cargo` can't follow a path dependency that points
|
|
20
|
+
# up into the parent of a `gem install`-extracted source tree. So
|
|
21
|
+
# we keep the canonical Rust core at <repo>/rust/phonetics and
|
|
22
|
+
# copy it into <repo>/ruby/ext/phonetics_ruby/vendor/ for builds
|
|
23
|
+
# that need to ship it (gem-build, gem-install, CI).
|
|
24
|
+
#
|
|
25
|
+
# The vendor copy is git-ignored — single source of truth is the
|
|
26
|
+
# /rust/ tree. It IS bundled into the .gem tarball at build time.
|
|
27
|
+
# ---------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
RUST_CORE_SRC = File.expand_path('../rust/phonetics', __dir__)
|
|
30
|
+
RUST_CORE_VENDOR = File.expand_path('ext/phonetics_ruby/vendor/phonetics', __dir__)
|
|
31
|
+
|
|
32
|
+
# Copy the canonical Rust core (RUST_CORE_SRC) into the gem-local vendor
# directory (RUST_CORE_VENDOR), replacing whatever copy was there before.
#
# Does nothing when RUST_CORE_SRC is not a directory; with VERBOSE set in
# the environment, a notice is written to stderr in that case.
def vendor_rust_core
  # File.directory? only ever returns true or false, so this single guard
  # covers the "source tree is absent" case completely.
  unless File.directory?(RUST_CORE_SRC)
    warn "(skipping vendor: #{RUST_CORE_SRC} not present)" if ENV['VERBOSE']
    return
  end

  # Remove the old copy first so the vendor tree is rebuilt from scratch.
  FileUtils.rm_rf(RUST_CORE_VENDOR)
  FileUtils.mkdir_p(File.dirname(RUST_CORE_VENDOR))
  FileUtils.cp_r(RUST_CORE_SRC, RUST_CORE_VENDOR)
  # Strip the CLI bin and any stale build artifacts.
  FileUtils.rm_rf(File.join(RUST_CORE_VENDOR, 'src', 'bin'))
  FileUtils.rm_rf(File.join(RUST_CORE_VENDOR, 'target'))
end
|
|
14
46
|
|
|
15
|
-
|
|
16
|
-
|
|
47
|
+
# The downstream Cargo workspace at ruby/Cargo.toml lists the ext
|
|
48
|
+
# crate as a member, and that crate's Cargo.toml has a path
|
|
49
|
+
# dependency on vendor/phonetics. Without that copy, `cargo metadata`
|
|
50
|
+
# (which `rb_sys/extensiontask` runs at load time) would fail before our
|
|
51
|
+
# tasks even register. Vendor synchronously here, before any
|
|
52
|
+
# require touches cargo.
|
|
53
|
+
vendor_rust_core
|
|
17
54
|
|
|
18
|
-
|
|
55
|
+
require 'rb_sys/extensiontask'
|
|
56
|
+
require 'rspec/core/rake_task'
|
|
19
57
|
|
|
20
|
-
desc '
|
|
21
|
-
task
|
|
22
|
-
file = File.open(PHONETIC_COST_C_EXTENSION, 'w')
|
|
23
|
-
Phonetics::CodeGenerator.new(file).generate_phonetic_cost_c_code
|
|
24
|
-
puts "Wrote #{PHONETIC_COST_C_EXTENSION}"
|
|
25
|
-
end
|
|
58
|
+
desc 'Re-copy the Rust core into ext/phonetics_ruby/vendor/'
|
|
59
|
+
task(:vendor_rust) { vendor_rust_core }
|
|
26
60
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
puts "Wrote #{NEXT_PHONEME_LENGTH_C_EXTENSION}"
|
|
61
|
+
GEMSPEC = Gem::Specification.load(File.expand_path('phonetics.gemspec', __dir__))
|
|
62
|
+
|
|
63
|
+
RbSys::ExtensionTask.new('phonetics_ruby', GEMSPEC) do |ext|
|
|
64
|
+
ext.lib_dir = 'lib/phonetics'
|
|
32
65
|
end
|
|
33
66
|
|
|
34
|
-
|
|
35
|
-
task compile:
|
|
67
|
+
# build / compile both depend on a fresh vendor copy.
|
|
68
|
+
task compile: :vendor_rust
|
|
69
|
+
task build: :vendor_rust
|
|
36
70
|
|
|
37
71
|
RSpec::Core::RakeTask.new(:spec)
|
|
38
72
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
task default: [:compile, :rubocop, :spec]
|
|
73
|
+
task default: %i[compile spec]
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
4.0.0
|
data/bin/phonetics
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Phonetics CLI — implementation-agnostic contract.
|
|
5
|
+
#
|
|
6
|
+
# The integration tests in spec/phonetics/cli_spec.rb shell out to this
|
|
7
|
+
# binary (or any other one — set PHONETICS_BIN). What matters is
|
|
8
|
+
# stdin/stdout/exit-code behaviour, not which language fields the
|
|
9
|
+
# Phonetics module. With the Rust port live, this script is a thin
|
|
10
|
+
# layer over the Magnus binding.
|
|
11
|
+
|
|
12
|
+
$stdout.set_encoding('UTF-8')
|
|
13
|
+
$stderr.set_encoding('UTF-8')
|
|
14
|
+
|
|
15
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
|
|
16
|
+
require 'phonetics'
|
|
17
|
+
|
|
18
|
+
USAGE = <<~USAGE
|
|
19
|
+
Usage: phonetics <command> [args...]
|
|
20
|
+
|
|
21
|
+
Phrase-level distances (input: IPA strings, possibly with spaces):
|
|
22
|
+
distance <ipa1> <ipa2> Strict phonetic Levenshtein.
|
|
23
|
+
confusion <ipa1> <ipa2> Listener-confusion distance.
|
|
24
|
+
similarity <ipa1> <ipa2> 0..1 normalised similarity.
|
|
25
|
+
|
|
26
|
+
Phoneme-level distances (input: single phonemes):
|
|
27
|
+
phoneme <a> <b> Acoustic distance.
|
|
28
|
+
phoneme-conf <a> <b> Perceptual distance (with overlay).
|
|
29
|
+
|
|
30
|
+
Tokenisation:
|
|
31
|
+
tokenize [--boundaries] <ipa> Phoneme stream, one per line.
|
|
32
|
+
|
|
33
|
+
Numeric output is a single line. With --json, output is a JSON object.
|
|
34
|
+
USAGE
|
|
35
|
+
|
|
36
|
+
# Report +message+ on stderr, then terminate the process with exit
# status +code+ (2 unless the caller says otherwise).
def die(message, code = 2)
  Kernel.warn(message)
  Kernel.exit(code)
end
|
|
40
|
+
|
|
41
|
+
# Render a numeric result for printing: a JSON object with a single
# "value" key when +json+ is truthy, otherwise the value's #to_s form.
def format_number(value, json:)
  json ? JSON.dump(value: value) : value.to_s
end
|
|
46
|
+
|
|
47
|
+
json = !ARGV.delete('--json').nil?
|
|
48
|
+
bounds = !ARGV.delete('--boundaries').nil?
|
|
49
|
+
|
|
50
|
+
command, *args = ARGV
|
|
51
|
+
require 'json' if json
|
|
52
|
+
|
|
53
|
+
# Two-argument commands, mapped to the Phonetics module function that
# produces their numeric result.
pairwise = {
  'distance' => :levenshtein,
  'confusion' => :confusion,
  'similarity' => :similarity,
  'phoneme' => :distance,
  'phoneme-conf' => :sub_cost
}

if pairwise.key?(command)
  # Every numeric command takes exactly two operands.
  die USAGE unless args.size == 2
  puts format_number(Phonetics.public_send(pairwise[command], args[0], args[1]), json: json)
elsif command == 'tokenize'
  die USAGE unless args.size == 1

  tokens = Phonetics.tokenize(args[0], boundaries: bounds)
  # Plain mode prints one token per line; JSON mode wraps them in an object.
  puts(json ? JSON.dump(tokens: tokens) : tokens)
elsif [nil, '--help', '-h'].include?(command)
  # No command at all is treated like an explicit request for help.
  puts USAGE
else
  die "phonetics: unknown command #{command.inspect}\n\n#{USAGE}"
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Member of the gem-local workspace at <repo>/ruby/Cargo.toml. The
|
|
2
|
+
# workspace exists purely so `rb_sys` can find this crate via `cargo
|
|
3
|
+
# metadata` from the gem root; the package is otherwise standalone
|
|
4
|
+
# (no shared inheritance from the Rust workspace at <repo>/rust/),
|
|
5
|
+
# which means it builds the same in a checkout and in a `gem
|
|
6
|
+
# install`-extracted source tree.
|
|
7
|
+
[package]
|
|
8
|
+
# Underscored to match the rake extension name and the cdylib that
|
|
9
|
+
# `Init_phonetics_ruby` expects to find. Cargo accepts both forms;
|
|
10
|
+
# this avoids the s/-/_/ rename Cargo otherwise applies to the
|
|
11
|
+
# produced artifact.
|
|
12
|
+
name = "phonetics_ruby"
|
|
13
|
+
description = "Ruby bindings for the phonetics crate, via magnus."
|
|
14
|
+
version = "0.2.0"
|
|
15
|
+
edition = "2021"
|
|
16
|
+
license = "MIT"
|
|
17
|
+
publish = false
|
|
18
|
+
|
|
19
|
+
[lib]
|
|
20
|
+
# cdylib so Ruby `require` can dlopen it as a native extension.
|
|
21
|
+
crate-type = ["cdylib"]
|
|
22
|
+
|
|
23
|
+
[dependencies]
|
|
24
|
+
# Vendored copy of the Rust core. Populated by `rake vendor_rust`
|
|
25
|
+
# from <repo>/rust/phonetics; see ruby/Rakefile. The vendor dir is
|
|
26
|
+
# not committed to git but IS shipped in the .gem tarball, so a
|
|
27
|
+
# user running `gem install phonetics` gets a self-contained build.
|
|
28
|
+
#
|
|
29
|
+
# The package on crates.io is `phonetics-rs` (the bare `phonetics`
|
|
30
|
+
# name is taken by an unrelated metaphone library). The `package`
|
|
31
|
+
# alias lets us still write `use phonetics::distance;` in our code.
|
|
32
|
+
phonetics = { path = "vendor/phonetics", package = "phonetics-rs" }
|
|
33
|
+
magnus = "0.8"
|
|
34
|
+
|
|
35
|
+
[build-dependencies]
|
|
36
|
+
rb-sys-env = "0.2"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
fn main() {
|
|
2
|
+
// rb-sys-env 0.2 reads RUBY_VERSION, but the rb_sys gem's mkmf
|
|
3
|
+
// Makefile only exports the split form (RBCONFIG_MAJOR/MINOR/TEENY).
|
|
4
|
+
// Reconstruct it from RBCONFIG_* when needed so both invocation
|
|
5
|
+
// paths (standalone `cargo build` and gem-extconf-driven `make`)
|
|
6
|
+
// work without the caller having to know which one to set.
|
|
7
|
+
if std::env::var_os("RUBY_VERSION").is_none() {
|
|
8
|
+
if let (Ok(major), Ok(minor), Ok(teeny)) = (
|
|
9
|
+
std::env::var("RBCONFIG_MAJOR"),
|
|
10
|
+
std::env::var("RBCONFIG_MINOR"),
|
|
11
|
+
std::env::var("RBCONFIG_TEENY"),
|
|
12
|
+
) {
|
|
13
|
+
std::env::set_var("RUBY_VERSION", format!("{major}.{minor}.{teeny}"));
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
let _ = rb_sys_env::activate().expect("phonetics-ruby: cannot locate a Ruby");
|
|
18
|
+
|
|
19
|
+
// macOS native-extension linker behaviour: leave Ruby's `rb_*`
|
|
20
|
+
// symbols unresolved until dlopen time.
|
|
21
|
+
if cfg!(target_os = "macos") {
|
|
22
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Build the Rust-backed native extension that replaces the previous
|
|
4
|
+
# hand-written C codegen.
|
|
5
|
+
#
|
|
6
|
+
# rb_sys/mkmf generates a Makefile that invokes `cargo rustc` against
|
|
7
|
+
# the Cargo.toml sibling to this file. That crate is a member of the
|
|
8
|
+
# gem-local workspace at <repo>/ruby/Cargo.toml (not <repo>/rust/) and
|
|
9
|
+
# declares its own version and edition; the build pipeline that produces
|
|
10
|
+
# the Ruby-loadable cdylib lives here, where conventional gem tooling
|
|
11
|
+
# expects it.
|
|
12
|
+
require 'mkmf'
|
|
13
|
+
require 'rb_sys/mkmf'
|
|
14
|
+
|
|
15
|
+
create_rust_makefile('phonetics/phonetics_ruby') do |r|
|
|
16
|
+
r.profile = :release
|
|
17
|
+
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
//! Magnus bindings around the `phonetics` core crate.
|
|
2
|
+
//!
|
|
3
|
+
//! Loaded into Ruby as `phonetics_ruby.bundle` / `.so`. The
|
|
4
|
+
//! gem-installed library calls `require 'phonetics/phonetics_ruby'`
|
|
5
|
+
//! and gets a populated `Phonetics` module with module functions
|
|
6
|
+
//! matching what the previous hand-written C extension exposed:
|
|
7
|
+
//!
|
|
8
|
+
//! Phonetics.distance(a, b) acoustic per-phoneme
|
|
9
|
+
//! Phonetics.confusion(a, b) listener-confusion distance
|
|
10
|
+
//! Phonetics.levenshtein(a, b) strict edit distance
|
|
11
|
+
//! Phonetics.similarity(a, b) normalised 0..1
|
|
12
|
+
//! Phonetics.sub_cost(a, b) perceptual per-phoneme
|
|
13
|
+
//! Phonetics._tokenize(input, boundaries) phoneme stream (raw binding; registered with a leading underscore)
|
|
14
|
+
//!
|
|
15
|
+
//! All real work lives in the `phonetics` core crate. This file is
|
|
16
|
+
//! the impedance-matching layer between Rust types and Ruby's ABI.
|
|
17
|
+
|
|
18
|
+
use magnus::{function, Error, Ruby};
|
|
19
|
+
|
|
20
|
+
#[magnus::init]
|
|
21
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
22
|
+
let phonetics = ruby.define_module("Phonetics")?;
|
|
23
|
+
|
|
24
|
+
phonetics.define_module_function("distance", function!(distance, 2))?;
|
|
25
|
+
phonetics.define_module_function("confusion", function!(confusion, 2))?;
|
|
26
|
+
phonetics.define_module_function("levenshtein", function!(levenshtein, 2))?;
|
|
27
|
+
phonetics.define_module_function("similarity", function!(similarity, 2))?;
|
|
28
|
+
phonetics.define_module_function("sub_cost", function!(sub_cost, 2))?;
|
|
29
|
+
phonetics.define_module_function("_tokenize", function!(tokenize, 2))?;
|
|
30
|
+
|
|
31
|
+
Ok(())
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
fn distance(a: String, b: String) -> f64 {
|
|
35
|
+
phonetics::distance(&a, &b)
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
fn confusion(a: String, b: String) -> f64 {
|
|
39
|
+
phonetics::confusion(&a, &b)
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
fn levenshtein(a: String, b: String) -> f64 {
|
|
43
|
+
phonetics::levenshtein(&a, &b)
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
fn similarity(a: String, b: String) -> f64 {
|
|
47
|
+
phonetics::similarity(&a, &b)
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
fn sub_cost(a: String, b: String) -> f64 {
|
|
51
|
+
phonetics::confusion::sub_cost(&a, &b)
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
fn tokenize(input: String, boundaries: bool) -> Vec<String> {
|
|
55
|
+
phonetics::tokens(&input, boundaries)
|
|
56
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
# `phonetics` on crates.io is taken (different project — a metaphone/
|
|
3
|
+
# NYSIIS encoder). We publish under `phonetics-rs`, the conventional
|
|
4
|
+
# fallback when a bare project name is unavailable. The Rust API is
|
|
5
|
+
# still `use phonetics::distance;` because `lib.name` below stays
|
|
6
|
+
# `phonetics`; only the crates.io package name changes.
|
|
7
|
+
name = "phonetics-rs"
|
|
8
|
+
description = "IPA-based phonetic distance metrics: strict edit distance, listener-confusion distance, and per-phoneme acoustic and perceptual scoring. Calibrated against Mad Gab puzzle data; tunable per dialect."
|
|
9
|
+
version.workspace = true
|
|
10
|
+
edition.workspace = true
|
|
11
|
+
license.workspace = true
|
|
12
|
+
repository.workspace = true
|
|
13
|
+
homepage.workspace = true
|
|
14
|
+
authors.workspace = true
|
|
15
|
+
readme = "README.md"
|
|
16
|
+
keywords = ["phonetics", "ipa", "linguistics", "edit-distance", "speech"]
|
|
17
|
+
categories = ["text-processing", "science"]
|
|
18
|
+
|
|
19
|
+
[lints]
|
|
20
|
+
workspace = true
|
|
21
|
+
|
|
22
|
+
# Keep the Rust API name as `phonetics` even though the registry name
|
|
23
|
+
# is `phonetics-rs`. Downstream code stays `use phonetics::distance;`.
|
|
24
|
+
[lib]
|
|
25
|
+
name = "phonetics"
|
|
26
|
+
|
|
27
|
+
[dependencies]
|
|
28
|
+
|
|
29
|
+
[dev-dependencies]
|
|
30
|
+
# Empty for now; will gain criterion + insta as the surface grows.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# phonetics
|
|
2
|
+
|
|
3
|
+
IPA-based phonetic distance metrics for Rust.
|
|
4
|
+
|
|
5
|
+
Two scores live in this crate:
|
|
6
|
+
|
|
7
|
+
* **`distance(a, b)`** — strict per-phoneme acoustic distance over IPA
|
|
8
|
+
phonemes (Bark-space vowel distance, 2D consonant place embedding,
|
|
9
|
+
approximant–vowel bridge, diphthong/affricate handling).
|
|
10
|
+
* **`Confusion::distance(s1, s2)`** — listener-confusion distance over
|
|
11
|
+
whole phonemic strings, calibrated against Mad Gab puzzle data and
|
|
12
|
+
English speech-perception literature.
|
|
13
|
+
|
|
14
|
+
The split is the point. The first is a claim about the waveform; the
|
|
15
|
+
second is a claim about how a listener parses it. They give different
|
|
16
|
+
answers to different questions.
|
|
17
|
+
|
|
18
|
+
This is a port of the Ruby gem of the same name; see the parent
|
|
19
|
+
repository for an extended write-up of the metric design.
|
|
20
|
+
|
|
21
|
+
## Status
|
|
22
|
+
|
|
23
|
+
Pre-1.0. Core data tables and the per-phoneme acoustic distance are
|
|
24
|
+
parity-tested against the Ruby reference. Levenshtein and Confusion DPs
|
|
25
|
+
are landing module-by-module.
|
|
26
|
+
|
|
27
|
+
## License
|
|
28
|
+
|
|
29
|
+
MIT.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
//! Compound phonemes — diphthongs and affricates that are recognised
|
|
2
|
+
//! as single perceptual units even though their IPA notation is two
|
|
3
|
+
//! characters.
|
|
4
|
+
//!
|
|
5
|
+
//! Distance for a compound vs another phoneme is the average of the
|
|
6
|
+
//! pairwise component distances; the shorter side is padded by
|
|
7
|
+
//! repeating its last segment so /aɪ/ vs /a/ charges half a phoneme
|
|
8
|
+
//! distance rather than nothing.
|
|
9
|
+
|
|
10
|
+
/// All compound phoneme symbols known to this module: the diphthongs
/// first, followed by the two English affricates.
pub const INVENTORY: &[&str] = &[
    // Diphthongs
    "aɪ",
    "ɑɪ",
    "aʊ",
    "ɑʊ",
    "ɔɪ",
    "eɪ",
    "oʊ",
    "əʊ",
    "ɪə",
    "ʊə",
    "ɛə",
    // English affricates
    "tʃ",
    "dʒ",
];
|
|
17
|
+
|
|
18
|
+
/// Split a compound phoneme symbol into its two component segments.
///
/// Yields `None` when `symbol` is not one of the compounds listed in the
/// table below.
pub fn components(symbol: &str) -> Option<&'static [&'static str]> {
    // Compound symbol paired with its two segments.
    const PARTS: &[(&str, [&str; 2])] = &[
        // English-style diphthongs (both /aɪ/-form and /ɑɪ/-form).
        ("aɪ", ["a", "ɪ"]),
        ("ɑɪ", ["ɑ", "ɪ"]),
        ("aʊ", ["a", "ʊ"]),
        ("ɑʊ", ["ɑ", "ʊ"]),
        ("ɔɪ", ["ɔ", "ɪ"]),
        ("eɪ", ["e", "ɪ"]),
        ("oʊ", ["o", "ʊ"]),
        ("əʊ", ["ə", "ʊ"]),
        ("ɪə", ["ɪ", "ə"]),
        ("ʊə", ["ʊ", "ə"]),
        ("ɛə", ["ɛ", "ə"]),
        // English affricates only. /ts/, /dz/ etc. would mis-tokenise
        // English plurals (cats, kids) as single phonemes; left out.
        ("tʃ", ["t", "ʃ"]),
        ("dʒ", ["d", "ʒ"]),
    ];

    PARTS
        .iter()
        .find(|(compound, _)| *compound == symbol)
        .map(|(_, segments)| &segments[..])
}
|