phonetics 4.0.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open-uri'
4
+ require 'json'
5
+
6
module Phonetics
  # Transcribes a whitespace-separated phrase into a single IPA string.
  #
  # Thin convenience wrapper around Transcriptions#transcription_for so the
  # lookup logic lives in exactly one place.
  #
  # @param phrase [::String] words separated by spaces (case-insensitive)
  # @return [::String] the concatenated transcriptions; words without a known
  #   transcription contribute nothing
  def self.transcription_for(phrase)
    Transcriptions.transcription_for(phrase)
  end

  # Lookup table of English words to IPA transcriptions, backed by a JSON
  # file on disk (downloaded on first use when the file is missing), plus a
  # trie index keyed by IPA characters.
  module Transcriptions
    extend self

    TranscriptionFile = File.join(__dir__, '..', 'common_ipa_transcriptions.json')
    TranscriptionsURL = 'https://jackdanger.com/common_ipa_transcriptions.json'

    # Source names are matched against these patterns in order; the first
    # pattern that matches any source of an entry wins.
    SourcesByPreference = [/wiktionary/, /cmu/, /phonemicchart.com/].freeze

    # Returns the preferred IPA transcription for a word.
    #
    # @param key [::String] the word exactly as it is keyed in the data
    # @return [::String, nil] the transcription from the most preferred
    #   source, or nil when the word is unknown, has no 'ipa' section, or has
    #   no source matching SourcesByPreference
    def [](key)
      ipa = transcriptions.dig(key, 'ipa')
      return unless ipa

      SourcesByPreference.each do |preferred_source|
        source = ipa.each_key.find { |name| name.match?(preferred_source) }
        return ipa[source] if source
      end
      nil
    end

    # @return [Array<::String>] every word present in the transcription data
    def words
      transcriptions.keys
    end

    # The parsed transcription data, memoized after the first call. Downloads
    # the JSON file first if it is not yet on disk.
    #
    # @return [Hash] word => entry hash (see #trie for the entry layout)
    def transcriptions
      @transcriptions ||= begin
        download! unless File.exist?(TranscriptionFile)
        load_from_disk!
      end
    end

    # Lazily loaded from JSON file on disk
    #
    # (Re)reads TranscriptionFile and replaces the memoized data.
    #
    # @return [Hash] the freshly parsed data
    def load_from_disk!
      @transcriptions = JSON.parse(File.read(TranscriptionFile))
    end

    # rubocop:disable Security/Open
    # Downloads TranscriptionsURL and stores it at TranscriptionFile.
    #
    # The body is fetched completely before the destination is opened, so a
    # failed download never leaves an empty or truncated file behind (which
    # would otherwise satisfy the File.exist? check in #transcriptions and
    # then fail to parse). The block form of URI.open closes the remote
    # handle for us.
    #
    # @return [Integer] number of bytes written
    def download!
      payload = URI.open(TranscriptionsURL, &:read)
      File.write(TranscriptionFile, payload)
    end
    # rubocop:enable Security/Open

    # rubocop:disable Metrics/CyclomaticComplexity
    # Builds (and memoizes per max_rarity) a trie of every transcription.
    #
    # @param max_rarity [Numeric, nil] when given, entries whose 'rarity' is
    #   missing or greater than this value are left out
    # @return [Hash] the frozen root node. Child nodes are keyed by single
    #   IPA characters (Strings); bookkeeping lives under the Symbol keys
    #   :depth, :path and :terminal (see #construct_trie).
    def trie(max_rarity = nil)
      # Let's turn this:
      #
      #   "century": {
      #     "rarity": 462.0,
      #     "ipa": {
      #       "cmu": "sɛntʃɝɪ",
      #       "phonemicchart.com": "sentʃərɪ",
      #       "wiktionary": "sɛntʃəɹi",
      #       "wiktionary2": "sɛntʃɹi",
      #       "wiktionary3": "sɛntʃʊɹi"
      #     },
      #     "alt_display": "CENTURY"
      #   }
      #
      # into this:
      #
      #   "s": {
      #     "e": {
      #       "n": {
      #         "t": {
      #           "ʃ": {
      #             "ʊ": {
      #               "ɹ": {
      #                 "i": {
      #                   "terminal": [Term('century')],
      #                 },
      #               },
      #             },
      #             "ə": {
      #               "r": {
      #                 "ɪ": {
      #                   "terminal": [Term('century')],
      #                 },
      #               },
      #             },
      #             "ɹ": {
      #               "i": {
      #                 "terminal": [Term('century')],
      #               },
      #             },
      #             "ɝ": {
      #               "ɪ": {
      #                 "terminal": [Term('century')],
      #               },
      #             },
      #           },
      #         },
      #       },
      #     },
      #     "ɛ": {
      #       "n": {
      #         "t": {
      #           "ʃ": {
      #             "ɝ": {
      #               "ɪ": {
      #                 "terminal": [Term('century')],
      #               },
      #             },
      #           },
      #         },
      #       },
      #     },
      #   },
      #
      # In the real structure the terminal list is stored under the Symbol
      # key :terminal and holds { word:, rarity: } hashes.
      @tries ||= {}
      @tries[max_rarity] ||= begin
        base_trie = {}
        transcriptions.each do |key, entry|
          rarity = entry['rarity']
          next if max_rarity && (rarity.nil? || rarity > max_rarity)

          entry_data = { word: key, rarity: rarity }
          # Entries without an 'ipa' section simply contribute no paths.
          (entry['ipa'] || {}).each_value do |transcription|
            construct_trie(base_trie, transcription, entry_data)
          end
        end
        base_trie.freeze
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity

    # Follows an IPA string through the unfiltered trie.
    #
    # @param ipa [::String] the IPA characters to follow from the root
    # @return [Hash, nil] the node reached after consuming every character
    #   (the root for an empty string), or nil when the trie has no such path
    def walk(ipa)
      ipa.each_char.reduce(trie) { |node, char| node && node[char] }
    end

    # Transcribes a whitespace-separated phrase into a single IPA string.
    #
    # @param phrase [::String] words separated by spaces (case-insensitive)
    # @return [::String] the concatenated transcriptions; words without a
    #   known transcription contribute nothing
    def transcription_for(phrase)
      phrase.downcase.split(' ').map { |word| self[word] }.join
    end

    private

    # Given a portion of an existing trie (to be modified), the remainder of a
    # char string, and an entry, walk or construct the appropriate trie nodes
    # necessary to place the entry in a leaf.
    #
    # Each node records its :depth (characters consumed to reach it); every
    # non-root node also records its :path (the characters consumed so far).
    # Entries are appended to the :terminal list of the node where the string
    # ends, without duplicates.
    #
    # @return [Hash] the same subtrie that was passed in, mutated in place
    def construct_trie(subtrie, chars_remaining, entry_data, depth = 0)
      subtrie[:depth] ||= depth
      if chars_remaining.empty?
        # Base condition met
        terminal = (subtrie[:terminal] ||= [])
        terminal << entry_data unless terminal.include?(entry_data)
      else
        next_char = chars_remaining[0]
        child = (subtrie[next_char] ||= {})
        child[:path] ||= subtrie[:path].to_s + next_char
        construct_trie(child, chars_remaining[1..], entry_data, depth + 1)
      end
      subtrie
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phonetics
4
+ VERSION = File.read(File.join(File.dirname(__FILE__), '../../VERSION')).chomp
5
+ end
data/lib/phonetics.rb ADDED
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Phonetics — IPA-based phonetic distance.
4
+ #
5
+ # The entire algorithmic core is written in Rust (see <repo>/rust/
6
+ # phonetics) and loaded as a native extension via Magnus. This file
7
+ # layers ergonomic Ruby idioms on top of the bare module functions
8
+ # that the extension exports.
9
+ #
10
+ # Two-tier distance API:
11
+ #
12
+ # Phonetics.distance(p1, p2) acoustic per-phoneme, 0..1
13
+ # Phonetics.levenshtein(s1, s2) strict edit distance
14
+ # Phonetics.confusion(s1, s2) listener-confusion distance
15
+ # Phonetics.similarity(s1, s2) normalised 0..1
16
+ # Phonetics.sub_cost(p1, p2) perceptual per-phoneme
17
+ # Phonetics.tokenize(ipa, boundaries:) phoneme stream
18
+ require 'delegate'
19
+
20
+ require_relative 'phonetics/phonetics_ruby'
21
+ require_relative 'phonetics/transcriptions'
22
+
23
+ module Phonetics
24
+ # The native binding exposes the tokenizer as `_tokenize(input,
25
+ # boundaries)`. Magnus's `function!` macro doesn't bridge Ruby
26
+ # keyword arguments through to Rust, so we wrap it in a Ruby method
27
+ # that does accept the kwarg.
28
+ def self.tokenize(input, boundaries: false)
29
+ _tokenize(input, boundaries)
30
+ end
31
+
32
+ # ------------------------------------------------------------------
33
+ # Phonetics::String — iterator over phonemes in an IPA string.
34
+ # ------------------------------------------------------------------
35
+ class String < SimpleDelegator
36
+ def each_phoneme(boundaries: false)
37
+ Phonetics.tokenize(to_s, boundaries: boundaries).each
38
+ end
39
+ end
40
+
41
+ # ------------------------------------------------------------------
42
+ # Backwards-compatible namespaced API.
43
+ #
44
+ # The previous Ruby+C implementation exposed these under sub-modules.
45
+ # Keep them as thin delegators so existing callers don't break —
46
+ # there's nothing interesting happening here, just forwarding.
47
+ # ------------------------------------------------------------------
48
+
49
+ module Levenshtein
50
+ INDEL_COST = 1.0
51
+ TRANSPOSE_COST = 0.8
52
+
53
+ def self.distance(s1, s2, _verbose = false)
54
+ return if s1.nil? || s2.nil?
55
+
56
+ Phonetics.levenshtein(s1, s2)
57
+ end
58
+ end
59
+
60
+ module Confusion
61
+ GAP_OPEN = 0.60
62
+ GAP_EXTEND = 0.25
63
+ WEAK_INDEL_COST = 0.15
64
+ BOUNDARY_INDEL_COST = 0.02
65
+
66
+ def self.distance(s1, s2, verbose: false)
67
+ _ = verbose
68
+ Phonetics.confusion(s1, s2)
69
+ end
70
+
71
+ def self.similarity(s1, s2)
72
+ Phonetics.similarity(s1, s2)
73
+ end
74
+
75
+ def self.sub_cost(a, b)
76
+ Phonetics.sub_cost(a, b)
77
+ end
78
+ end
79
+ end
data/phonetics.gemspec ADDED
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/phonetics/version'
4
+
5
# Gem specification for the phonetics gem: identity, packaging rules for the
# native extension, and dependencies.
Gem::Specification.new do |spec|
  # --- Identity -----------------------------------------------------------
  spec.name = 'phonetics'
  spec.version = Phonetics::VERSION
  spec.authors = ['Jack Danger']
  spec.email = ['github@jackcanty.com']

  spec.summary = 'IPA-based phonetic distance: strict edit distance, listener-confusion distance, and per-phoneme acoustic and perceptual scoring.'
  spec.description = <<~DESC
    Tools for working with the International Phonetic Alphabet. Two-tier
    distance API — strict acoustic and listener-perception — backed by a
    Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
    data and English speech-perception literature.
  DESC
  spec.homepage = 'https://github.com/JackDanger/phonetics'
  spec.license = 'MIT'

  # --- Requirements and metadata -------------------------------------------
  spec.required_ruby_version = '>= 3.0'
  spec.required_rubygems_version = '>= 3.3.11'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage

  spec.extensions = ['ext/phonetics_ruby/extconf.rb']

  # --- Packaged files -------------------------------------------------------
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    # Git-tracked files, minus test directories and extension build output.
    untracked_patterns = [
      %r{\A(test|spec|features)/},
      %r{\Aext/phonetics_ruby/(target|Cargo.lock|Makefile)}
    ]
    tracked = `git ls-files -z`.split("\x0").reject do |path|
      untracked_patterns.any? { |pattern| path.match?(pattern) }
    end

    # The vendored Rust core isn't tracked in git (it's a build
    # artifact populated by `rake vendor_rust`), but it IS shipped
    # in the .gem tarball so end users don't need the source
    # workspace to compile the extension.
    vendor = Dir.glob('ext/phonetics_ruby/vendor/**/*', File::FNM_DOTMATCH).select do |path|
      !File.directory?(path) &&
        !path.include?('/target/') &&
        !path.end_with?('Cargo.lock', '/.', '/..')
    end

    # Array#| is an order-preserving union, i.e. concatenation plus uniq.
    (tracked | vendor).sort
  end

  spec.require_paths = ['lib']

  # --- Dependencies ---------------------------------------------------------
  spec.add_dependency 'rb_sys', '~> 0.9'

  %w[bundler rake rake-compiler rspec rubocop].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: phonetics
3
+ version: !ruby/object:Gem::Version
4
+ version: 4.0.0
5
+ platform: arm64-darwin
6
+ authors:
7
+ - Jack Danger
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-05-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: |
84
+ Tools for working with the International Phonetic Alphabet. Two-tier
85
+ distance API — strict acoustic and listener-perception — backed by a
86
+ Rust core compiled in via Magnus. Calibrated against Mad Gab puzzle
87
+ data and English speech-perception literature.
88
+ email:
89
+ - github@jackcanty.com
90
+ executables: []
91
+ extensions: []
92
+ extra_rdoc_files: []
93
+ files:
94
+ - ".gitignore"
95
+ - ".rspec"
96
+ - ".rubocop.yml"
97
+ - CHANGELOG
98
+ - CODE_OF_CONDUCT.md
99
+ - Dockerfile
100
+ - Gemfile
101
+ - LICENSE.txt
102
+ - README.md
103
+ - Rakefile
104
+ - VERSION
105
+ - _site/orthographic_levenshtein_example.png
106
+ - _site/phonetic_levenshtein_example.png
107
+ - _site/vowel_chart_b_words.jpg
108
+ - bin/console
109
+ - bin/gempush-if-changed
110
+ - bin/phonetics
111
+ - ext/phonetics_ruby/vendor/phonetics/README.md
112
+ - lib/common_ipa_transcriptions.json
113
+ - lib/phonetics.rb
114
+ - lib/phonetics/3.2/phonetics_ruby.bundle
115
+ - lib/phonetics/3.3/phonetics_ruby.bundle
116
+ - lib/phonetics/3.4/phonetics_ruby.bundle
117
+ - lib/phonetics/transcriptions.rb
118
+ - lib/phonetics/version.rb
119
+ - phonetics.gemspec
120
+ homepage: https://github.com/JackDanger/phonetics
121
+ licenses:
122
+ - MIT
123
+ metadata:
124
+ homepage_uri: https://github.com/JackDanger/phonetics
125
+ source_code_uri: https://github.com/JackDanger/phonetics
126
+ post_install_message:
127
+ rdoc_options: []
128
+ require_paths:
129
+ - lib
130
+ required_ruby_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '3.2'
135
+ - - "<"
136
+ - !ruby/object:Gem::Version
137
+ version: 3.5.dev
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 3.3.11
143
+ requirements: []
144
+ rubygems_version: 3.5.23
145
+ signing_key:
146
+ specification_version: 4
147
+ summary: 'IPA-based phonetic distance: strict edit distance, listener-confusion distance,
148
+ and per-phoneme acoustic and perceptual scoring.'
149
+ test_files: []