phonetics 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +35 -0
- data/LICENSE.txt +21 -0
- data/README.md +14 -0
- data/Rakefile +6 -0
- data/VERSION +1 -0
- data/lib/phonetic_levenshtein.rb +103 -0
- data/lib/phonetics/version.rb +3 -0
- data/lib/phonetics.rb +246 -0
- data/phonetics.gemspec +22 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c1ad0894f34019de351475721cfba624c2873944993e82db0e72ee7dd6aef374
|
4
|
+
data.tar.gz: 4590f9791a8409e9ef380949cc675c75507864050e72f616d4b3ac99f4dc4439
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 60af744c78500372afb02dd22fdc1066cc7d2e83ab218eed798ff7d96f101cbc112eb0b19c6a694ec83d79e891c5ea41e0d8cda26ad5aab209aa87aed3ba9f2e
|
7
|
+
data.tar.gz: 971566027a0b60cfad064f7c54c871a94830dae6c3210c77c306b31dbe0410c2687415c87686caecf9dd065812b3027b7bc6c48b69ec7fee3b01c56bfeff2644
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behavior that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behavior by participants include:
|
24
|
+
|
25
|
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behavior and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behavior.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
58
|
+
reported by contacting the project team at github@jackcanty.com. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [http://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: http://contributor-covenant.org
|
74
|
+
[version]: http://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
phonetics (1.0.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.3)
|
10
|
+
rake (12.3.2)
|
11
|
+
rspec (3.8.0)
|
12
|
+
rspec-core (~> 3.8.0)
|
13
|
+
rspec-expectations (~> 3.8.0)
|
14
|
+
rspec-mocks (~> 3.8.0)
|
15
|
+
rspec-core (3.8.0)
|
16
|
+
rspec-support (~> 3.8.0)
|
17
|
+
rspec-expectations (3.8.2)
|
18
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
19
|
+
rspec-support (~> 3.8.0)
|
20
|
+
rspec-mocks (3.8.0)
|
21
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
22
|
+
rspec-support (~> 3.8.0)
|
23
|
+
rspec-support (3.8.0)
|
24
|
+
|
25
|
+
PLATFORMS
|
26
|
+
ruby
|
27
|
+
|
28
|
+
DEPENDENCIES
|
29
|
+
bundler (~> 1.16)
|
30
|
+
phonetics!
|
31
|
+
rake
|
32
|
+
rspec (~> 3.0)
|
33
|
+
|
34
|
+
BUNDLED WITH
|
35
|
+
1.16.2
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Jack Danger
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Phonetics
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
## Contributing
|
7
|
+
|
8
|
+
Patches welcome, forks celebrated.
|
9
|
+
|
10
|
+
Everyone interacting in the Phonetics project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/JackDanger/phonetics/blob/master/CODE_OF_CONDUCT.md).
|
11
|
+
|
12
|
+
## License
|
13
|
+
|
14
|
+
The gem is available as open source under the wide-open terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require_relative './phonetics'
|
2
|
+
|
3
|
+
# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
|
4
|
+
# count used instead of a binary edit distance calculation
|
5
|
+
#
|
6
|
+
# This implementation is almost entirely taken from the damerau-levenshtein gem
|
7
|
+
# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
|
8
|
+
# The implementation is modified based on "Using Phonologically Weighted
|
9
|
+
# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
|
10
|
+
# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
|
11
|
+
# Aumont, 2016
|
12
|
+
# https://hal.archives-ouvertes.fr/hal-01474904/document
|
13
|
+
class PhoneticLevenshtein
  # Builds the complete edit-distance matrix for the two inputs up front.
  #
  # The inputs are indexed one element at a time with #size and #[], so with
  # String inputs every character is treated as its own phoneme.
  #
  # @param ipa_str1 [String] first IPA transcription (indexes matrix columns)
  # @param ipa_str2 [String] second IPA transcription (indexes matrix rows)
  def initialize(ipa_str1, ipa_str2)
    @str1 = ipa_str1
    @str2 = ipa_str2
    @len1 = ipa_str1.size
    @len2 = ipa_str2.size
    prepare_matrix
    set_edit_distances(ipa_str1, ipa_str2)
  end

  # Phonetically weighted edit distance between the two inputs.
  #
  # Reads the bottom-right matrix cell directly. That is the same value the
  # traceback reports for its final step, but it also works when both inputs
  # are empty (the traceback is empty in that case).
  #
  # @return [Numeric] 0 for identical inputs; insertions and deletions cost
  #   1.0 each, substitutions cost the phonetic distance (1.0 when unknown)
  def distance
    @matrix[@len2][@len1]
  end

  # Convenience wrapper: builds the matrix and returns the distance.
  #
  # @param str1 [String] first IPA transcription
  # @param str2 [String] second IPA transcription
  # @return [Numeric]
  def self.distance(str1, str2)
    new(str1, str2).distance
  end

  private

  # Traces the cheapest path from the bottom-right cell back to [0, 0].
  #
  # @return [Array<Hash>] one { distance:, type: } entry per step, ordered from
  #   the start of the inputs to the end
  def walk
    steps = []
    cell = [@len2, @len1]
    until cell == [0, 0]
      cell, step = char_data(cell)
      steps.unshift(step)
    end
    steps
  end

  # Fills every interior cell of the matrix, row by row.
  def set_edit_distances(str1, str2)
    (1..@len2).each do |i|
      (1..@len1).each do |j|
        if str2[i - 1] == str1[j - 1]
          no_change(i, j)
        else
          @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
        end
      end
    end
  end

  # Describes one traceback step.
  #
  # @return [Array] the predecessor cell followed by a { distance:, type: } hash
  def char_data(cell)
    row, col = cell
    char = { distance: @matrix[row][col] }
    (previous_value, _priority), type, previous_cell = find_previous(cell)
    char[:type] = previous_value == char[:distance] ? :same : type
    [previous_cell, char]
  end

  # Picks the predecessor of +cell+ for the traceback.
  #
  # Only moves that stay inside the matrix are evaluated, so border cells never
  # read wrapped-around (negative) indexes.
  def find_previous(cell)
    row, col = cell
    candidates = []
    candidates << [[ins(row, col), 1], :ins, [row, col - 1]] if col > 0
    candidates << [[del(row, col), 2], :del, [row - 1, col]] if row > 0
    candidates << [[subst(row, col), 0], :subst, [row - 1, col - 1]] if row > 0 && col > 0
    select_cell(candidates)
  end

  # Cheapest in-bounds candidate; the second element of each sort key breaks
  # ties (subst, then ins, then del).
  def select_cell(candidates)
    in_bounds = candidates.select { |candidate| candidate.last.all? { |index| index >= 0 } }
    in_bounds.min_by(&:first)
  end

  # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
  def del(i, j)
    @matrix[i - 1][j]
  end

  def ins(i, j)
    @matrix[i][j - 1]
  end

  # This is where we implement the modifications to Damerau-Levenshtein according to
  # https://hal.archives-ouvertes.fr/hal-01474904/document
  #
  # Matrix cell (i, j) compares @str2[i - 1] with @str1[j - 1], so those are
  # the two phonemes whose distance prices the substitution. Pairs missing
  # from the distance map cost a full 1.0.
  def subst(i, j)
    map = Phonetics.distance_map[@str1[j - 1]]
    score = map[@str2[i - 1]] if map
    score ||= 1.0
    @matrix[i - 1][j - 1] + score
  end

  # Identical phonemes carry the diagonal value forward at no cost.
  def no_change(i, j)
    @matrix[i][j] = @matrix[i - 1][j - 1]
  end

  # Row 0 and column 0 hold the cost of building each prefix from nothing;
  # interior cells start as nil and are filled by #set_edit_distances.
  def prepare_matrix
    @matrix = [(0..@len1).to_a]
    (1..@len2).each { |row| @matrix << ([row] + Array.new(@len1)) }
  end
end
|
data/lib/phonetics.rb
ADDED
@@ -0,0 +1,246 @@
|
|
1
|
+
require 'delegate'
require 'set'
|
2
|
+
|
3
|
+
module Phonetics
|
4
|
+
extend self
|
5
|
+
|
6
|
+
# This subclass of the stdlib's String allows us to iterate over each phoneme
|
7
|
+
# in a string without monkeypatching
|
8
|
+
#
|
9
|
+
# Usage:
|
10
|
+
# Phonetics::String.new("wətɛvɝ").each_phoneme.to_a
|
11
|
+
# => ["w", "ə", "t", "ɛ", "v", "ɝ"]
|
12
|
+
class String < SimpleDelegator
  # Groups every known phoneme symbol by its length in characters, longest
  # first, so callers can try the longest possible match before shorter ones.
  #
  # Buckets for lengths 4 down to 1 always exist (possibly empty); a bucket
  # for any longer symbol is created on demand instead of failing.
  #
  # @return [Hash{Integer => Set<String>}] keys in descending order
  def self.phonemes_by_length
    @phonemes_by_length ||= begin
      buckets = { 4 => Set.new, 3 => Set.new, 2 => Set.new, 1 => Set.new }
      Phonetics.phonemes.each do |symbol|
        (buckets[symbol.chars.size] ||= Set.new) << symbol
      end
      # Hashes keep insertion order, so rebuilding from the sorted pairs
      # guarantees descending key order even when a new bucket was added.
      buckets.sort_by { |size, _| -size }.to_h
    end
  end

  # Enumerates the phonemes of the wrapped string from left to right, always
  # preferring the longest known symbol at the current position. Characters
  # that do not start any known phoneme are skipped.
  #
  # The cursor lives inside the enumerator block, so the returned enumerator
  # can be iterated more than once.
  #
  # @return [Enumerator<String>]
  def each_phoneme
    Enumerator.new do |yielder|
      symbols = chars # split once per enumeration rather than once per step
      cursor = 0
      while cursor < symbols.length
        matched_size = nil
        self.class.phonemes_by_length.each do |size, phonemes|
          next if cursor + size > symbols.length

          candidate = symbols[cursor, size].join
          next unless phonemes.include?(candidate)

          yielder.yield candidate
          matched_size = size
          break
        end
        cursor += matched_size || 1
      end
    end
  end
end
|
48
|
+
|
49
|
+
module Vowels
  extend self

  # First and second formant frequencies for each vowel, plus a lip-rounding
  # flag (and a rhotic flag where it applies). Only F1 and F2 are read by
  # #distance; the flags are descriptive.
  FormantFrequencies = {
    # https://en.wikipedia.org/wiki/Formant#Phonetics
    'i' => { F1: 240, F2: 2400, rounded: false },
    # 'y' is the close front *rounded* vowel
    'y' => { F1: 235, F2: 2100, rounded: true },
    'ɪ' => { F1: 300, F2: 2100, rounded: false }, # Guessing From other vowels
    'e' => { F1: 390, F2: 2300, rounded: false },
    'ø' => { F1: 370, F2: 1900, rounded: true },
    'ɛ' => { F1: 610, F2: 1900, rounded: false },
    'œ' => { F1: 585, F2: 1710, rounded: true },
    'a' => { F1: 850, F2: 1610, rounded: false },
    'ɶ' => { F1: 820, F2: 1530, rounded: true },
    'ɑ' => { F1: 750, F2: 940, rounded: false },
    'ɒ' => { F1: 700, F2: 760, rounded: true },

    'ʌ' => { F1: 600, F2: 1170, rounded: false },
    # copying 'ʌ' for other mid-vowel formants
    'ə' => { F1: 600, F2: 1170, rounded: false },
    'ɝ' => { F1: 600, F2: 1170, rounded: false, rhotic: true },

    'ɔ' => { F1: 500, F2: 700, rounded: true },
    'ɤ' => { F1: 460, F2: 1310, rounded: false },
    'o' => { F1: 360, F2: 640, rounded: true },
    'ɯ' => { F1: 300, F2: 1390, rounded: false },
    'æ' => { F1: 800, F2: 1900, rounded: false }, # Guessing From other vowels
    'u' => { F1: 350, F2: 650, rounded: true }, # Guessing From other vowels
    'ʊ' => { F1: 350, F2: 650, rounded: true },
    # Frequencies from http://videoweb.nie.edu.sg/phonetic/vowels/measurements.html
  }

  # @return [Array<String>] every vowel symbol in the formant table
  def phonemes
    @phonemes ||= FormantFrequencies.keys
  end

  # Given two vowels, calculate the (pythagorean) distance between them using
  # their scaled F1 and F2 frequencies as x/y coordinates.
  #
  # Each coordinate is computed as (value - table minimum) / table maximum.
  # TODO: account for rhoticity (F3)
  #
  # @param phoneme1 [String] a key of FormantFrequencies
  # @param phoneme2 [String] a key of FormantFrequencies
  # @return [Float] 0.0 for vowels with identical formants
  # @raise [KeyError] when either symbol is not in the table
  def distance(phoneme1, phoneme2)
    formants1 = FormantFrequencies.fetch(phoneme1)
    formants2 = FormantFrequencies.fetch(phoneme2)

    # [lowest, highest] frequency per formant across the whole table
    @minmax_f1 ||= FormantFrequencies.values.map { |formants| formants[:F1] }.minmax
    @minmax_f2 ||= FormantFrequencies.values.map { |formants| formants[:F2] }.minmax

    scale = lambda { |value, (lowest, highest)| (value - lowest) / highest.to_f }

    # Scaled F1 is the 'x' axis and scaled F2 the 'y' axis
    f1_distance = (scale.call(formants1[:F1], @minmax_f1) - scale.call(formants2[:F1], @minmax_f1)).abs
    f2_distance = (scale.call(formants1[:F2], @minmax_f2) - scale.call(formants2[:F2], @minmax_f2)).abs

    # Pythagorean theorem on the two axis distances (order doesn't matter)
    Math.sqrt((f1_distance ** 2) + (f2_distance ** 2))
  end
end
|
111
|
+
|
112
|
+
module Consonants
|
113
|
+
extend self
|
114
|
+
|
115
|
+
# Plosives and fricatives are less similar than trills and flaps, or
|
116
|
+
# sibilant fricatives and non-sibilant fricatives
|
117
|
+
# TODO: this is unfinished and possibly a bad idea
|
118
|
+
# Feature tags per manner of articulation. Keys are spelled exactly like the
# row labels of ChartData (e.g. 'Tap/flap') so that a manner read from the
# chart can be looked up here.
MannerDistances = {
  'Nasal'                  => %w[continuant],
  'Stop'                   => %w[],
  'Sibilant fricative'     => %w[continuant fricative],
  'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
  'Approximant'            => %w[],
  'Tap/flap'               => %w[],
  'Trill'                  => %w[],
  'Lateral fricative'      => %w[continuant fricative],
  'Lateral approximant'    => %w[],
  'Lateral tap/flap'       => %w[],
}.freeze
|
130
|
+
|
131
|
+
# This chart (columns 2 through the end, anyway) is a direct port of
|
132
|
+
# https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
|
133
|
+
# We store the consonant table in this format to make updating it easier.
|
134
|
+
ChartData = %Q{ | Labio-velar | Bi-labial | Labio-dental | Linguo-labial | Dental | Alveolar | Post-alveolar | Retro-flex | Palatal | Velar | Uvular | Pharyngeal | Glottal
|
135
|
+
Nasal | | m̥ m | ɱ | n̼ | | n̥ n | | ɳ̊ ɳ | ɲ̊ ɲ | ŋ̊ ŋ | ɴ | |
|
136
|
+
Stop | | p b | p̪ b̪ | t̼ d̼ | | t d | | ʈ ɖ | c ɟ | k g | q ɢ | ʡ | ʔ
|
137
|
+
Sibilant fricative | | | | | | s z | ʃ ʒ | ʂ ʐ | ɕ ʑ | | | |
|
138
|
+
Non-sibilant fricative | | ɸ β | f v | θ̼ ð̼ | θ ð | θ̠ ð̠ | ɹ̠̊˔ ɹ̠˔ | ɻ˔ | ç ʝ | x ɣ | χ ʁ | ħ ʕ | h ɦ
|
139
|
+
Approximant | w | | ʋ̥ ʋ | | | ɹ̥ ɹ | | ɻ̊ ɻ | j̊ j | ɰ̊ ɰ | | | ʔ̞
|
140
|
+
Tap/flap | | ⱱ̟ | ⱱ | ɾ̼ | | ɾ̥ ɾ | | ɽ̊ ɽ | | | ɢ̆ | ʡ̆ |
|
141
|
+
Trill | | ʙ̥ ʙ | | | | r̥ r | | | | | ʀ̥ ʀ | ʜ ʢ |
|
142
|
+
Lateral fricative | | | | | | ɬ ɮ | | ɭ̊˔ ɭ˔ | ʎ̝̊ ʎ̝ | ʟ̝̊ ʟ̝ | | |
|
143
|
+
Lateral approximant | | | | | | l̥ l | | ɭ̊ ɭ | ʎ̥ ʎ | ʟ̥ ʟ | ʟ̠ | |
|
144
|
+
Lateral tap/flap | | | | | | ɺ | | ɭ̆ | ʎ̆ | ʟ̆ | | |
|
145
|
+
}
|
146
|
+
|
147
|
+
# Parse the ChartData into a lookup table where we can retrieve attributes
|
148
|
+
# for each phoneme
|
149
|
+
# Parses ChartData into a lookup table of articulation features.
#
# The first line of the chart names the positions (columns); every following
# line is one manner (row). Within a cell, a symbol starting at the first
# character is the voiceless phoneme and anything after it is the voiced one.
# Blank rows and missing or empty cells are skipped.
#
# Side effect: sets @position_count, which #distance reads.
#
# @return [Hash{String => Hash}] symbol => { position:, position_index:,
#   manner:, voiced: }
def features
  @features ||= begin
    header, *rows = ChartData.lines

    # The first header cell belongs to the manner column and is discarded
    _, *positions = header.chomp.split(' | ')
    positions.each(&:strip!)

    # Remove any trailing blank lines (stopping once no rows are left)
    rows.pop while !rows.empty? && rows.last.strip.empty?

    position_indexes = positions.each_with_index.to_h

    @position_count = positions.size

    rows.each_with_object({}) do |row, phonemes|
      next if row.strip.empty?

      manner, *columns = row.chomp.split(' | ')
      manner.strip!
      positions.zip(columns).each do |position, phoneme_text|
        # String#split drops trailing empty fields, so a short row yields nil
        # cells here; blank cells simply have no phoneme.
        next if phoneme_text.nil? || phoneme_text.strip.empty?

        data = {
          position: position,
          position_index: position_indexes[position],
          manner: manner,
        }
        # If there is a character in the first column of the cell then this
        # articulation has a voiceless phoneme. The symbol may use additional
        # characters (diacritics) as part of the phoneme symbol.
        unless phoneme_text[0] == ' '
          # Take everything up to the first space
          symbol = phoneme_text[0...(phoneme_text.index(' ') || phoneme_text.length)]
          phoneme_text = phoneme_text[symbol.length..-1]

          phonemes[symbol] = data.merge(voiced: false)
        end
        # If there's a character anywhere left in the cell then this
        # articulation has a voiced phoneme
        voiced_symbol = phoneme_text.strip
        phonemes[voiced_symbol] = data.merge(voiced: true) unless voiced_symbol.empty?
      end
    end
  end
end
|
192
|
+
|
193
|
+
# Every consonant symbol found in the parsed chart (memoized).
#
# @return [Array<String>]
def phonemes
  @phonemes = features.keys unless @phonemes
  @phonemes
end
|
196
|
+
|
197
|
+
# Given two consonants, calculate their difference by summing the
|
198
|
+
# following:
|
199
|
+
# * 0.1 if they are not voiced the same
|
200
|
+
# * 0.3 if they are different manners
|
201
|
+
# * Up to 0.6 if they are the maximum position difference
|
202
|
+
# Scores how different two consonants are.
#
# @param phoneme1 [String] a symbol present in #features
# @param phoneme2 [String] a symbol present in #features
# @return [Numeric] sum of the voicing, manner and position penalties
# @raise [KeyError] when either symbol is not a known consonant
def distance(phoneme1, phoneme2)
  # Calling #features first matters: building the table is what sets
  # @position_count, which is read below.
  table = features
  features1 = table.fetch(phoneme1)
  features2 = table.fetch(phoneme2)

  penalty = 0
  penalty += 0.1 if features1[:voiced] != features2[:voiced]

  penalty += 0.3 if features1[:manner] != features2[:manner]

  # Use the remaining weight (0.6) to penalize differences in position,
  # proportional to how many chart columns apart the two phonemes are.
  # NOTE(review): the gap is divided by the number of columns, not by the
  # largest possible gap (columns - 1), so this term stays below 0.6.
  position_gap = (features1[:position_index] - features2[:position_index]).abs
  penalty + 0.6 * (position_gap / @position_count.to_f)
end
|
215
|
+
end
|
216
|
+
|
217
|
+
# All known phoneme symbols: consonants first, then vowels.
#
# @return [Array<String>]
def phonemes
  [Consonants, Vowels].flat_map(&:phonemes)
end
|
220
|
+
|
221
|
+
# Maps every known symbol to :consonant or :vowel. Vowels are written last,
# so a symbol present in both lists is classified as a vowel.
Symbols = {}.tap do |kinds|
  Consonants.phonemes.each { |symbol| kinds[symbol] = :consonant }
  Vowels.phonemes.each { |symbol| kinds[symbol] = :vowel }
end
|
224
|
+
|
225
|
+
# Distance between any two known phonemes.
#
# A consonant/vowel pair is always 1.0; same-kind pairs are delegated to the
# Vowels or Consonants scorer.
#
# @raise [KeyError] when either symbol is missing from Symbols
def distance(phoneme1, phoneme2)
  kinds = [phoneme1, phoneme2].map { |phoneme| Symbols.fetch(phoneme) }

  case kinds.sort
  when %i[consonant vowel]
    1.0
  when %i[vowel vowel]
    Vowels.distance(phoneme1, phoneme2)
  when %i[consonant consonant]
    Consonants.distance(phoneme1, phoneme2)
  end
end
|
235
|
+
|
236
|
+
# Precomputed, memoized lookup of the distance between every pair of distinct
# phonemes: distance_map[a][b] == distance_map[b][a].
#
# Each unordered pair is scored once and stored in both directions. This
# relies on #distance giving the same result for (a, b) and (b, a).
#
# The outer hash creates (and stores) an empty inner hash the first time an
# unknown key is read, so a lookup for an unknown phoneme returns {} rather
# than nil.
#
# @return [Hash{String => Hash{String => Numeric}}]
def distance_map
  @distance_map ||= begin
    all_phonemes = Vowels.phonemes + Consonants.phonemes
    nested = Hash.new { |hash, key| hash[key] = {} }
    all_phonemes.combination(2).each_with_object(nested) do |(p1, p2), scores|
      score = distance(p1, p2)
      scores[p1][p2] = score
      scores[p2][p1] = score
    end
  end
end
|
246
|
+
end
|
data/phonetics.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Gem packaging metadata for phonetics. The version is read from the VERSION
# file that sits next to this gemspec.
Gem::Specification.new do |spec|
  gemspec_dir = File.dirname(__FILE__)

  spec.name     = 'phonetics'
  spec.version  = File.read(File.join(gemspec_dir, './VERSION'))
  spec.authors  = ['Jack Danger']
  spec.email    = ['github@jackcanty.com']

  spec.summary     = 'tools for linguistic code using the International Phonetic Alphabet'
  spec.description = 'tools for linguistic code using the International Phonetic Alphabet'
  spec.homepage    = 'https://github.com/JackDanger/phonetics'
  spec.license     = 'MIT'

  # Package every file tracked by git except tests, specs and features.
  # `git ls-files -z` prints NUL-separated paths.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked_files = `git ls-files -z`.split("\x0")
    tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  end
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec', '~> 3.0'
end
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: phonetics
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jack Danger
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-04-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
description: tools for linguistic code using the International Phonetic Alphabet
|
56
|
+
email:
|
57
|
+
- github@jackcanty.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".gitignore"
|
63
|
+
- ".rspec"
|
64
|
+
- ".travis.yml"
|
65
|
+
- CODE_OF_CONDUCT.md
|
66
|
+
- Gemfile
|
67
|
+
- Gemfile.lock
|
68
|
+
- LICENSE.txt
|
69
|
+
- README.md
|
70
|
+
- Rakefile
|
71
|
+
- VERSION
|
72
|
+
- lib/phonetic_levenshtein.rb
|
73
|
+
- lib/phonetics.rb
|
74
|
+
- lib/phonetics/version.rb
|
75
|
+
- phonetics.gemspec
|
76
|
+
homepage: https://github.com/JackDanger/phonetics
|
77
|
+
licenses:
|
78
|
+
- MIT
|
79
|
+
metadata: {}
|
80
|
+
post_install_message:
|
81
|
+
rdoc_options: []
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
requirements: []
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.7.6
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: tools for linguistic code using the International Phonetic Alphabet
|
100
|
+
test_files: []
|