phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,308 @@
|
|
1
|
+
import abc
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
|
5
|
+
from phoonnx.phonemizers.base import BasePhonemizer
|
6
|
+
from phoonnx.thirdparty.zh_num import num2str
|
7
|
+
from phoonnx.config import Alphabet
|
8
|
+
|
9
|
+
|
10
|
+
class JiebaPhonemizer(BasePhonemizer):
    """
    Word segmenter for Chinese text built on Jieba.

    No phonemization takes place here: the input is cut into words and the
    words are handed back joined by single spaces.
    """

    def __init__(self):
        super().__init__(Alphabet.HANZI)

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Match a requested language code against the supported set.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: Whatever `match_lang` returns for the code, matched
                against the single supported language "zh".

        Raises:
            ValueError: If the language code is unsupported (raised by
                `match_lang`, per the base-class contract).
        """
        supported = ["zh"]
        return cls.match_lang(target_lang, supported)

    def phonemize_string(self, text: str, lang: str = "zh") -> str:
        """
        Cut Chinese text into words with Jieba.

        Chunks made up only of digits are rewritten through `num2str`;
        every other chunk is kept as produced by Jieba.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            str: The words joined by single spaces.
        """
        import jieba

        # Called only for its validation side effect; the result is unused.
        self.get_lang(lang)

        words = []
        for word in jieba.cut(text, cut_all=False):
            words.append(num2str(word) if word.isdigit() else word)
        return " ".join(words)
|
51
|
+
|
52
|
+
|
53
|
+
class BaseChinesePinyinPhonemizer(BasePhonemizer):
    """
    Base class for Chinese phonemizers using different pinyin G2P libraries.
    Supports optional IPA conversion and segmentation via Jieba.

    Subclasses only need to implement `get_pinyin`.
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True, retone=True):
        """
        Initializes the phonemizer.

        Args:
            alphabet (Alphabet): Output alphabet, either Alphabet.PINYIN or
                Alphabet.IPA. With IPA, pinyin tokens are passed through
                `to_ipa` after G2P.
            jieba (bool): Whether to segment text using Jieba before phonemization.
            retone (bool): Whether IPA output is post-processed by `_retone`
                (tone letters replaced by arrows).

        Raises:
            ValueError: If `alphabet` is neither PINYIN nor IPA.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if alphabet not in (Alphabet.PINYIN, Alphabet.IPA):
            raise ValueError(
                f"unsupported alphabet {alphabet!r}; expected Alphabet.PINYIN or Alphabet.IPA"
            )
        super().__init__(alphabet)
        self.jieba = jieba
        self.retone = retone
        # Imported lazily so the module can be imported without the dependency.
        from pinyin_to_ipa import pinyin_to_ipa
        self.pinyin_to_ipa = pinyin_to_ipa

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        # this check is here only to throw an exception if invalid language is provided
        return cls.match_lang(target_lang, ["zh"])

    @staticmethod
    def _retone(p):
        """
        Replace IPA tone letters with single arrow symbols.

        Args:
            p (str): IPA string for one syllable.

        Returns:
            str: The string with tone contours replaced by arrows and the
                syllabic ɻ̩ / ɹ̩ sequences replaced by 'ɨ'.

        Raises:
            ValueError: If a combining syllabic mark (U+0329) is still
                present after the replacements.
        """
        # Longest contours first: '˥' is a substring of the other tone marks.
        p = p.replace('˧˩˧', '↓')  # third tone
        p = p.replace('˧˥', '↗')  # second tone
        p = p.replace('˥˩', '↘')  # fourth tone
        p = p.replace('˥', '→')  # first tone
        # chr(635) is 'ɻ', chr(633) is 'ɹ', chr(809) is the combining
        # "syllabic" mark U+0329.
        p = p.replace(chr(635) + chr(809), 'ɨ').replace(chr(633) + chr(809), 'ɨ')
        if chr(809) in p:
            raise ValueError(p)
        return p

    def to_ipa(self, phones: List[str]) -> List[str]:
        """
        Converts a list of pinyin syllables to IPA. Falls back to the original syllable if conversion fails.

        Args:
            phones (List[str]): List of pinyin syllables or phrases.

        Returns:
            List[str]: Corresponding IPA or original syllables. A " " entry
                is passed through unchanged; every other entry yields one
                output string.
        """
        ipa_phones: List[str] = []
        for p in phones:
            if p == " ":
                ipa_phones.append(" ")
                continue
            converted: List[str] = []
            for sp in p.split():  # G2P might return phrases with multiple syllables
                try:
                    # NOTE(review): `[0][0]` keeps only the first element of
                    # the first result - confirm against pinyin_to_ipa's
                    # return shape that this is the whole syllable.
                    pho = self.pinyin_to_ipa(sp)[0][0]
                    if self.retone:
                        pho = self._retone(pho)
                except Exception:
                    # Best effort: tokens the converter rejects (punctuation,
                    # latin text, ...) are kept as-is rather than dropped.
                    pho = sp
                converted.append(pho)
            ipa_phones.append("".join(converted))
        return ipa_phones

    def phonemize_to_list(self, text: str, lang: str) -> List[str]:
        """
        Converts input text to a list of phoneme tokens.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            List[str]: Pinyin tokens, or IPA tokens when the configured
                alphabet is Alphabet.IPA. With Jieba enabled, a " " token
                follows the tokens of every segmented chunk.
        """
        # Called for validation only; raises on unsupported languages.
        self.get_lang(lang)
        if self.jieba:
            import jieba
            phones: List[str] = []
            for chunk in jieba.cut(text, cut_all=False):
                if chunk.isdigit():
                    chunk = num2str(chunk)
                phones.extend(self.get_pinyin(chunk))
                phones.append(" ")  # keep jieba whitespace
        else:
            phones = self.get_pinyin(text)
        if self.alphabet == Alphabet.IPA:
            phones = self.to_ipa(phones)
        return phones

    def phonemize_string(self, text: str, lang: str = "zh") -> str:
        """
        Converts input text to a single phoneme string.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            str: The tokens from `phonemize_to_list` concatenated without a
                separator; the only spaces are the " " tokens already
                present in that list.
        """
        phones: List[str] = self.phonemize_to_list(text, lang)
        return "".join(phones)

    @abc.abstractmethod
    def get_pinyin(self, text: str) -> List[str]:
        """
        Abstract method to be implemented by subclasses for converting text to pinyin.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: List of pinyin tokens.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|
170
|
+
|
171
|
+
|
172
|
+
class G2pCPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by g2pC, a CRF-based grapheme-to-phoneme
    converter.
    https://github.com/Kyubyong/g2pC
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True):
        # g2pc is imported on construction so it stays an optional dependency.
        from g2pc import G2pC

        self.g2p = G2pC()
        super().__init__(alphabet=alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run g2pC over the text and collect one pinyin token per result.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Pinyin tokens, taken from index 3 of every entry
                g2pC returns.
        """
        tokens = []
        for entry in self.g2p(text):
            tokens.append(entry[3])
        return tokens
|
194
|
+
|
195
|
+
|
196
|
+
class G2pMPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by g2pM, a neural grapheme-to-phoneme
    conversion package for Mandarin Chinese.
    https://github.com/kakaobrain/g2pm
    """

    def __init__(self, alphabet=Alphabet.PINYIN, tone: bool = True, char_split: bool = False, jieba: bool = True):
        # g2pM is imported on construction so it stays an optional dependency.
        from g2pM import G2pM

        self.g2p = G2pM()
        self.tone = tone
        self.char_split = char_split
        super().__init__(alphabet=alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run g2pM over the text.

        The `tone` and `char_split` options stored at construction time are
        forwarded to the model unchanged.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Pinyin tokens exactly as returned by g2pM.
        """
        options = {"tone": self.tone, "char_split": self.char_split}
        return self.g2p(text, **options)
|
220
|
+
|
221
|
+
|
222
|
+
class XpinyinPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by xpinyin (basic pinyin generator with
    optional tone marks).
    """

    def __init__(self, alphabet=Alphabet.PINYIN, tone_marks: str = "numbers", jieba: bool = True):
        # xpinyin is imported on construction so it stays an optional dependency.
        from xpinyin import Pinyin

        self.g2p = Pinyin()
        self.tone_marks = tone_marks
        super().__init__(alphabet=alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run xpinyin over the text and split its output into tokens.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Pinyin tokens, obtained by splitting the string
                returned by xpinyin on "-".
        """
        joined = self.g2p.get_pinyin(text, tone_marks=self.tone_marks)
        return joined.split("-")
|
244
|
+
|
245
|
+
|
246
|
+
class PypinyinPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by pypinyin (comprehensive and accurate pinyin
    library).
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True):
        # pypinyin is imported on construction so it stays an optional dependency.
        from pypinyin import pinyin

        self.g2p = pinyin
        super().__init__(alphabet=alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run pypinyin over the text and keep one reading per entry.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Pinyin tokens, the first element of every entry
                returned by `pypinyin.pinyin`.
        """
        tokens = []
        for readings in self.g2p(text):
            tokens.append(readings[0])
        return tokens
|
267
|
+
|
268
|
+
|
269
|
+
if __name__ == "__main__":
    # Manual smoke test: run one sample sentence through every phonemizer
    # defined in this module (plus Misaki) and print the resulting tokens.
    # Each backend needs its own optional third-party package installed.
    lang = "zh"
    text = "然而,他红了20年以后,他竟退出了大家的视线。"

    pho = JiebaPhonemizer()
    # The G2pC backend is left out of this demo. If it is re-enabled, note
    # that G2pCPhonemizer takes `alphabet=...`; it has no `ipa` argument.
    pho2 = G2pMPhonemizer()
    pho3 = XpinyinPhonemizer()
    pho4 = PypinyinPhonemizer()

    # Imported here rather than at module level; only this demo uses it.
    from phoonnx.phonemizers.mul import MisakiPhonemizer

    pho5 = MisakiPhonemizer()

    print(f"\n--- Getting phonemes for '{text}' ---")

    phones = pho5.phonemize_to_list(text, lang)
    print(f" Misaki: {phones}")

    phones = pho2.phonemize_to_list(text, lang)
    print(f" G2pM: {phones}")

    phones = pho3.phonemize_to_list(text, lang)
    print(f" Xpinyin: {phones}")

    phones = pho4.phonemize_to_list(text, lang)
    print(f" Pypinyin: {phones}")

    # NOTE(review): JiebaPhonemizer only defines phonemize_string in this
    # file; this call relies on BasePhonemizer providing phonemize_to_list.
    phones = pho.phonemize_to_list(text, lang)
    print(f" Jieba: {phones}")

    exit()
    # Sample output recorded from an earlier run (backend order unknown):
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 2 0 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 20 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 20 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('rán ér , tā hóng le 20 nián yǐ hòu , tā jìng tuì chū le dà jiā de shì xiàn 。', '.', True)]
|
File without changes
|
@@ -0,0 +1,249 @@
|
|
1
|
+
# taken from https://github.com/chorusai/arpa2ipa/blob/master/arpa2ipa/_arpa_to_ipa.py
|
2
|
+
import re
|
3
|
+
|
4
|
+
"""
|
5
|
+
https://en.wikipedia.org/wiki/Arpabet
|
6
|
+
|
7
|
+
In Arpabet, every phoneme is represented by one or two capital letters.
|
8
|
+
Digits are used as stress indicators and are placed at the end of the
|
9
|
+
stressed syllabic vowel. Punctuation marks are used like in the written
|
10
|
+
language, to represent intonation changes at the end of clauses and
|
11
|
+
sentences. The stress values are:
|
12
|
+
|
13
|
+
Value | Description
|
14
|
+
0 | No stress
|
15
|
+
1 | Primary stress
|
16
|
+
2 | Secondary stress
|
17
|
+
|
18
|
+
** Vowels
|
19
|
+
*** Monophthongs
|
20
|
+
|
21
|
+
Arpabet IPA Word examples
|
22
|
+
AO ɔ off (AO1 F); fall (F AO1 L); frost (F R AO1 S T)
|
23
|
+
AA ɑ father (F AA1 DH ER), cot (K AA1 T)
|
24
|
+
IY i bee (B IY1); she (SH IY1)
|
25
|
+
UW u you (Y UW1); new (N UW1); food (F UW1 D)
|
26
|
+
EH ɛ red (R EH1 D); men (M EH1 N) (modern versions use e)
|
27
|
+
IH ɪ big (B IH1 G); win (W IH1 N)
|
28
|
+
UH ʊ should (SH UH1 D), could (K UH1 D)
|
29
|
+
AH ʌ but (B AH1 T), sun (S AH1 N)
|
30
|
+
AH(0) ə sofa (S OW1 F AH0), alone (AH0 L OW1 N)
|
31
|
+
AX ə discus (D IH1 S K AX0 S);
|
32
|
+
AE æ at (AE1 T); fast (F AE1 S T)
|
33
|
+
*** Diphthongs
|
34
|
+
|
35
|
+
Arpabet IPA Word Examples
|
36
|
+
EY eɪ say (S EY1); eight (EY1 T)
|
37
|
+
AY aɪ my (M AY1); why (W AY1); ride (R AY1 D)
|
38
|
+
OW oʊ show (SH OW1); coat (K OW1 T)
|
39
|
+
AW aʊ how (HH AW1); now (N AW1)
|
40
|
+
OY ɔɪ boy (B OY1); toy (T OY1)
|
41
|
+
*** R-colored vowels
|
42
|
+
|
43
|
+
Arpabet IPA Word Examples
|
44
|
+
ER ɝ her (HH ER0); bird (B ER1 D); hurt (HH ER1 T), nurse (N ER1 S)
|
45
|
+
AXR ɚ father (F AA1 DH AXR); coward (K AW1 AXR D)
|
46
|
+
EH R ɛr air (EH1 R); where (W EH1 R); hair (HH EH1 R)
|
47
|
+
UH R ʊr cure (K Y UH1 R); bureau (B Y UH1 R OW0), detour (D IH0 T
|
48
|
+
UH1 R)
|
49
|
+
AO R ɔr more (M AO1 R); bored (B AO1 R D); chord (K AO1 R D)
|
50
|
+
AA R ɑr large (L AA1 R JH); hard (HH AA1 R D)
|
51
|
+
IH R or IY R ɪr ear (IY1 R); near (N IH1 R)
|
52
|
+
AW R aʊr This seems to be a rarely used r-controlled vowel. In some
|
53
|
+
dialects flower (F L AW1 R; in other dialects F L AW1 ER0)
|
54
|
+
** Consonants
|
55
|
+
*** Stops
|
56
|
+
Arpabet IPA Word Examples
|
57
|
+
P p pay (P EY1)
|
58
|
+
B b buy (B AY1)
|
59
|
+
T t take (T EY1 K)
|
60
|
+
D d day (D EY1)
|
61
|
+
K k key (K IY1)
|
62
|
+
G ɡ go (G OW1)
|
63
|
+
|
64
|
+
*** Affricates
|
65
|
+
Arpabet IPA Word Examples
|
66
|
+
CH tʃ chair (CH EH1 R)
|
67
|
+
JH dʒ just (JH AH1 S T); gym (JH IH1 M)
|
68
|
+
|
69
|
+
*** Fricatives
|
70
|
+
Arpabet IPA Word Examples
|
71
|
+
F f for (F AO1 R)
|
72
|
+
V v very (V EH1 R IY0)
|
73
|
+
TH θ thanks (TH AE1 NG K S); Thursday (TH ER1 Z D EY2)
|
74
|
+
DH ð that (DH AE1 T); the (DH AH0); them (DH EH1 M)
|
75
|
+
S s say (S EY1)
|
76
|
+
Z z zoo (Z UW1)
|
77
|
+
SH ʃ show (SH OW1)
|
78
|
+
ZH ʒ measure (M EH1 ZH ER0); pleasure (P L EH1 ZH ER)
|
79
|
+
HH h house (HH AW1 S)
|
80
|
+
|
81
|
+
*** Nasals
|
82
|
+
Arpabet IPA Word Examples
|
83
|
+
M m man (M AE1 N)
|
84
|
+
EM m̩ keep 'em (K IY1 P EM)
|
85
|
+
N n no (N OW1)
|
86
|
+
EN n̩ button (B AH1 T EN)
|
87
|
+
NG ŋ sing (S IH1 NG)
|
88
|
+
ENG ŋ̍ Washington (W AO1 SH ENG T EN)
|
89
|
+
|
90
|
+
*** Liquids
|
91
|
+
Arpabet IPA Word Examples
|
92
|
+
L l or ɫ late (L EY1 T); fail (F EY1 L)
|
93
|
+
EL ɫ̩ bottle (B AO1 DX EL)
|
94
|
+
R r or ɹ run (R AH1 N)
|
95
|
+
DX ɾ wetter (W EH1 DX AXR)
|
96
|
+
NX ɾ̃ wintergreen (W IY2 NX AXR G R IY1 N)
|
97
|
+
|
98
|
+
*** Semivowels
|
99
|
+
Arpabet IPA Word Examples
|
100
|
+
Y j yes (Y EH1 S)
|
101
|
+
W w way (W EY1)
|
102
|
+
Q ʔ (glottal stop) uh-oh (Q AH1 Q OW) (ʔʌʔoʊ)
|
103
|
+
(missing) hw or ʍ "when" etc. in some dialects
|
104
|
+
|
105
|
+
"""
|
106
|
+
# ARPAbet -> IPA tables, one per phoneme class. Vowel tables hold four keys
# per base symbol: the bare symbol plus the stress-marked forms 0, 1 and 2.
monopthongs = {
    base + stress: ipa
    for base, ipa in (
        ('AO', 'ɔ'),
        ('AA', 'ɑ'),
        ('IY', 'i'),
        ('UW', 'u'),
        ('EH', 'e'),
        ('IH', 'ɪ'),
        ('UH', 'ʊ'),
        ('AH', 'ʌ'),
        ('AE', 'æ'),
        ('AX', 'ə'),
    )
    for stress in ('', '0', '1', '2')
}
# Unstressed AH is the one stress-dependent entry: it maps to schwa.
monopthongs['AH0'] = 'ə'

dipthongs = {
    base + stress: ipa
    for base, ipa in (
        ('EY', 'eɪ'),
        ('AY', 'aɪ'),
        ('OW', 'oʊ'),
        ('AW', 'aʊ'),
        ('OY', 'ɔɪ'),
    )
    for stress in ('', '0', '1', '2')
}

r_colored_vowels = {
    base + stress: ipa
    for base, ipa in (
        ('ER', 'ɜr'),
        ('AXR', 'ər'),
    )
    for stress in ('', '0', '1', '2')
}

stops = dict(P='p', B='b', T='t', D='d', K='k', G='g')

affricates = dict(CH='tʃ', JH='dʒ')

fricatives = dict(
    F='f',
    V='v',
    TH='θ',
    DH='ð',
    S='s',
    Z='z',
    SH='ʃ',
    ZH='ʒ',
    HH='h',
)

nasals = dict(
    M='m',
    EM='m̩',
    N='n',
    EN='n̩',
    NG='ŋ',
    ENG='ŋ̍',
)

liquids = dict(
    L='l',
    EL='ɫ̩',
    R='r',
    DX='ɾ',
    NX='ɾ̃',
)

semivowels = dict(W='w', Y='j', Q='ʔ')

# Single lookup covering every class above, merged in declaration order.
arpa_to_ipa_lookup = {
    **monopthongs,
    **dipthongs,
    **r_colored_vowels,
    **stops,
    **affricates,
    **fricatives,
    **nasals,
    **liquids,
    **semivowels,
}
|
242
|
+
|
243
|
+
|
244
|
+
def split_on_capital(camel):
    """Split a camelCase / PascalCase identifier into lowercase words.

    Each word starts at an uppercase ASCII letter and runs over the
    lowercase ASCII letters that follow it. Only the first character is
    upper-cased before matching, so a leading lowercase word is kept and
    interior capitals still mark word boundaries (``str.title()`` would
    lowercase them and defeat the split). Characters that are not ASCII
    letters are dropped.

    Args:
        camel (str): Identifier such as ``"helloWorld"``.

    Returns:
        str: The words in lowercase, joined by single spaces; ``""`` for
            empty input.
    """
    capitalised = camel[:1].upper() + camel[1:]
    return ' '.join(re.findall('[A-Z][a-z]*', capitalised)).lower()
|
246
|
+
|
247
|
+
|
248
|
+
def arpa_to_ipa(arpa):
    """Convert an ARPAbet transcription to IPA.

    Tokens are separated by any run of whitespace, so leading or trailing
    whitespace and repeated spaces are tolerated, and an empty string maps
    to an empty string.

    Args:
        arpa (str): ARPAbet phonemes, e.g. ``"HH AW1 S"``. Each token must
            be a key of ``arpa_to_ipa_lookup``.

    Returns:
        str: The IPA symbols for each token, joined by single spaces.

    Raises:
        KeyError: If a token is not present in ``arpa_to_ipa_lookup``.
    """
    # split() with no argument never yields empty tokens, unlike split(' ').
    return ' '.join(arpa_to_ipa_lookup[phoneme] for phoneme in arpa.split())
|
Binary file
|
Binary file
|