phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,308 @@
1
+ import abc
2
+ from typing import List
3
+
4
+
5
+ from phoonnx.phonemizers.base import BasePhonemizer
6
+ from phoonnx.thirdparty.zh_num import num2str
7
+ from phoonnx.config import Alphabet
8
+
9
+
10
class JiebaPhonemizer(BasePhonemizer):
    """
    Word segmenter for Chinese text built on Jieba.

    No phonemization takes place: the text is cut into words, digit-only
    words are passed through ``num2str`` and the words are joined with
    single spaces.
    """

    def __init__(self):
        super().__init__(Alphabet.HANZI)

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Match ``target_lang`` against the supported language list (only "zh").

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: Whatever ``match_lang`` returns for the given code.
        """
        # the lookup exists so that an invalid language makes match_lang fail
        supported = ["zh"]
        return cls.match_lang(target_lang, supported)

    def phonemize_string(self, text: str, lang: str = "zh") -> str:
        """
        Split Chinese text into space-separated words.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            str: The Jieba words joined by single spaces, with digit-only
                words replaced by the output of ``num2str``.
        """
        import jieba

        # called for validation only; the matched code itself is not used
        self.get_lang(lang)

        words = []
        for word in jieba.cut(text, cut_all=False):
            if word.isdigit():
                word = num2str(word)
            words.append(word)
        return " ".join(words)
51
+
52
+
53
class BaseChinesePinyinPhonemizer(BasePhonemizer):
    """
    Base class for Chinese phonemizers using different pinyin G2P libraries.
    Supports optional IPA conversion and segmentation via Jieba.

    Subclasses only have to implement ``get_pinyin``.
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True, retone=True):
        """
        Initializes the phonemizer.

        Args:
            alphabet: Output alphabet, either Alphabet.PINYIN or Alphabet.IPA.
            jieba (bool): Whether to segment text using Jieba before phonemization.
            retone (bool): Whether IPA output is post-processed by ``_retone``.
                Only consulted when converting to IPA.

        Raises:
            ValueError: If ``alphabet`` is neither PINYIN nor IPA.
        """
        # explicit raise instead of assert so the check survives ``python -O``
        if alphabet not in (Alphabet.PINYIN, Alphabet.IPA):
            raise ValueError(
                f"alphabet must be Alphabet.PINYIN or Alphabet.IPA, got {alphabet!r}"
            )
        super().__init__(alphabet)
        self.jieba = jieba
        self.retone = retone
        # imported lazily so the module can be imported without the package
        from pinyin_to_ipa import pinyin_to_ipa
        self.pinyin_to_ipa = pinyin_to_ipa

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The value returned by ``match_lang`` for the given code.
        """
        # this check is here only to throw an exception if invalid language is provided
        return cls.match_lang(target_lang, ["zh"])

    @staticmethod
    def _retone(p):
        """
        Replace IPA tone letters by single arrow symbols and rewrite the
        syllabic approximants as 'ɨ'.

        Args:
            p (str): IPA text for a syllable.

        Returns:
            str: The rewritten IPA text.

        Raises:
            ValueError: If a syllabic mark (U+0329) is left after rewriting.
        """
        # longer tone sequences first, otherwise the bare '˥' rule would
        # consume part of '˧˥' and '˥˩'
        p = p.replace('˧˩˧', '↓')  # third tone
        p = p.replace('˧˥', '↗')  # second tone
        p = p.replace('˥˩', '↘')  # fourth tone
        p = p.replace('˥', '→')  # first tone
        syllabic = chr(809)  # U+0329 COMBINING VERTICAL LINE BELOW
        p = p.replace(chr(635) + syllabic, 'ɨ').replace(chr(633) + syllabic, 'ɨ')
        # explicit raise instead of assert so behaviour is the same under -O
        if syllabic in p:
            raise ValueError(f"unexpected syllabic mark in {p!r}")
        return p

    def to_ipa(self, phones: List[str]) -> List[str]:
        """
        Converts a list of pinyin syllables to IPA. Falls back to the original syllable if conversion fails.

        Args:
            phones (List[str]): List of pinyin syllables or phrases.

        Returns:
            List[str]: Corresponding IPA or original syllables, one entry per
                input entry.
        """
        ipa_phones: List[str] = []
        for p in phones:
            if p == " ":
                # word separators are passed through untouched
                ipa_phones.append(" ")
                continue
            converted: List[str] = []
            for sp in p.split():  # G2P might return phrases with multiple syllables
                try:
                    # the first result holds the phonemes of the syllable;
                    # join all of them rather than keeping only the first one
                    pho = "".join(self.pinyin_to_ipa(sp)[0])
                    if self.retone:
                        pho = self._retone(pho)
                except Exception:
                    # best effort: keep tokens that are not valid pinyin
                    # (punctuation, latin text, ...) instead of dropping them
                    pho = sp
                converted.append(pho)
            ipa_phones.append("".join(converted))
        return ipa_phones

    def phonemize_to_list(self, text: str, lang: str) -> List[str]:
        """
        Converts input text to a list of phoneme tokens.

        With Jieba enabled, every segmented word contributes its pinyin
        tokens followed by a single " " entry. Digit-only words are first
        passed through ``num2str``.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            List[str]: Pinyin tokens, or their IPA conversion when the
                phonemizer alphabet is Alphabet.IPA.
        """
        # called for validation only; the matched code itself is not used
        self.get_lang(lang)
        if self.jieba:
            import jieba
            phones: List[str] = []
            for chunk in jieba.cut(text, cut_all=False):
                if chunk.isdigit():
                    chunk = num2str(chunk)
                phones.extend(self.get_pinyin(chunk))
                phones.append(" ")  # keep jieba whitespace
        else:
            phones = self.get_pinyin(text)
        if self.alphabet == Alphabet.IPA:
            phones = self.to_ipa(phones)
        return phones

    def phonemize_string(self, text: str, lang: str = "zh") -> str:
        """
        Converts input text to a phoneme string.

        The tokens from ``phonemize_to_list`` are concatenated without a
        separator, so the only spaces in the result are the " " entries
        already present in that list.

        Args:
            text (str): The input sentence.
            lang (str): Language code (must be "zh").

        Returns:
            str: Concatenated phoneme string.
        """
        phones: List[str] = self.phonemize_to_list(text, lang)
        return "".join(phones)

    @abc.abstractmethod
    def get_pinyin(self, text: str) -> List[str]:
        """
        Abstract method to be implemented by subclasses for converting text to pinyin.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: List of pinyin tokens.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # NotImplemented is reserved for binary operators; fail loudly instead
        raise NotImplementedError
170
+
171
+
172
class G2pCPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by g2pc, a CRF-based grapheme-to-phoneme converter.
    https://github.com/Kyubyong/g2pC
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True):
        # the backend is imported only when this phonemizer is instantiated
        from g2pc import G2pC

        converter = G2pC()
        self.g2p = converter
        super().__init__(alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run g2pc on ``text`` and collect the pinyin field of each result.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Element 3 of every entry returned by g2pc.
        """
        tokens = []
        for entry in self.g2p(text):
            tokens.append(entry[3])
        return tokens
194
+
195
+
196
class G2pMPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by g2pM, a neural grapheme-to-phoneme package
    for Mandarin Chinese.
    https://github.com/kakaobrain/g2pm
    """

    def __init__(self, alphabet=Alphabet.PINYIN, tone: bool = True, char_split: bool = False, jieba: bool = True):
        # the backend is imported only when this phonemizer is instantiated
        from g2pM import G2pM

        converter = G2pM()
        self.g2p = converter
        # both flags are forwarded unchanged to every g2pM call
        self.tone = tone
        self.char_split = char_split
        super().__init__(alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run g2pM on ``text`` with the configured ``tone`` and ``char_split``.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: The tokens returned by g2pM.
        """
        tokens = self.g2p(text, tone=self.tone, char_split=self.char_split)
        return tokens
220
+
221
+
222
class XpinyinPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by xpinyin (basic pinyin generator with
    optional tone marks).
    """

    def __init__(self, alphabet=Alphabet.PINYIN, tone_marks: str = "numbers", jieba: bool = True):
        # the backend is imported only when this phonemizer is instantiated
        from xpinyin import Pinyin

        converter = Pinyin()
        self.g2p = converter
        # forwarded unchanged to xpinyin on every call
        self.tone_marks = tone_marks
        super().__init__(alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run xpinyin on ``text`` and split its output on "-".

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: The pieces of the xpinyin result between "-" characters.
        """
        hyphenated = self.g2p.get_pinyin(text, tone_marks=self.tone_marks)
        return hyphenated.split("-")
244
+
245
+
246
class PypinyinPhonemizer(BaseChinesePinyinPhonemizer):
    """
    Pinyin phonemizer backed by pypinyin (comprehensive and accurate pinyin
    library).
    """

    def __init__(self, alphabet=Alphabet.PINYIN, jieba: bool = True):
        # the backend is imported only when this phonemizer is instantiated
        from pypinyin import pinyin as pypinyin_convert

        self.g2p = pypinyin_convert
        super().__init__(alphabet, jieba=jieba)

    def get_pinyin(self, text: str) -> List[str]:
        """
        Run pypinyin on ``text`` and keep the first item of every entry.

        Args:
            text (str): Input Chinese text.

        Returns:
            List[str]: Element 0 of every entry returned by pypinyin.
        """
        tokens = []
        for candidates in self.g2p(text):
            tokens.append(candidates[0])
        return tokens
267
+
268
+
269
if __name__ == "__main__":
    # Manual demo: run one sentence through each phonemizer and print the
    # resulting token lists.
    lang = "zh"
    text = "然而,他红了20年以后,他竟退出了大家的视线。"

    # construction order is kept: Jieba, G2pM, Xpinyin, Pypinyin, then Misaki
    # (G2pCPhonemizer is not part of the demo)
    jieba_pho = JiebaPhonemizer()
    g2pm_pho = G2pMPhonemizer()
    xpinyin_pho = XpinyinPhonemizer()
    pypinyin_pho = PypinyinPhonemizer()

    from phoonnx.phonemizers.mul import MisakiPhonemizer

    misaki_pho = MisakiPhonemizer()

    print(f"\n--- Getting phonemes for '{text}' ---")

    # (label, phonemizer) pairs in the order in which results are printed
    demo_runs = (
        ("Misaki", misaki_pho),
        ("G2pM", g2pm_pho),
        ("Xpinyin", xpinyin_pho),
        ("Pypinyin", pypinyin_pho),
        ("Jieba", jieba_pho),
    )
    for label, phonemizer in demo_runs:
        phones = phonemizer.phonemize_to_list(text, lang)
        print(f" {label}: {phones}")

    exit()
    # Sample outputs recorded from earlier runs:
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 2 0 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 20 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('ran2 er2 , ta1 hong2 le5 20 nian2 yi3 hou4 , ta1 jing4 tui4 chu1 le5 da4 jia1 de5 shi4 xian4 。', '.', True)]
    # Phonemes: [('rán ér , tā hóng le 20 nián yǐ hòu , tā jìng tuì chū le dà jiā de shì xiàn 。', '.', True)]
File without changes
@@ -0,0 +1,249 @@
1
+ # taken from https://github.com/chorusai/arpa2ipa/blob/master/arpa2ipa/_arpa_to_ipa.py
2
+ import re
3
+
4
+ """
5
+ https://en.wikipedia.org/wiki/Arpabet
6
+
7
+ In Arpabet, every phoneme is represented by one or two capital letters.
8
+ Digits are used as stress indicators and are placed at the end of the
9
+ stressed syllabic vowel. Punctuation marks are used like in the written
10
+ language, to represent intonation changes at the end of clauses and
11
+ sentences. The stress values are:
12
+
13
+ Value | Description
14
+ 0 | No stress
15
+ 1 | Primary stress
16
+ 2 | Secondary stress
17
+
18
+ ** Vowels
19
+ *** Monophthongs
20
+
21
+ Arpabet IPA Word examples
22
+ AO ɔ off (AO1 F); fall (F AO1 L); frost (F R AO1 S T)
23
+ AA ɑ father (F AA1 DH ER), cot (K AA1 T)
24
+ IY i bee (B IY1); she (SH IY1)
25
+ UW u you (Y UW1); new (N UW1); food (F UW1 D)
26
+ EH ɛ red (R EH1 D); men (M EH1 N) (modern versions use e)
27
+ IH ɪ big (B IH1 G); win (W IH1 N)
28
+ UH ʊ should (SH UH1 D), could (K UH1 D)
29
+ AH ʌ but (B AH1 T), sun (S AH1 N)
30
+ AH(0) ə sofa (S OW1 F AH0), alone (AH0 L OW1 N)
31
+ AX ə discus (D IH1 S K AX0 S);
32
+ AE æ at (AE1 T); fast (F AE1 S T)
33
+ *** Diphthongs
34
+
35
+ Arpabet IPA Word Examples
36
+ EY eɪ say (S EY1); eight (EY1 T)
37
+ AY aɪ my (M AY1); why (W AY1); ride (R AY1 D)
38
+ OW oʊ show (SH OW1); coat (K OW1 T)
39
+ AW aʊ how (HH AW1); now (N AW1)
40
+ OY ɔɪ boy (B OY1); toy (T OY1)
41
+ *** R-colored vowels
42
+
43
+ Arpabet IPA Word Examples
44
+ ER ɝ her (HH ER0); bird (B ER1 D); hurt (HH ER1 T), nurse (N ER1 S)
45
+ AXR ɚ father (F AA1 DH AXR); coward (K AW1 AXR D)
46
+ EH R ɛr air (EH1 R); where (W EH1 R); hair (HH EH1 R)
47
+ UH R ʊr cure (K Y UH1 R); bureau (B Y UH1 R OW0), detour (D IH0 T
48
+ UH1 R)
49
+ AO R ɔr more (M AO1 R); bored (B AO1 R D); chord (K AO1 R D)
50
+ AA R ɑr large (L AA1 R JH); hard (HH AA1 R D)
51
+ IH R or IY R ɪr ear (IY1 R); near (N IH1 R)
52
+ AW R aʊr This seems to be a rarely used r-controlled vowel. In some
53
+ dialects flower (F L AW1 R; in other dialects F L AW1 ER0)
54
+ ** Consonants
55
+ *** Stops
56
+ Arpabet IPA Word Examples
57
+ P p pay (P EY1)
58
+ B b buy (B AY1)
59
+ T t take (T EY1 K)
60
+ D d day (D EY1)
61
+ K k key (K IY1)
62
+ G ɡ go (G OW1)
63
+
64
+ *** Affricates
65
+ Arpabet IPA Word Examples
66
+ CH tʃ chair (CH EH1 R)
67
+ JH dʒ just (JH AH1 S T); gym (JH IH1 M)
68
+
69
+ *** Fricatives
70
+ Arpabet IPA Word Examples
71
+ F f for (F AO1 R)
72
+ V v very (V EH1 R IY0)
73
+ TH θ thanks (TH AE1 NG K S); Thursday (TH ER1 Z D EY2)
74
+ DH ð that (DH AE1 T); the (DH AH0); them (DH EH1 M)
75
+ S s say (S EY1)
76
+ Z z zoo (Z UW1)
77
+ SH ʃ show (SH OW1)
78
+ ZH ʒ measure (M EH1 ZH ER0); pleasure (P L EH1 ZH ER)
79
+ HH h house (HH AW1 S)
80
+
81
+ *** Nasals
82
+ Arpabet IPA Word Examples
83
+ M m man (M AE1 N)
84
+ EM m̩ keep 'em (K IY1 P EM)
85
+ N n no (N OW1)
86
+ EN n̩ button (B AH1 T EN)
87
+ NG ŋ sing (S IH1 NG)
88
+ ENG ŋ̍ Washington (W AO1 SH ENG T EN)
89
+
90
+ *** Liquids
91
+ Arpabet IPA Word Examples
92
+ L l or ɫ late (L EY1 T); fail (F EY1 L)
93
+ EL ɫ̩ bottle (B AO1 DX EL)
94
+ R r or ɹ run (R AH1 N)
95
+ DX ɾ wetter (W EH1 DX AXR)
96
+ NX ɾ̃ wintergreen (W IY2 NX AXR G R IY1 N)
97
+
98
+ *** Semivowels
99
+ Arpabet IPA Word Examples
100
+ Y j yes (Y EH1 S)
101
+ W w way (W EY1)
102
+ Q ʔ (glottal stop) uh-oh (Q AH1 Q OW) (ʔʌʔoʊ)
103
+ (missing) hw or ʍ "when" etc. in some dialects
104
+
105
+ """
106
def _expand_stress(pairs):
    """Map every symbol, bare and with stress digit 0, 1 and 2, to its IPA value."""
    return {
        symbol + stress: ipa
        for symbol, ipa in pairs
        for stress in ('', '0', '1', '2')
    }


# Vowel tables: one entry per symbol and stress variant.
monopthongs = _expand_stress([
    ('AO', 'ɔ'),
    ('AA', 'ɑ'),
    ('IY', 'i'),
    ('UW', 'u'),
    ('EH', 'e'),
    ('IH', 'ɪ'),
    ('UH', 'ʊ'),
    ('AH', 'ʌ'),
    ('AE', 'æ'),
    ('AX', 'ə'),
])
# unstressed AH is the only variant whose value differs from its base symbol
monopthongs['AH0'] = 'ə'

dipthongs = _expand_stress([
    ('EY', 'eɪ'),
    ('AY', 'aɪ'),
    ('OW', 'oʊ'),
    ('AW', 'aʊ'),
    ('OY', 'ɔɪ'),
])

r_colored_vowels = _expand_stress([
    ('ER', 'ɜr'),
    ('AXR', 'ər'),
])

# Consonant tables: these symbols have no stress variants.
stops = dict([
    ('P', 'p'),
    ('B', 'b'),
    ('T', 't'),
    ('D', 'd'),
    ('K', 'k'),
    ('G', 'g'),
])

affricates = dict([
    ('CH', 'tʃ'),
    ('JH', 'dʒ'),
])

fricatives = dict([
    ('F', 'f'),
    ('V', 'v'),
    ('TH', 'θ'),
    ('DH', 'ð'),
    ('S', 's'),
    ('Z', 'z'),
    ('SH', 'ʃ'),
    ('ZH', 'ʒ'),
    ('HH', 'h'),
])

nasals = dict([
    ('M', 'm'),
    ('EM', 'm̩'),
    ('N', 'n'),
    ('EN', 'n̩'),
    ('NG', 'ŋ'),
    ('ENG', 'ŋ̍'),
])

liquids = dict([
    ('L', 'l'),
    ('EL', 'ɫ̩'),
    ('R', 'r'),
    ('DX', 'ɾ'),
    ('NX', 'ɾ̃'),
])

semivowels = dict([
    ('W', 'w'),
    ('Y', 'j'),
    ('Q', 'ʔ'),
])

# Single lookup table used for conversion; the category tables are merged
# in the same order as before.
arpa_to_ipa_lookup = {
    **monopthongs,
    **dipthongs,
    **r_colored_vowels,
    **stops,
    **affricates,
    **fricatives,
    **nasals,
    **liquids,
    **semivowels,
}
242
+
243
+
244
def split_on_capital(camel):
    """
    Title-case ``camel``, split it into capitalised words and return them
    lower-cased and separated by single spaces.

    Args:
        camel (str): The text to split.

    Returns:
        str: The lower-cased words joined by spaces; "" when no letters are found.
    """
    # NOTE(review): str.title() lower-cases interior capitals, so "camelCase"
    # comes back as "camelcase"; splits only happen at non-letter boundaries.
    # Kept as-is; confirm the intended behaviour against upstream.
    words = re.findall('[A-Z][a-z]*', camel.title())
    # the result used to be computed and then discarded (missing return)
    return ' '.join(words).lower()
246
+
247
+
248
def arpa_to_ipa(arpa):
    """
    Convert a whitespace-separated ARPAbet transcription to IPA.

    Args:
        arpa (str): ARPAbet symbols as used in ``arpa_to_ipa_lookup``,
            e.g. "HH AH0 L OW1".

    Returns:
        str: The IPA value of every symbol, joined by single spaces.
            Empty or whitespace-only input yields "".

    Raises:
        KeyError: If a symbol is missing from ``arpa_to_ipa_lookup``.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace; split(' ') produced '' tokens there and failed with KeyError
    return ' '.join(arpa_to_ipa_lookup[phoneme] for phoneme in arpa.split())