phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,783 @@
1
+ import csv
2
+ import math
3
+ import os.path
4
+ from base64 import b64decode
5
+ from pathlib import Path
6
+ from typing import Union, List, Dict
7
+
8
+ import regex as re
9
+
10
+
11
+ # ----------------------------
12
+ # Classes and Helper Functions
13
+ # ----------------------------
14
+
15
class ConversionTable:
    """Column-oriented lookup table loaded from ``<name>.csv`` in *tables_dir*.

    Every CSV header becomes an attribute holding a tuple of that column's
    values, so a table with ``_from``/``_to`` columns supports phoneme mapping.
    """

    def __init__(self, name: str, tables_dir: Path):
        self.name = name
        # Locate the CSV file for this table inside the tables folder.
        table_path = tables_dir / f'{self.name}.csv'
        if not table_path.exists():
            raise FileNotFoundError(f"无法找到转换表文件: {table_path}")
        with open(table_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=',')
            # Accumulate each column into a list attribute named after its header.
            for row in reader:
                for header, value in row.items():
                    if not hasattr(self, header):
                        setattr(self, header, [])
                    getattr(self, header).append(value)
            # Freeze every column as a tuple: the table is read-only after load.
            for header in reader.fieldnames:
                setattr(self, header, tuple(getattr(self, header)))

    def apply(self, text: str, find_in: str = '_from') -> str:
        """Map a single phoneme found in column *find_in* to its ``_to`` entry.

        Returns *text* unchanged when the column is missing or has no match.
        """
        try:
            column = getattr(self, find_in)
            return self._to[column.index(text)]
        except (AttributeError, ValueError):
            return text

    def sub(self, text: str, find_in: str = '_from') -> str:
        """Substring-replace every *find_in* entry in *text* with its ``_to`` value."""
        column = getattr(self, find_in)
        for position, source in enumerate(column):
            text = text.replace(source, self._to[position])
        return text

    def safe_index(self, attribute: str, element: str) -> int:
        """Like ``tuple.index`` on the *attribute* column, but -1 when absent."""
        column = getattr(self, attribute)
        return column.index(element) if element in column else -1

    def __str__(self):
        return f'ConversionTable {self.name}'
61
+
62
+
63
class Word:
    """A hangul word together with its jamo decomposition and C/V skeleton.

    ``jamo`` holds the decomposed string (empty onsets removed, double codas
    split); ``cv`` marks each jamo as 'C' (consonant) or 'V' (vowel).
    Reassigning ``jamo`` recomputes ``cv`` automatically.
    """

    def __init__(self, hangul: str, tables_dir: Path):
        # word to convert
        self.hangul = hangul
        self.tables_dir = tables_dir
        self._jamo = self.to_jamo(hangul)
        self._cv = self.mark_CV(self.jamo)

    @property
    def jamo(self) -> str:
        """Current jamo string of the word."""
        return self._jamo

    @jamo.setter
    def jamo(self, value: str):
        # Keep the C/V skeleton in sync with every jamo update.
        self._jamo = value
        self._cv = self.mark_CV(self._jamo)

    @property
    def cv(self) -> str:
        """C/V skeleton parallel to ``jamo`` (unclassified chars are skipped)."""
        return self._cv

    def mark_CV(self, jamo: str, convention: ConversionTable = None) -> str:
        """Classify each jamo as 'C' or 'V' using the convention's C/V columns.

        NOTE: loads a fresh 'ipa' ConversionTable on every call when
        *convention* is None — a per-call file read.
        """
        # identify each element in jamo as either consonant or vowel
        r = ''

        if convention is None:
            convention = ConversionTable('ipa', self.tables_dir)

        consonants = convention.C
        vowels = convention.V

        # Characters in neither column contribute nothing, so the result can
        # be shorter than the input.
        for j in jamo:
            if j in vowels:
                r += 'V'
            elif j in consonants:
                r += 'C'
        return r

    def to_jamo(self, hangul: str, no_empty_onset: bool = True, sboundary: bool = False) -> str:
        """Decompose *hangul* into a jamo string.

        Strips non-hangul characters, splits double codas (e.g. ㄳ -> ㄱㅅ),
        and by default removes the soundless syllable-initial ㅇ.
        e.g., input "안녕" output "ㅏㄴㄴㅕㅇ"
        """
        not_hangul = r'[^가-힣ㄱ-ㅎㅏ-ㅣ]'
        cleaned_hangul = re.sub(not_hangul, '', hangul)  # hangul without special characters
        jamo_forms = hangul_to_jamos(cleaned_hangul)

        jamo_forms = self.separate_double_coda(jamo_forms)  # divide double coda (e.g., "ㄳ" -> "ㄱㅅ")

        if no_empty_onset:  # remove soundless syllable initial ㅇ
            jamo_forms = self.remove_empty_onset(jamo_forms)

        if sboundary:
            # not implemented
            pass

        return ''.join(jamo_forms)

    def remove_empty_onset(self, syllables: List[str]) -> List[str]:
        """Drop the leading ㅇ of each syllable (it carries no sound)."""
        r = []
        for syllable in syllables:
            # NOTE(review): assumes every syllable is non-empty — verify for
            # inputs that decompose to empty strings.
            to_append = syllable[1:] if syllable[0] == 'ㅇ' else syllable
            r.append(to_append)
        return r

    def separate_double_coda(self, syllables: List[str]) -> List[str]:
        """Split double-coda jamos (e.g. ㄳ) using the double_coda table."""
        r = []
        CT_double_codas = ConversionTable('double_coda', self.tables_dir)
        for syllable in syllables:
            if len(syllable) < 3:
                r.append(syllable)
                continue
            coda = syllable[2]
            try:
                separated_coda = CT_double_codas._separated[CT_double_codas._double.index(coda)]
                r.append(syllable[:2] + separated_coda)
                continue
            except ValueError:
                # Coda is not a double coda; keep the syllable as-is.
                r.append(syllable)
                continue
        return r

    def __str__(self):
        return self.hangul
145
+
146
+
147
+ # ----------------------------
148
+ # Hangul Tools
149
+ # ----------------------------
150
+
151
GA_CODE = 44032  # The unicode representation of the Korean syllabic orthography starts with GA_CODE
G_CODE = 12593  # The unicode representation of the Korean phonetic (jamo) orthography starts with G_CODE
ONSET = 588  # syllables per onset block (21 vowels * 28 codas)
CODA = 28  # syllables per vowel block (27 codas + open syllable)

# Onset (initial consonant) jamos, indices 0-18.
ONSET_LIST = ('ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')

# Vowel jamos, indices 0-20.
VOWEL_LIST = ('ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ',
              'ㅡ', 'ㅢ', 'ㅣ')

# Coda (final consonant) jamos, indices 0-27; '' marks an open syllable.
CODA_LIST = ('', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ',
             'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')


def hangul_to_jamos(hangul: str) -> List[str]:
    """Decompose each Hangul syllable in *hangul* into its jamo string.

    Non-Hangul characters pass through untouched, one list entry per
    input character (e.g. 가나다라마바사 -> ['ㄱㅏ', 'ㄴㅏ', ...]).
    """
    decomposed = []

    for letter in hangul:
        if re.match(r'^[가-힣]+$', letter):  # precomposed hangul character
            offset = ord(letter) - GA_CODE
            onset_idx, remainder = divmod(offset, ONSET)
            vowel_idx, coda_idx = divmod(remainder, CODA)
            syllable = ONSET_LIST[onset_idx] + VOWEL_LIST[vowel_idx] + CODA_LIST[coda_idx]
        else:  # anything else is kept verbatim
            syllable = letter
        decomposed.append(syllable)

    return decomposed


def jamo_to_hangul(syllable: str) -> str:
    """Compose one syllable's worth of jamos into a single hangul character.

    Inputs of length 1 or less are returned unchanged; a two-jamo input is
    treated as an open syllable (no coda).
    """
    if len(syllable) > 1:
        jamos = list(syllable)
        onset_idx = ONSET_LIST.index(jamos[0])
        vowel_idx = VOWEL_LIST.index(jamos[1])
        coda_idx = CODA_LIST.index(jamos[2]) if len(syllable) == 3 else 0

        # Inverse of the decomposition arithmetic above.
        code_point = ((onset_idx * 21) + vowel_idx) * 28 + coda_idx + GA_CODE
        syllable = chr(code_point)
    return syllable
200
+
201
+
202
+ # ----------------------------
203
+ # Hanja Tools
204
+ # ----------------------------
205
+
206
# Vowels/glides that trigger the second step of the word-initial rule (두음규칙).
HIGHV_DIPHTHONGS = ("ㅑ", "ㅕ", "ㅖ", "ㅛ", "ㅠ", "ㅣ")


def realize_hanja(raw: str) -> str:
    """Turn a Unicode code-point label such as 'U+349A' into the character 㒚."""
    # strip() removes the meaningless 'U+' prefix; hex digits are never stripped.
    hex_digits = raw.strip('U+')
    return chr(int(hex_digits, 16))
214
+
215
+
216
def load_jajeon(tables_dir: Path) -> Dict[str, str]:
    """Load the hanja-to-hangul dictionary from ``hanja.tsv`` in *tables_dir*.

    Keys are actual hanja characters, values their hangul readings.
    """
    jajeon_path = tables_dir / 'hanja.tsv'
    if not jajeon_path.exists():
        raise FileNotFoundError(f"无法找到汉字转换表文件: {jajeon_path}")

    jajeon: Dict[str, str] = {}
    with open(jajeon_path, newline='', encoding='utf-8') as f:
        for row in csv.reader(f, delimiter='\t'):
            if len(row) < 2:
                continue  # skip incomplete rows
            # The file stores Unicode code-point labels (e.g. U+349A);
            # convert each label to the actual hanja character.
            jajeon[realize_hanja(row[0])] = row[1]
    return jajeon
232
+
233
+
234
def hanja_to_hangul(jajeon: Dict[str, str], char: str) -> str:
    """Look up the hangul reading of *char*; unknown characters pass through."""
    return jajeon.get(char, char)
240
+
241
+
242
def initial_rule(char: str, jajeon: Dict[str, str]) -> str:
    """Apply the word-initial rule (두음규칙) to one hangul character.

    ㄹ becomes ㄴ, and ㄴ is dropped (realized as ㅇ) before a high
    vowel/glide. *jajeon* is accepted for interface symmetry but unused.
    """
    jamos = ''.join(hangul_to_jamos(char))
    onset, nucleus = jamos[0], jamos[1]

    new_onset = onset
    if new_onset == 'ㄹ':
        new_onset = 'ㄴ'
    # The two steps chain: ㄹ may become ㄴ and then ㅇ before ㅑㅕㅖㅛㅠㅣ.
    if new_onset == 'ㄴ' and nucleus in HIGHV_DIPHTHONGS:
        new_onset = 'ㅇ'

    if new_onset != onset:
        jamos = new_onset + jamos[1:]

    return jamo_to_hangul(jamos)
263
+
264
+
265
def hanja_cleaner(word: str, hanja_loc: List[int], tables_dir: Path) -> str:
    """Replace the hanja characters of *word* (at indices *hanja_loc*) with hangul.

    Handles the irregular readings of 不 in non-final position and applies the
    word-initial rule (두음법칙) when the first character was a hanja.
    """
    jajeon = load_jajeon(tables_dir)
    chars = list(word)

    for i in hanja_loc:
        # NOTE(review): the two list entries below render identically; one is
        # presumably a Unicode compatibility-ideograph variant — verify encoding.
        if chars[i] in ["不", "不"] and (i < len(chars) - 1):  # if 不 appears in a non-final syllable
            if chars[i + 1] == "實":
                # special case: 不實 = 부실
                chars[i] = "부"
                chars[i + 1] = "실"
                continue
            else:
                # special case: 不 is pronounced as 부[pu] before an alveolar ㄷㅈ
                chars[i + 1] = hanja_to_hangul(jajeon, chars[i + 1])
                next_syllable = hangul_to_jamos(chars[i + 1])
                if len(next_syllable) == 0:
                    following_onset = ''
                else:
                    following_onset = ''.join(next_syllable)[0]
                chars[i] = "부" if following_onset in ["ㄷ", "ㅈ"] else "불"
                continue

        chars[i] = hanja_to_hangul(jajeon, chars[i])

        if i == 0:  # apply the 'initial rule' (두음법칙)
            chars[i] = initial_rule(chars[i], jajeon)

    return ''.join(chars)
293
+
294
+
295
+ # ----------------------------
296
+ # Phonological Rules
297
+ # ----------------------------
298
+
299
# Module-level conversion tables; populated by initialize_conversion_tables().
CT_double_codas = None
CT_neutral = None
CT_tensification = None
CT_assimilation = None
CT_aspiration = None
CT_convention = None

# Phoneme inventories; derived from the IPA table by initialize_conversion_tables().
CONSONANTS = ()
VOWELS = ()
C_SONORANTS = ('ㄴ', 'ㄹ', 'ㅇ', 'ㅁ')  # sonorant consonants (fixed set)
OBSTRUENTS = ()
SONORANTS = ()


def initialize_conversion_tables(tables_dir: Path):
    """Load every conversion table from *tables_dir* and derive the phoneme sets.

    Must run before any rule function below — they all read these globals.
    """
    global CT_double_codas, CT_neutral, CT_tensification, CT_assimilation, CT_aspiration, CT_convention
    CT_double_codas = ConversionTable('double_coda', tables_dir)
    CT_neutral = ConversionTable('neutralization', tables_dir)
    CT_tensification = ConversionTable('tensification', tables_dir)
    CT_assimilation = ConversionTable('assimilation', tables_dir)
    CT_aspiration = ConversionTable('aspiration', tables_dir)
    CT_convention = ConversionTable('ipa', tables_dir)

    global CONSONANTS, VOWELS, OBSTRUENTS, SONORANTS
    CONSONANTS = tuple(
        list(CT_convention.C)[:-2])  # from the C column of the IPA table, remove special characters # and $
    VOWELS = tuple(list(CT_convention.V))  # from the V column of the IPA table
    OBSTRUENTS = tuple(set(CONSONANTS) - set(C_SONORANTS))
    SONORANTS = VOWELS + C_SONORANTS
328
+
329
+
330
def get_substring_ind(string: str, pattern: str) -> List[int]:
    """Return start indices of every (possibly overlapping) *pattern* match."""
    lookahead = re.compile(f'(?={pattern})')
    return [m.start() for m in lookahead.finditer(string)]
332
+
333
+
334
def transcribe(jamos: str, convention: ConversionTable = None, str_return: bool = False) -> Union[List[str], str]:
    """Map each jamo to its transcription symbol under *convention*.

    Falls back to the module-level CT_convention when *convention* is None.
    Jamos found in neither the C nor the V column are silently dropped.
    """
    if convention is None:
        convention = CT_convention

    symbols = []
    for jamo in jamos:
        vowel_pos = convention.safe_index('V', jamo)
        consonant_pos = convention.safe_index('C', jamo)
        # Vowels take precedence over consonants, matching the table layout.
        if vowel_pos >= 0:
            symbols.append(convention.VSymbol[vowel_pos])
        elif consonant_pos >= 0:
            symbols.append(convention.CSymbol[consonant_pos])

    return ''.join(symbols) if str_return else symbols
349
+
350
+
351
def palatalize(word: Word) -> str:
    """Apply palatalization (구개음화): a coda ㄷ/ㅌ becomes ㅈ/ㅊ before ㅣ."""
    coronal_to_palatal = {
        'ㄷ': 'ㅈ',
        'ㅌ': 'ㅊ'
    }
    # Re-derive each syllable's jamos so codas line up with syllable breaks.
    syllables = [word.to_jamo(syl) for syl in word.hangul]

    for i, syllable in enumerate(syllables):
        try:
            if syllables[i + 1][0] == 'ㅣ':
                coda = syllable[-1]
                syllables[i] = syllable[:-1] + coronal_to_palatal.get(coda, coda)
        except IndexError:
            # Last syllable, or an empty syllable string: nothing to do.
            continue

    return ''.join(syllables)
369
+
370
+
371
def aspirate(word: Word) -> str:
    """Apply aspiration (격음화, e.g. 북한 -> 부칸) via the aspiration table."""
    return CT_aspiration.sub(word.jamo)


def assimilate(word: Word) -> str:
    """Apply assimilation (음운동화) via the assimilation table."""
    return CT_assimilation.sub(word.jamo)


def pot(word: Word) -> str:
    """Apply post-obstruent tensification via the tensification table."""
    return CT_tensification.sub(word.jamo)
381
+
382
+
383
def neutralize(word: Word) -> str:
    """Neutralize codas: apply the neutralization table to every jamo that is
    word-final or followed by a consonant.
    """
    new_jamos = list(word.jamo)
    for i, jamo in enumerate(new_jamos):
        # NOTE(review): assumes word.cv is as long as word.jamo; mark_CV skips
        # unclassified characters — verify for inputs with non-jamo content.
        if i == len(new_jamos) - 1 or word.cv[i + 1] == 'C':
            new_jamos[i] = CT_neutral.apply(jamo)
    return ''.join(new_jamos)
389
+
390
+
391
def delete_h(word: Word) -> str:
    """Delete ㅎ between sonorants (intersonorant H-deletion).

    Mutates ``word.jamo`` in place for every deletion and returns the final
    jamo string. Word-initial and word-final ㅎ are never deleted.
    """
    h_locations = get_substring_ind(string=word.jamo, pattern='ㅎ')

    # Iterate right-to-left so earlier indices stay valid after each deletion.
    for h_location in reversed(h_locations):
        if h_location == 0 or h_location == len(word.jamo) - 1:
            # a word-initial (or word-final) h cannot undergo deletion
            continue
        preceding = word.jamo[h_location - 1]
        succeeding = word.jamo[h_location + 1]
        if preceding in SONORANTS and succeeding in SONORANTS:
            word.jamo = word.jamo[:h_location] + word.jamo[h_location + 1:]
    return word.jamo
403
+
404
+
405
def simplify_coda(input_word: Word, word_final: bool = False) -> Word:
    """Simplify consonant clusters (자음군단순화), mutating *input_word*.

    Repeatedly collapses the leftmost VCCC cluster, then simplifies a
    word-final CC. Assigning ``input_word.jamo`` recomputes its cv skeleton,
    which drives the loop. NOTE: *word_final* is accepted but never read.
    """
    def simplify(jamo: str, loc: int) -> str:
        # coda cluster simplification: replace the two-jamo cluster that
        # starts right after position *loc* with its single-jamo form.
        list_jamo = list(jamo)
        before = ''.join(list_jamo[:loc + 1])
        double_coda = ''.join(list_jamo[loc + 1:loc + 3])
        after = ''.join(list_jamo[loc + 3:])

        converted = CT_double_codas.apply(text=double_coda, find_in='_separated')
        return before + converted + after

    while True:
        double_coda_loc = get_substring_ind(input_word.cv, 'VCCC')  # get all CCC location
        if len(double_coda_loc) == 0:
            break  # if no, exit while-loop

        cc = double_coda_loc[0]  # work on the leftest CCC
        new_jamo = simplify(input_word.jamo, cc)
        input_word.jamo = new_jamo

    # additionally, simplify word-final consonant cluster
    final_CC = get_substring_ind(input_word.cv, 'CC$')
    if len(final_CC) > 0:
        cc = final_CC[0] - 1
        new_jamo = simplify(input_word.jamo, cc)
        input_word.jamo = new_jamo
    return input_word
433
+
434
+
435
def non_coronalize(input_word: Word) -> str:
    """Optional place assimilation: a non-initial ㅁ/ㄴ takes the place of a
    following velar (-> ㅇ) or bilabial (-> ㅁ)."""
    velar_codes = 'ㄱㅋㄲ'
    bilabial_codes = 'ㅂㅍㅃㅁ'
    nasal_codes = 'ㅁㄴ'

    jamo = input_word.jamo
    result = list(jamo)
    for i, current in enumerate(jamo[:-1]):
        # Word-initial nasals and non-nasals are left untouched.
        if i == 0 or current not in nasal_codes:
            continue
        following = jamo[i + 1]
        if following in velar_codes:
            result[i] = 'ㅇ'
        elif following in bilabial_codes:
            result[i] = 'ㅁ'
    return ''.join(result)
450
+
451
+
452
def inter_v(symbols: List[str]) -> List[str]:
    """Voice plain obstruents between sonorants (p->b, t->d, k->ɡ, tɕ->dʑ).

    Also voices a following ɕ to ʑ when the obstruent precedes it. Operates on
    IPA symbols; requires initialize_conversion_tables() to have populated
    SONORANTS.
    """
    voicing_table = {
        'p': 'b',
        't': 'd',
        'k': 'ɡ',
        'tɕ': 'dʑ'
    }
    # IPA realizations of the sonorant inventory.
    ipa_sonorants = [transcribe(s, str_return=True) for s in SONORANTS]

    res = list(symbols)

    # Word-initial and word-final symbols never voice (index 0 skipped; the
    # slice stops before the last symbol).
    for index, symbol in enumerate(symbols[:-1]):
        if index == 0 or symbol not in voicing_table.keys():
            continue
        preceding = symbols[index - 1]
        succeeding = symbols[index + 1]

        if preceding in ipa_sonorants:
            if succeeding in ipa_sonorants:
                res[index] = voicing_table.get(symbol, symbol)
            elif succeeding == 'ɕ':
                # Voice the obstruent and the fricative together.
                res[index] = voicing_table.get(symbol, symbol)
                res[index + 1] = 'ʑ'

    return res
477
+
478
+
479
def alternate_lr(symbols: List[str]) -> List[str]:
    """Realize intervocalic /l/ as the tap [ɾ] in a list of IPA symbols."""
    ipa_vowels = [transcribe(v, str_return=True) for v in VOWELS]

    result = list(symbols)
    l_positions = [i for i, s in enumerate(symbols) if s == 'l']

    for pos in reversed(l_positions):
        # Word-edge l never alternates.
        if pos == 0 or pos == len(symbols) - 1:
            continue
        if symbols[pos - 1] in ipa_vowels and symbols[pos + 1] in ipa_vowels:
            result[pos] = 'ɾ'

    return result
496
+
497
+
498
def apply_rules(word: Word, rules_to_apply: str = 'pastcnhovr') -> Word:
    """Apply the selected phonological rules to *word*, mutating ``word.jamo``.

    Each letter in *rules_to_apply* enables one rule (see the list below);
    'v' and 'r' are phonetic rules handled later by apply_phonetics().
    """
    # Kinds and ordering of the rules:
    # (P)alatalization: 구개음화 (맏이 -> 마지)
    # (A)spiration: 격음화 (북한 -> 부칸)
    # a(S)similation: 음운동화
    # (T)ensification: 표준발음법 제23항(예외없는 경음화) 적용
    # (C)omplex coda simplification: 자음군단순화 (닭도 -> 닭도, 닭 -> 닭)
    # coda (N)eutralization: 음절말 장애음 중화 (빛/빚/빗 -> 빝)
    # intersonorant (H)-deletion: 공명음 사이 'ㅎ' 삭제
    # intersonorant Obstruent (V)oicing: 공명음 사이 장애음 유성음화

    # apply palatalization (cheap substring check gates the full pass)
    if 'p' in rules_to_apply and ('ㄷㅣ' in word.jamo or 'ㅌㅣ' in word.jamo):
        word.jamo = palatalize(word)

    # apply aspiration
    if 'a' in rules_to_apply and 'ㅎ' in word.jamo:
        word.jamo = aspirate(word)

    # apply place assimilation
    if 's' in rules_to_apply:
        word.jamo = assimilate(word)

    # apply post-obstruent tensification
    if 't' in rules_to_apply and any(jm in word.jamo for jm in OBSTRUENTS):
        word.jamo = pot(word)

    # apply complex coda simplification
    if 'c' in rules_to_apply:
        word = simplify_coda(word)

    # apply coda neutralization
    if 'n' in rules_to_apply:
        word.jamo = neutralize(word)

    # apply intersonorant H-deletion (only non-edge ㅎ qualifies)
    if 'h' in rules_to_apply and 'ㅎ' in word.jamo[1:-1]:
        word.jamo = delete_h(word)

    # apply (optional) non-coronalization
    if 'o' in rules_to_apply:
        word.jamo = non_coronalize(word)

    return word
542
+
543
+
544
def apply_phonetics(ipa_symbols: List[str], rules_to_apply: str) -> List[str]:
    """Run the post-transcription phonetic rules: (v)oicing and l/ɾ alternation."""
    result = ipa_symbols
    if 'v' in rules_to_apply:
        result = inter_v(result)
    # The alternation pass only matters when an 'l' is actually present.
    if 'r' in rules_to_apply and 'l' in result:
        result = alternate_lr(result)
    return result
550
+
551
+
552
+ # ----------------------------
553
+ # IPA to Pinyin Conversion
554
+ # ----------------------------
555
+
556
def ipa_to_pinyin(ipa: str) -> str:
    """Best-effort mapping of an IPA string onto pinyin-like spellings.

    Longer IPA symbols are matched before shorter ones; any symbol absent from
    the table is left unchanged in the output.
    """
    ipa_to_pinyin_dict = {
        # Consonants
        'p': 'b', 'pʰ': 'p', 'm': 'm', 'f': 'f',
        't': 'd', 'tʰ': 't', 'n': 'n', 'l': 'l',
        'k': 'g', 'kʰ': 'k', 'x': 'h', 'h': 'h', 'ɣ': 'e', 'χ': 'h', 'ʁ': 'ʁ', 'ħ': 'haʰoʰ', 'ʕ': 'haʰo', 'ɦ': 'aʰ',
        'tɕ': 'j', 'tɕʰ': 'q', 'ɕ': 'x', 't͡ɕ': 'j', 't͡ɕʰ': 'q',
        'tʂ': 'zh', 'tʂʰ': 'ch', 'ʂ': 'sh', 'ɻ': 'r', 'ʐ': 'r', 't͡s': 'z', 't͡sʰ': 'c', 'ʈ͡ʂ': 'zh', 'ʈ͡ʂʰ': 'ch',
        'ts': 'z', 'tsʰ': 'c', 's': 's', 'd͡z': 'zi', 'dz': 'zi',
        'ŋ': 'ng', 'ɲ': 'ni', 'ɲ̟': 'ni',
        'ʔ': 'ʔ',
        'ɉ': 'i',
        'w': 'u', 'ɥ': 'ü',
        'j': 'i', 'ç': 'xi', 'd͡ʑ': 'ji', 'dʑ': 'ji',

        # Syllabic Consonants
        'm̩': 'm', 'm̥': 'hm',
        'n̩': 'n', 'ŋ̍': 'ng', 'ŋ̊': 'hng',
        'ɹ̩': 'i', 'ɻ̩': 'ri',

        # Vowels
        'i': 'i', 'u': 'u', 'y': 'ü', 'u˞': 'ur',
        'ai': 'a', 'ä': 'a', 'ɑ': 'ao', 'e̞': 'ie', 'ə': 'en', 'a̠': 'a',
        'o': 'o', 'ɔ': 'ao', 'o̞': 'o', 'o̞˞': 'or',
        'ɤ': 'e', 'ɛ': 'i', 'e': 'ie', 'œ': 'ue', 'o̜': 'o',
        'ɵ': 'ou', 'ʊ': 'ong', 'ʊ̃˞': 'ongr', 'ɤ˞': 'e', 'ɤ̞˞': 'eng', 'ɤ˞˞': 'er',
        'ɚ': 'r', 'ɐ': 'i', 'ɚ̃': 'ngr', 'ʌ̹': 'ao',
        'i̞': 'ie',

        # Diphthongs and Triphthongs
        'ja': 'ia', 'wa': 'ua',
        'jo': 'io', 'wo': 'uo',
        'jɛ': 'ie', 'ɥɛ': 'üe',
        'aɪ': 'ai', 'waɪ': 'uai', 'ai̯': 'ai',
        'eɪ': 'ei', 'weɪ': 'ui', 'ei̯': 'ei',
        'ɑʊ': 'ao', 'jɑʊ': 'iao', 'ɑu̯': 'ao', 'ɑu̯˞': 'aor',
        'oʊ': 'ou', 'joʊ': 'iu', 'ou̯': 'iu', 'ou̯˞': 'our',

        # R-colored vowels and combinations
        'äɚ̯': 'r', 'ä̃ɚ̯̃': 'angr', 'ɐɚ̯': 'yanr',

        'an': 'an', 'jɛn': 'ian', 'wan': 'uan', 'ɥæn': 'üan',
        'ən': 'en', 'in': 'in', 'wən': 'un', 'yn': 'ün',
        'ɑŋ': 'ang', 'jɑŋ': 'iang', 'wɑŋ': 'uang',
        'ɤŋ': 'eng', 'iŋ': 'ing', 'wɤŋ': 'ueng',
        'ʊŋ': 'ong', 'jʊŋ': 'iong',
        'ɚ̃': 'a',

        # Tones
        '˥˥': '55', '˧˥': '35', '˨˩˦': '214', '˨˩˩': '211',
        '˩˦': '14', '˥˩': '51', '˥˧': '53',
        '˨˩': '21', '˧˩': '31', '˦˩': '41', '˩˩': '11', '˨˥': '25',
        '˧': '33', '˩˧': '13', '˨˧': '23', '˨': '22',

        # Neutral Tone
        'k˥': '5', 'k˧': '3', 'k˨': '2', '˥': '55',
    }

    # Sort the keys by length in descending order to match longer patterns first
    sorted_ipa_symbols = sorted(ipa_to_pinyin_dict.keys(), key=lambda x: len(x), reverse=True)
    # Create a regex pattern to match any of the IPA symbols
    pattern = '|'.join(re.escape(symbol) for symbol in sorted_ipa_symbols)
    ipa_regex = re.compile(pattern)

    def replace_match(match):
        # Fall back to the matched text itself (defensive; keys always hit).
        return ipa_to_pinyin_dict.get(match.group(0), match.group(0))

    return ipa_regex.sub(replace_match, ipa)
624
+
625
+
626
+ # ----------------------------
627
+ # Worker Functions
628
+ # ----------------------------
629
+
630
def transcription_convention(convention: str, tables_dir: Path) -> ConversionTable:
    """Load the ConversionTable for *convention* ('ipa', 'yale', or 'park')."""
    normalized = convention.lower()
    if normalized not in ('ipa', 'yale', 'park'):
        raise ValueError(f"您的输入 {normalized} 不被支持。")
    return ConversionTable(normalized, tables_dir)
636
+
637
+
638
def sanitize(word: str, tables_dir: Path) -> str:
    """Convert all hanja 漢字 in *word* to hangul and drop internal spaces."""
    if len(word) < 1:  # empty input needs no sanitizing
        return word

    spaceless = word.replace(' ', '')

    # \p{Han} requires the third-party 'regex' module imported as re above.
    hanja_positions = [m.start() for m in re.finditer(r'\p{Han}', spaceless)]
    if len(hanja_positions) == 0:  # nothing to convert
        return spaceless

    return hanja_cleaner(spaceless, hanja_positions, tables_dir)
654
+
655
+
656
def convert(hangul: str,
            rules_to_apply: str = 'pastcnhovr',
            convention: str = 'ipa',
            sep: str = '',
            tables_dir: Path = Path('tables')) -> Dict[str, str]:
    """Convert one hangul word to a transcription plus a pinyin rendering.

    Returns a dict with keys "ipa" and "pinyin" (both empty for empty input);
    *sep* joins the individual transcription symbols.
    """
    # the main function for IPA and Pinyin conversion

    if len(hangul) < 1:  # if no content, then return no content
        return {"ipa": "", "pinyin": ""}

    # prepare
    rules_to_apply = rules_to_apply.lower()
    # NOTE: this local CT_convention shadows the module-level global of the
    # same name; the rule helpers keep using the global set by
    # initialize_conversion_tables().
    CT_convention = transcription_convention(convention, tables_dir)
    hangul = sanitize(hangul, tables_dir)

    word = Word(hangul=hangul, tables_dir=tables_dir)

    # resolve word-final consonant clusters right off the bat
    simplify_coda(word)

    # apply rules
    word = apply_rules(word, rules_to_apply)

    # high mid/back vowel merger after bilabial (only for the Yale convention)
    if CT_convention.name == 'yale' and 'u' in rules_to_apply:
        bilabials = list("ㅂㅃㅍㅁ")
        applied = list(word.jamo)
        for i, jamo in enumerate(word.jamo[:-1]):
            if jamo in bilabials and word.jamo[i + 1] == "ㅜ":
                applied[i + 1] = "ㅡ"
        word.jamo = ''.join(applied)

    # convert to IPA or Yale
    transcribed = transcribe(word.jamo, CT_convention)

    # apply phonetic rules (IPA only)
    if CT_convention.name == 'ipa':
        transcribed = apply_phonetics(transcribed, rules_to_apply)

    ipa_result = sep.join(transcribed)

    # Convert IPA to Pinyin
    pinyin_result = ipa_to_pinyin(ipa_result)

    return {"ipa": ipa_result, "pinyin": pinyin_result}
701
+
702
+
703
def convert_many(long_content: str,
                 rules_to_apply: str = 'pastcnhovr',
                 convention: str = 'ipa',
                 sep: str = '',
                 tables_dir: Path = Path('tables'),
                 output_mode: str = 'both') -> str:
    """
    Convert many words from base64-encoded text content.

    output_mode: 'ipa', 'pinyin', or 'both' (any other value behaves as 'both').
    Returns the converted lines as LaTeX-style ``\\anno{word}{reading}`` markup
    joined with ``\\\\`` line breaks. (Annotation corrected: every path returns
    a string, never an int.)
    """
    # decode uploaded file and create a wordlist to pass to convert()
    decoded = b64decode(long_content).decode('utf-8')
    decoded = decoded.replace('\r\n', '\n').replace('\r', '\n')  # normalize line endings
    decoded = decoded.replace('\n\n', '\n')  # collapse one level of blank lines

    res = []
    # Split the decoded content into lines
    lines = decoded.split('\n')
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Tokenize into words and individual punctuation marks.
        tokens = re.findall(r'\w+|[^\w\s]', line, re.UNICODE)
        annotated_line = []
        for token in tokens:
            if re.match(r'\w+', token, re.UNICODE):
                # It's a word, convert it
                converted_r = convert(hangul=token,
                                      rules_to_apply=rules_to_apply,
                                      convention=convention,
                                      sep=sep,
                                      tables_dir=tables_dir)
                if output_mode == 'ipa':
                    anno = f'\\anno{{{token}}}{{{converted_r["ipa"]}}}'
                    annotated_line.append(anno)
                elif output_mode == 'pinyin':
                    anno = f'\\anno{{{token}}}{{{converted_r["pinyin"]}}}'
                    annotated_line.append(anno)
                elif output_mode == 'both':
                    anno_ipa = f'\\anno{{{token}}}{{{converted_r["ipa"]}}}'
                    anno_pinyin = f'\\anno{{{token}}}{{{converted_r["pinyin"]}}}'
                    annotated_line.append(anno_ipa)
                    annotated_line.append(anno_pinyin)
                else:
                    # Default to both
                    anno_ipa = f'\\anno{{{token}}}{{{converted_r["ipa"]}}}'
                    anno_pinyin = f'\\anno{{{token}}}{{{converted_r["pinyin"]}}}'
                    annotated_line.append(anno_ipa)
                    annotated_line.append(anno_pinyin)
            else:
                # It's punctuation or space, retain as-is
                annotated_line.append(token)
        # Join the annotated tokens into a line
        res.append(' '.join(annotated_line))
    # Replace newline with double backslash
    final_output = '\\\\\n'.join(res)
    return final_output
760
+
761
+
762
+ # ----------------------------
763
+ # Initialization
764
+ # ----------------------------
765
+
766
+
767
def hangul2ipa(input_text: str) -> str:
    """Transcribe whitespace-separated Korean text into IPA.

    Loads the bundled ko_tables on every call, then converts each chunk with
    the full rule set and rejoins the results with spaces.
    """
    tables = Path(f'{os.path.dirname(os.path.dirname(__file__))}/thirdparty/ko_tables')
    initialize_conversion_tables(tables)
    transcriptions = [
        convert(hangul=chunk,
                rules_to_apply='pastcnhovr',
                convention="ipa",
                sep='',
                tables_dir=tables)["ipa"]
        for chunk in input_text.split()
    ]
    return " ".join(transcriptions)
779
+
780
+
781
if __name__ == "__main__":
    # Smoke test: transcribe a sample greeting when run as a script.
    input_text = "안녕하세요"
    print(hangul2ipa(input_text))