phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,212 @@
1
+ ### ----------------------------------------------------------------------------------------
2
+ ### 한국어 발음규칙 (Korean G2P Rulebook)
3
+ ### Last updated: 2019-01-31
4
+ ### Yejin Cho (ycho@utexas.edu)
5
+ ### 처리순서:
6
+ ### 예외처리 - 유기음화(겹받침) - 겹받침관련규칙 - 경음화 - 겹받침단순화 - 비음화 - 리을 재음절화
7
+ ### - 유음화 - 구개음화 - 유기음화(홑받침) - 연음 - 종성중화 - 리을 재음절화 [종료]
8
+ ### ----------------------------------------------------------------------------------------
9
+ ### 예외처리
10
+ ii,ll,[#-]y([aeoquv]), ii,ll,rr,y\1, # 일 연대, 삼십일여간
11
+ (h0,aa|t0,xx),ll,-ii,ll, \1,ll,rr,ii,ll, # 들일, 볼일, 할일
12
+ s0,vv,ll,-ii,kf, s0,vv,ll,rr,ii,kf, # 설익(다)
13
+ mm,uu,ll,-k0,oo,-k0,ii, mm,uu,ll,kk,oo,k0,ii, # 물고기
14
+ s0,ii,ll,-s0,ii,ll s0,ii,ll,s0,ii,ll # 실실
15
+ k0,ii,-s0,xx,lk, k0,ii,s0,xx,kf, # 기슭
16
+ c0,vv,ll,-ya,kf, c0,vv,rr,ya,kf, # 절약
17
+ k0,xx,mf,-yo,-ii,ll, k0,xx,-mm,yo,-ii,ll, # 금요일
18
+ lt,-ii, ll,-ch,ii, # 훑이
19
+ (?<=nn,vv,)lb,(?=-(c0,(uu|vv),kf|t0,(uu|vv),ng)) pf, # 넓죽/넓둥글다
20
+ (?<=s0,ii,)lh,-c0,(?=xx,ng) ll,cc, # 싫증
21
+ t0,aa,lk, t0,aa,kf, # 닭
22
+ (wq|we|oo),nf,-k0,aa,c0, \1,nf,k0,aa,tf, # 온갖
23
+ mm,aa,tf,-h0,yv,ng, mm,aa,th,yv,ng, # 맏형
24
+ k0,vv,th,-oo,s0, k0,vv,t0,oo,tf, # 겉옷
25
+ c0,uu,ll,-nn,vv,mf,-k0,ii, c0,uu,ll,rr,vv,mf,-kk,ii, # 줄넘기
26
+ h0,oo,th,-ii,-p0,uu,ll, h0,oo,nf,nn,ii,p0,uu,ll, # 홑이불
27
+ s0,aa,ks,-ii,ll, s0,aa,ng,nn,ii,ll, # 삯일
28
+ mm,qq,nf,-ii,pf, mm,qq,nf,nn,ii,pf, # 맨입
29
+ kk,oo,ch,-ii,ph, kk,oo,nf,nn,ii,pf, # 꽃잎
30
+ nn,qq,-p0,oo,kf,-ya,kf, nn,qq,p0,oo,ng,nn,ya,kf, # 내복약
31
+ h0,aa,nf,-yv,-rr,xx,mf, h0,aa,nf,nn,yv,rr,xx,mf, # 한여름
32
+ nn,aa,mf,-c0,oo,nf,-yv,-p0,ii, nn,aa,mf,c0,oo,nf,nn,yv,p0,ii, # 남존여비
33
+ s0,ii,nf,-yv,-s0,vv,ng, s0,ii,nf,nn,yv,s0,vv,ng, # 신여성
34
+ s0,qq,kf,-yv,nf,-ph,ii,ll, s0,qq,ng,nn,yv,nf,ph,ii,ll, # 색연필
35
+ t0,aa,mf,-yo, t0,aa,mf,nn,yo, # 담요
36
+ nn,uu,nf,-yo,-k0,ii, nn,uu,nf,nn,yo,k0,ii, # 눈요기
37
+ vv,pf,-yo,ng, vv,mf,nn,yo,ng, # (영)업용
38
+ s0,ii,kf,-yo,ng,-yu, s0,ii,k0,yo,ng,nn,yu, # 식용유
39
+ nf,-yu,nf,-rr,ii, nf,nn,yu,ll,rr,ii, # (국민)윤리
40
+ (c0|s0),(aa|oo|uu),ll,-ii,(ph|p0|pf), \1,\2,ll,rr,ii,pf, # 잘입다, 솔잎, 술잎
41
+ (?=(^|#))h0,aa,nf,-ii,ll, h0,aa,nf,nn,ii,ll, # 한일
42
+ (?=(^|#))mm,aa,kf,-ii,ll, mm,aa,ng,nn,ii,ll, # 막일
43
+ mm,oo,ll,-s0,aa,ng,-s0,ii,kf, mm,oo,ll,ss,aa,ng,s0,ii,kf, # 몰상식
44
+ oo,s0,#ii,pf, oo,nf,nn,ii,pf, # 옷입(다)
45
+ (nf|ll),-yv,-s0,vv,-s0, \1,nn,yv,s0,vv,tf, # (스물/서른)여섯
46
+ (ng|mf|nf),-y([aeoquv]), \1,nn,y\2, # 밤윷, 직행열차, 콩엿, 볶은엿
47
+ (wv|ii),ll,-y([aeoquv]), \1,rr,y\2, # 일/월요일
48
+ ll,-y([aeoquv]), ll,rr,y\1, # 불여우, 물약, 서울역, 물엿, 물옆, 굴옆, 휘발유, 유들유들
49
+ ii,ll,-c0,vv,ll, ii,ll,cc,vv,ll, # 일절
50
+ (th|tf|s0),-y([aeoquv]), nf,-nn,y\2, # 쑥갓요
51
+ (<=^|#)mm,aa,kf,-ii,ll mm,aa,ng,nn,ii,ll # 막일
52
+ k0,uu,-k0,xx,nf,-rr,yu, k0,uu,k0,xx,nf,nn,yu, # 구근류
53
+ k0,aa,ll,-([ct])0,xx,ng, k0,aa,ll,\1\1,xx,ng, # 갈등/갈증
54
+ p0,aa,ll,-t0,oo,ng, p0,aa,ll,tt,oo,ng, # 발동
55
+ c0,vv,ll,-t0,oo, c0,vv,ll,tt,oo, # 절도
56
+ mm,aa,ll,-s0,aa,ll, mm,aa,ll,ss,aa,ll, # 말살
57
+ p0,uu,ll,-s0, p0,uu,ll,ss, # 불소/불세출
58
+ ii,ll,-s0,ii, ii,ll,ss,ii, # 일시
59
+ p0,aa,ll,-c0,vv,nf, p0,aa,ll,cc,vv,nf, # 발전
60
+ (?<=(s0,ii,nf,|s0,aa,mf,)-)(c|k|t)0, \2\2, # 신고, 신다, 신자, 삼고, 삼다, 삼자
61
+ (?<=k0,ii,mf,-)p0, pp, # 김밥
62
+ (?<=t0,vv,-t0,xx,mf,-)c0, cc, # 더듬지
63
+ c0,aa,mf,-c0,aa,-rr,ii, c0,aa,mf,cc,aa,rr,ii, # 잠자리
64
+ (?<=(ng|ll),-)c0,(?=uu,ll,-k0,ii) cc, # 물줄기, 강줄기
65
+ (?<=(nf|ll),-)p0,vv,pf, pp,vv,pf, # 문법, 불법
66
+ (?<=(nf|tf),-)p0,(?=aa,-rr,aa,mf) pp, # 신바람, 늦바람
67
+ p0,aa,-rr,aa,mf,-k0,yv,ll, p0,aa,rr,aa,mf,kk,yv,ll, # 바람결
68
+ (?<=(mf|kf),-)p0,(?=aa,pf,) pp, # 아침밥, 점심밥, 저녁밥
69
+ (?<=nn,uu,nf,-)t0, tt, # 눈동자, 눈대중
70
+ mm,aa,kf,-yv,mf, mm,aa,ng,nn,yv,mf, # 늑막염, 결막염
71
+ p0,aa,lb,-(t|k)0, p0,aa,pf,\1\1, # 밟다, 밟고
72
+ p0,aa,lb,-nn, p0,aa,mf,nn, # 밟는
73
+ nn,vv,lb,-(t|k)0, nn,vv,ll,\1\1, # 넓다, 넓고
74
+ mm,(aa|vv),s0,-ii,ss,-t0,aa, mm,\1,t0,ii,tf,tt,aa, # 맛있다
75
+ mm,(aa|vv),s0,-vv,ps,-t0,aa, mm,\1,t0,vv,pf,tt,aa, # 맛없다
76
+ c0,vv,c0,-vv,-mm,ii, c0,vv,t0,vv,mm,ii, # 젖어미
77
+ h0,vv,s0,-uu,s0,-xx,mf, h0,vv,t0,uu,s0,xx,mf, # 헛웃음
78
+ k0,aa,ps,-vv,-ch,ii, k0,aa,p0,vv,ch,ii, # 값어치
79
+ k0,aa,ps,-ii,ss,-nn,xx,nf, k0,aa,p0,ii,nf,nn,xx,nf, # 값있는
80
+ c0,vv,lm,-c0,ii, c0,vv,mf,cc,ii, # 젊지
81
+ oo,lm,-k0,(?=[iy]) oo,mf,k0, # 옮기(다)
82
+ k0,uu,lm,-k0,ii,-t0,aa, k0,uu,mf,k0,ii,t0,aa, # 굶기다
83
+ (nn|k0|h0),aa,ll,-(p|s|c|k|t)0, \1,aa,ll,\2\2, # 갈바, 할바, 만날것
84
+ ch,vv,s0,-ii,nf, ch,vv,t0,ii,nf, # 첫인(상)
85
+ (?<=(mf|nf),-)ii,-p0,uu,ll, nn,ii,p0,uu,ll, # 솜이불
86
+ (?<=(nf|ll),-)k0,oo,-rr,ii, kk,oo,rr,ii, # 문고리
87
+ (?<=(nf|ll),-)s0,qq, ss,qq, # 산새, 들새
88
+ (?<=(nf|ll),-)c0,qq,-c0,uu, cc,qq,c0,uu, # 손재주, 글재주
89
+ k0,ii,ll,-k0,aa, k0,ii,ll,kk,aa, # 길가
90
+ mm,uu,ll,-t0,oo,ng,-ii, mm,uu,ll,tt,oo,ng,ii, # 물동이
91
+ mm,uu,ll,-c0, mm,uu,ll,-cc, # 물증
92
+ (?<=(nf|ll),-)p0,aa,-t0,aa,kf, pp,aa,t0,aa,kf, # 발바닥, 손바닥
93
+ (?<=(nf|ll),-)s0,oo,kf, ss,oo,kf, # 굴속, 물속
94
+ (?<=s0,uu,ll,-)(c|p|t)0, \1\1, # 술잔, 술독, 술병, 술자리
95
+ k0,aa,ng,-k0,aa, k0,aa,ng,kk,aa, # 강가
96
+ (?<=(ng|mf),-)t0,aa,ll, tt,aa,ll, # 초승달
97
+ t0,xx,ng,-p0,uu,ll, t0,xx,ng,pp,uu,ll, # 등불
98
+ ch,aa,ng,-s0,aa,ll, ch,aa,ng,ss,aa,ll, # 창살
99
+ (?<=(ll|ng),-)c0,uu,ll,-k0,ii, k0,aa,ng,cc,uu,ll,k0,ii, # 강줄기, 물줄기
100
+ aa,nf,-k0,oo, aa,nf,kk,oo, # 안고
101
+ (?<=kk,yv,-aa,nf,-)(t|c)0, \1\1, # 껴안지, 껴안다
102
+ ii,-c0,uu,kf,-ii,-c0,uu,kf, ii,c0,uu,ng,nn,ii,c0,uu,kf, # 이죽이죽
103
+ ya,-k0,xx,mf,-ya,-k0,xx,mf, ya,k0,xx,mf,nn,ya,k0,xx,mf, # 야금야금
104
+ p0,ee,-k0,qq,s0,-ii,s0, p0,ee,k0,qq,nf,nn,ii,tf, # 베갯잇
105
+ kk,qq,s0,-ii,ph, kk,qq,nf,nn,ii,pf, # 깻잎
106
+ nn,aa,-mm,uu,s0,-ii,ph, nn,aa,mm,uu,nf,nn,ii,pf, # 나뭇잎
107
+ qq,s0,-yv,ll, qq,nf,nn,yv,ll, # 도리깻열
108
+ t0,wi,s0,-(?=[aeqiouyvwx]) t0,wi,nf,-nn, # 뒷윷, 뒷얘기
109
+ nn,xx,c0,-yv,-rr,xx,mf, nn,xx,tf,nn,yv,rr,xx,mf, # 늦여름
110
+ t0,ii,-k0,xx,tf,-(ii|xx|ee), t0,ii,k0,xx,s0,\1, # 디귿에, 디귿이
111
+ (c0|ch|th|h0),ii,-xx,(c0|ch|th|h0),-(ii|xx|ee), \1,ii,xx,s0,\3, # 치읓이, 지읒에
112
+ ph,ii,-xx,ph,-(ii|xx|ee), ph,ii,xx,p0,\1, # 피읖에
113
+ kh,ii,-xx,kh,-(ii|xx|ee), kh,ii,xx,k0,\1, # 키읔이
114
+ ### 유기음화 (겹받침)
115
+ l(b|p),-h0, ll,-ph,
116
+ nh,-(c|k|t)0, nf,-\1h,
117
+ lh,-(c|k|t)0, ll,-\1h,
118
+ lk,-h0, ll,-kh,
119
+ nc,-h0, nf,-ch,
120
+ ### 겹받침 규칙 (ㄹㄱ)
121
+ (k0,aa,|k0,uu,|k0,vv,|oo,|p0,aa,|nn,aa,|nn,xx,|p0,uu,|^ii,|-,ii,mm,aa,|mm,uu,|(^|-,)vv,)lk,-(t0|c0|s0), \1kf,-\3,
122
+ (k0,aa,|k0,uu,|k0,vv,|vv,|oo,|mm,aa,|p0,aa,|nn,aa,|nn,xx,|mm,uu,|p0,uu,|^ii,|-,ii,)lk,-k0, \1ll,-kk,
123
+ ### 겹받침 규칙 (ㄴㅎ)
124
+ nh,-(k|t|c)0, nf,-\1h,
125
+ nh,-s0, nf,-ss,
126
+ nh,-nn, nf,-nn,
127
+ nh,-(?=[aeqiouyvwx]) -nn,
128
+ ### 겹받침 규칙 (ㄹㅎ)
129
+ lh,-nn, ll,-rr,
130
+ lh,-(k|t|c)0, ll,-\1h,
131
+ lh,-s0, ll,-ss,
132
+ lh,-(?=[aeqiouyvwx]) -rr,
133
+ ### 겹받침 규칙 (ㄴㅈ)
134
+ nc,-([ktsc])0, nf,-\1\1,
135
+ ### 겹받침 규칙 (ㄹㅁ)
136
+ (c0,vv,|c0,ii,|k0,uu,|t0,aa,|(^|-,)oo,|k0,oo,)lm,-([ktsc])0, \1mf,-\3\3,
137
+ ### 겹받침 규칙 (ㄹㅂ)
138
+ (p0,aa,|tt,vv,|(^|-,)yv,|nn,vv,|(^|-,)ya,|cc,aa,)lb,-([ktsc])0, \1ll,-\4\4,
139
+ ### 겹받침 규칙 (ㄹㅌ)
140
+ h0,(aa|uu),lt,-nn, h0,\1,ll,-ll,
141
+ h0,(aa|uu),lt,-([ktsc])0, h0,\1,ll,-\2\2,
142
+ ### 경음화
143
+ lk,-(c|k|p|s|t)0, kf,-\1\1,
144
+ l(b|p),-p0, pf,-pp,
145
+ s0,-p0, tf,-pp,
146
+ l(b|t),-(c|k|s|t|p)0, ll,-\2\2,
147
+ lp,-(c|k|s|t)0, pf,-\1\1,
148
+ (c[h0]|s[s0]|t[fh]),-(c|k|s|t)0, tf,-\2\2,
149
+ k[fhks],-(c|k|p|s|t)0, kf,-\1\1,
150
+ p[sfh],-(c|k|p|s|t)0, pf,-\1\1,
151
+ (?<=(kf|kh|ks|ss|c0|ch|tf|th),-)p0, pp,
152
+ h0,-s0, -ss,
153
+ nh,-s0, nf,-ss,
154
+ lh,-s0, ll,-ss,
155
+ ### 겹받침 단순화: 어말 또는 자음 앞
156
+ (ks|lk),(?=(#|$|-[ptkshcmnr])) kf,
157
+ n[ch],(?=(#|$|-[ptkshcmnr])) nf,
158
+ l[bsth],(?=(#|$|-[ptkshcmnr])) ll,
159
+ lm,(?=(#|$|-[ptkshcmnr])) mf,
160
+ (ps|lp),(?=(#|$|-[ptkshcmnr])) pf,
161
+ ### 겹받침 단순화: 모음 앞
162
+ ([kp])s,-(?=[aeqiouyvwx]) \1f,-ss,
163
+ ls,-(?=[aeqiouyvwx]) ll,-ss,
164
+ nc,-(?=[aeqiouyvwx]) nf,-c0,
165
+ lk,-(?=[aeqiouyvwx]) ll,-k0,
166
+ lm,-(?=[aeqiouyvwx]) ll,-mm,
167
+ lb,-(?=[aeqiouyvwx]) ll,-p0,
168
+ l([tp]),-(?=[aeqiouyvwx]) ll,-\1h,
169
+ ### 비음화
170
+ (?<=[pk])0,-rr, f,-nn,
171
+ (c0|ch|s0|ss|tf|nh|h0),-nn, nf,-nn,
172
+ nc,-(p|t|k)0, nf,-\1\1,
173
+ nc,(?=-[ptkshcmnr]) nf,
174
+ lm,-k0, mf,-kk,
175
+ lm,(?=-[ptkshcmnr]) mf,
176
+ k[fhks],(?=-(nn|mm),) ng,
177
+ lk,(?=-(nn|mm),) ng,
178
+ p[sfh],(?=-(nn|mm),) mf,
179
+ l[bp],(?=-(nn|mm),) mf,
180
+ (?<=(mf|ng|pf|kf),-)rr, nn,
181
+ (c0|ch|s0|ss|tf|nh|h0),(?=-mm,) nf,
182
+ ### 리을 재음절화
183
+ ll,-(?=y) -rr,
184
+ ### 유음화
185
+ (nf|ll),-rr, ll,-rr,
186
+ l[lht],-nn, ll,-rr,
187
+ ### 구개음화
188
+ tf,-(?=[iy]) -c0,
189
+ th,-(?=[iy]) -ch,
190
+ tf,-h0,(?=[iy]) -ch,
191
+ ### 유기음화 (홑받침)
192
+ (p|k)f,-h0, -\1h,
193
+ h0,-(c|k|t)0, -\1h,
194
+ (tf|th|s0),(-|#)h0, -th,
195
+ ### 연음규칙
196
+ (s0|ss|kk|p0|ph|pp|t0|th|tt|c0|ch|kh|kk|k0|mm|nn),-(?=[aeqiouyvwx]) -\1,
197
+ nh,-(?=[aeqiouyvwx]) -nn,
198
+ (s0|ss|c0|ch|th),(?=-[ptkshcmnr]) tf,
199
+ h0,-(?=[aeqiouyvwx]) -
200
+ lh,-?(?=[aeqiouyvwx]) -rr,
201
+ (p|t|k)f,-?(?=[aeqiouyvwx]) -\g<1>0,
202
+ (m|n)f,-?(?=[aeqiouyvwx]) -\1\1,
203
+ ### 종성규칙
204
+ (s0|ss|c0|ch|th),(?=-|#|$) tf,
205
+ (kh|kk|ks|lk),(?=-|#|$|[ptkshcmnr]) kf, # (ks|lk),(?=-[ptkshcmnr]) kf,
206
+ (ph|lp|ps),(?=-|#|$|[ptkshcmnr]) pf,
207
+ (?<=[ptkshcmnr].),-(?=[aeqiouyvwx]) ,
208
+ l[bhstp],(?=-|#|$|[ptkshcmnr]) ll, # l[bt],(?=-[ptkshcmnr]) ll,
209
+ nh,(?=-|#|$|[ptkshcmnr]) nf,-
210
+ ### 리을 재음절화
211
+ (?<=[aeqiouyvwx].,)ll,-(?=[aeqiouyvwx]) -rr,
212
+ ll,-ll, ll,-rr,
@@ -0,0 +1,67 @@
1
+ from phoonnx.thirdparty.mantoq.buck import symbols
2
+ from phoonnx.thirdparty.mantoq.buck.tokenization import (arabic_to_phonemes, phon_to_id_,
3
+ phonemes_to_tokens, simplify_phonemes)
4
+ from phoonnx.thirdparty.mantoq.buck.tokenization import tokens_to_ids as _tokens_to_id
5
+ from phoonnx.thirdparty.mantoq.num2words import num2words
6
+ import warnings
7
+ from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
8
# Probe for onnxruntime once at import time; tashkeel() consults this flag
# before it tries to build a diacritizer.
try:
    import onnxruntime
except ImportError:
    _TASHKEEL_AVAILABLE = False
else:
    _TASHKEEL_AVAILABLE = True

# Lazily created TashkeelDiacritizer shared by all tashkeel() calls.
_DIACRITIZER_INST = None

# Copy of the phoneme -> id table exported by the tokenization module.
MANTOQ_SYMBOLS = dict(phon_to_id_)
# Ids of the padding and end-of-sequence tokens.
MANTOQ_SPECIAL_SYMBOLS = {
    "pad": phon_to_id_[symbols.PADDING_TOKEN],
    "eos": phon_to_id_[symbols.EOS_TOKEN],
}

# Maps Arabic-specific punctuation marks to their ASCII equivalents.
AR_SPECIAL_PUNCS_TABLE = str.maketrans("،؟؛", ",?;")
# Deletion table for the characters "+", "=", "<" and ">".
OMITTED_SYMBOLS = str.maketrans("", "", "+=<>")

# Typographic quotes are all folded to a plain double quote.
QUOTES = '“”„«»'
QUOTES_TABLE = str.maketrans(QUOTES, '"' * len(QUOTES))
# Square and curly brackets are folded to parentheses.
BRACKETS_TABLE = str.maketrans("[]{}", "()()")
30
+
31
+
32
+
33
+
34
def tashkeel(text: str) -> str:
    """Add diacritics to *text* using the shared ``TashkeelDiacritizer``.

    The diacritizer is created on first use and cached in the module-level
    ``_DIACRITIZER_INST``. When ``_TASHKEEL_AVAILABLE`` is false, a
    ``UserWarning`` is emitted and *text* is returned unchanged.
    """
    global _DIACRITIZER_INST

    if _TASHKEEL_AVAILABLE:
        # Build the diacritizer only once; later calls reuse the instance.
        if _DIACRITIZER_INST is None:
            _DIACRITIZER_INST = TashkeelDiacritizer()
        return _DIACRITIZER_INST.diacritize(text)

    warnings.warn(
        "Warning: The Tashkeel feature will not be available. Please re-install with the `libtashkeel` extra.",
        UserWarning,
    )
    return text
45
+
46
def g2p(
    text: str,
    add_tashkeel: bool = True,
    process_numbers: bool = True,
    append_eos: bool = False,
) -> tuple[str, list[str]]:
    """Normalize *text* and convert it into a list of phoneme tokens.

    Punctuation, quotes and brackets are first folded through the
    module-level translation tables. The text is then optionally
    diacritized and number-expanded before being passed through
    ``arabic_to_phonemes`` -> ``simplify_phonemes`` -> ``phonemes_to_tokens``.

    Args:
        text: Input text.
        add_tashkeel: If true, run ``tashkeel`` on the text.
        process_numbers: If true, run ``num2words`` on the text
            (after diacritization).
        append_eos: If false, the last token returned by
            ``phonemes_to_tokens`` is dropped.
            NOTE(review): presumably that token is the EOS marker --
            confirm against ``buck/tokenization.py``.

    Returns:
        A ``(normalized_text, tokens)`` tuple: the text exactly as it was
        handed to the phonemizer, and the resulting token list.
    """
    text = (
        text.translate(AR_SPECIAL_PUNCS_TABLE)
        .translate(QUOTES_TABLE)
        .translate(BRACKETS_TABLE)
    )
    if add_tashkeel:
        text = tashkeel(text)
    if process_numbers:
        text = num2words(text)

    # Snapshot the text that actually gets phonemized so callers can see it.
    normalized_text = text

    phones = simplify_phonemes(arabic_to_phonemes(text))
    tokens = phonemes_to_tokens(phones)
    if not append_eos:
        tokens = tokens[:-1]
    return normalized_text, tokens
64
+
65
+
66
def tokens2ids(tokens: list[str]) -> list[int]:
    """Convert phoneme *tokens* to ids by delegating to ``tokens_to_ids``."""
    token_ids = _tokens_to_id(tokens)
    return token_ids
File without changes