phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,212 @@
|
|
1
|
+
### ----------------------------------------------------------------------------------------
|
2
|
+
### 한국어 발음규칙 (Korean G2P Rulebook)
|
3
|
+
### Last updated: 2019-01-31
|
4
|
+
### Yejin Cho (ycho@utexas.edu)
|
5
|
+
### 처리순서:
|
6
|
+
### 예외처리 - 유기음화(겹받침) - 겹받침관련규칙 - 경음화 - 겹받침단순화 - 비음화 - 리을 재음절화
|
7
|
+
### - 유음화 - 구개음화 - 유기음화(홑받침) - 연음 - 종성중화 - 리을 재음절화 [종료]
|
8
|
+
### ----------------------------------------------------------------------------------------
|
9
|
+
### 예외처리
|
10
|
+
ii,ll,[#-]y([aeoquv]), ii,ll,rr,y\1, # 일 연대, 삼십일여간
|
11
|
+
(h0,aa|t0,xx),ll,-ii,ll, \1,ll,rr,ii,ll, # 들일, 볼일, 할일
|
12
|
+
s0,vv,ll,-ii,kf, s0,vv,ll,rr,ii,kf, # 설익(다)
|
13
|
+
mm,uu,ll,-k0,oo,-k0,ii, mm,uu,ll,kk,oo,k0,ii, # 물고기
|
14
|
+
s0,ii,ll,-s0,ii,ll s0,ii,ll,s0,ii,ll # 실실
|
15
|
+
k0,ii,-s0,xx,lk, k0,ii,s0,xx,kf, # 기슭
|
16
|
+
c0,vv,ll,-ya,kf, c0,vv,rr,ya,kf, # 절약
|
17
|
+
k0,xx,mf,-yo,-ii,ll, k0,xx,-mm,yo,-ii,ll, # 금요일
|
18
|
+
lt,-ii, ll,-ch,ii, # 훑이
|
19
|
+
(?<=nn,vv,)lb,(?=-(c0,(uu|vv),kf|t0,(uu|vv),ng)) pf, # 넓죽/넓둥글다
|
20
|
+
(?<=s0,ii,)lh,-c0,(?=xx,ng) ll,cc, # 싫증
|
21
|
+
t0,aa,lk, t0,aa,kf, # 닭
|
22
|
+
(wq|we|oo),nf,-k0,aa,c0, \1,nf,k0,aa,tf, # 온갖
|
23
|
+
mm,aa,tf,-h0,yv,ng, mm,aa,th,yv,ng, # 맏형
|
24
|
+
k0,vv,th,-oo,s0, k0,vv,t0,oo,tf, # 겉옷
|
25
|
+
c0,uu,ll,-nn,vv,mf,-k0,ii, c0,uu,ll,rr,vv,mf,-kk,ii, # 줄넘기
|
26
|
+
h0,oo,th,-ii,-p0,uu,ll, h0,oo,nf,nn,ii,p0,uu,ll, # 홑이불
|
27
|
+
s0,aa,ks,-ii,ll, s0,aa,ng,nn,ii,ll, # 삯일
|
28
|
+
mm,qq,nf,-ii,pf, mm,qq,nf,nn,ii,pf, # 맨입
|
29
|
+
kk,oo,ch,-ii,ph, kk,oo,nf,nn,ii,pf, # 꽃잎
|
30
|
+
nn,qq,-p0,oo,kf,-ya,kf, nn,qq,p0,oo,ng,nn,ya,kf, # 내복약
|
31
|
+
h0,aa,nf,-yv,-rr,xx,mf, h0,aa,nf,nn,yv,rr,xx,mf, # 한여름
|
32
|
+
nn,aa,mf,-c0,oo,nf,-yv,-p0,ii, nn,aa,mf,c0,oo,nf,nn,yv,p0,ii, # 남존여비
|
33
|
+
s0,ii,nf,-yv,-s0,vv,ng, s0,ii,nf,nn,yv,s0,vv,ng, # 신여성
|
34
|
+
s0,qq,kf,-yv,nf,-ph,ii,ll, s0,qq,ng,nn,yv,nf,ph,ii,ll, # 색연필
|
35
|
+
t0,aa,mf,-yo, t0,aa,mf,nn,yo, # 담요
|
36
|
+
nn,uu,nf,-yo,-k0,ii, nn,uu,nf,nn,yo,k0,ii, # 눈요기
|
37
|
+
vv,pf,-yo,ng, vv,mf,nn,yo,ng, # (영)업용
|
38
|
+
s0,ii,kf,-yo,ng,-yu, s0,ii,k0,yo,ng,nn,yu, # 식용유
|
39
|
+
nf,-yu,nf,-rr,ii, nf,nn,yu,ll,rr,ii, # (국민)윤리
|
40
|
+
(c0|s0),(aa|oo|uu),ll,-ii,(ph|p0|pf), \1,\2,ll,rr,ii,pf, # 잘입다, 솔잎, 술잎
|
41
|
+
(?=(^|#))h0,aa,nf,-ii,ll, h0,aa,nf,nn,ii,ll, # 한일
|
42
|
+
(?=(^|#))mm,aa,kf,-ii,ll, mm,aa,ng,nn,ii,ll, # 막일
|
43
|
+
mm,oo,ll,-s0,aa,ng,-s0,ii,kf, mm,oo,ll,ss,aa,ng,s0,ii,kf, # 몰상식
|
44
|
+
oo,s0,#ii,pf, oo,nf,nn,ii,pf, # 옷입(다)
|
45
|
+
(nf|ll),-yv,-s0,vv,-s0, \1,nn,yv,s0,vv,tf, # (스물/서른)여섯
|
46
|
+
(ng|mf|nf),-y([aeoquv]), \1,nn,y\2, # 밤윷, 직행열차, 콩엿, 볶은엿
|
47
|
+
(wv|ii),ll,-y([aeoquv]), \1,rr,y\2, # 일/월요일
|
48
|
+
ll,-y([aeoquv]), ll,rr,y\1, # 불여우, 물약, 서울역, 물엿, 물옆, 굴옆, 휘발유, 유들유들
|
49
|
+
ii,ll,-c0,vv,ll, ii,ll,cc,vv,ll, # 일절
|
50
|
+
(th|tf|s0),-y([aeoquv]), nf,-nn,y\2, # 쑥갓요
|
51
|
+
(<=^|#)mm,aa,kf,-ii,ll mm,aa,ng,nn,ii,ll # 막일
|
52
|
+
k0,uu,-k0,xx,nf,-rr,yu, k0,uu,k0,xx,nf,nn,yu, # 구근류
|
53
|
+
k0,aa,ll,-([ct])0,xx,ng, k0,aa,ll,\1\1,xx,ng, # 갈등/갈증
|
54
|
+
p0,aa,ll,-t0,oo,ng, p0,aa,ll,tt,oo,ng, # 발동
|
55
|
+
c0,vv,ll,-t0,oo, c0,vv,ll,tt,oo, # 절도
|
56
|
+
mm,aa,ll,-s0,aa,ll, mm,aa,ll,ss,aa,ll, # 말살
|
57
|
+
p0,uu,ll,-s0, p0,uu,ll,ss, # 불소/불세출
|
58
|
+
ii,ll,-s0,ii, ii,ll,ss,ii, # 일시
|
59
|
+
p0,aa,ll,-c0,vv,nf, p0,aa,ll,cc,vv,nf, # 발전
|
60
|
+
(?<=(s0,ii,nf,|s0,aa,mf,)-)(c|k|t)0, \2\2, # 신고, 신다, 신자, 삼고, 삼다, 삼자
|
61
|
+
(?<=k0,ii,mf,-)p0, pp, # 김밥
|
62
|
+
(?<=t0,vv,-t0,xx,mf,-)c0, cc, # 더듬지
|
63
|
+
c0,aa,mf,-c0,aa,-rr,ii, c0,aa,mf,cc,aa,rr,ii, # 잠자리
|
64
|
+
(?<=(ng|ll),-)c0,(?=uu,ll,-k0,ii) cc, # 물줄기, 강줄기
|
65
|
+
(?<=(nf|ll),-)p0,vv,pf, pp,vv,pf, # 문법, 불법
|
66
|
+
(?<=(nf|tf),-)p0,(?=aa,-rr,aa,mf) pp, # 신바람, 늦바람
|
67
|
+
p0,aa,-rr,aa,mf,-k0,yv,ll, p0,aa,rr,aa,mf,kk,yv,ll, # 바람결
|
68
|
+
(?<=(mf|kf),-)p0,(?=aa,pf,) pp, # 아침밥, 점심밥, 저녁밥
|
69
|
+
(?<=nn,uu,nf,-)t0, tt, # 눈동자, 눈대중
|
70
|
+
mm,aa,kf,-yv,mf, mm,aa,ng,nn,yv,mf, # 늑막염, 결막염
|
71
|
+
p0,aa,lb,-(t|k)0, p0,aa,pf,\1\1, # 밟다, 밟고
|
72
|
+
p0,aa,lb,-nn, p0,aa,mf,nn, # 밟는
|
73
|
+
nn,vv,lb,-(t|k)0, nn,vv,ll,\1\1, # 넓다, 넓고
|
74
|
+
mm,(aa|vv),s0,-ii,ss,-t0,aa, mm,\1,t0,ii,tf,tt,aa, # 맛있다
|
75
|
+
mm,(aa|vv),s0,-vv,ps,-t0,aa, mm,\1,t0,vv,pf,tt,aa, # 맛없다
|
76
|
+
c0,vv,c0,-vv,-mm,ii, c0,vv,t0,vv,mm,ii, # 젖어미
|
77
|
+
h0,vv,s0,-uu,s0,-xx,mf, h0,vv,t0,uu,s0,xx,mf, # 헛웃음
|
78
|
+
k0,aa,ps,-vv,-ch,ii, k0,aa,p0,vv,ch,ii, # 값어치
|
79
|
+
k0,aa,ps,-ii,ss,-nn,xx,nf, k0,aa,p0,ii,nf,nn,xx,nf, # 값있는
|
80
|
+
c0,vv,lm,-c0,ii, c0,vv,mf,cc,ii, # 젊지
|
81
|
+
oo,lm,-k0,(?=[iy]) oo,mf,k0, # 옮기(다)
|
82
|
+
k0,uu,lm,-k0,ii,-t0,aa, k0,uu,mf,k0,ii,t0,aa, # 굶기다
|
83
|
+
(nn|k0|h0),aa,ll,-(p|s|c|k|t)0, \1,aa,ll,\2\2, # 갈바, 할바, 만날것
|
84
|
+
ch,vv,s0,-ii,nf, ch,vv,t0,ii,nf, # 첫인(상)
|
85
|
+
(?<=(mf|nf),-)ii,-p0,uu,ll, nn,ii,p0,uu,ll, # 솜이불
|
86
|
+
(?<=(nf|ll),-)k0,oo,-rr,ii, kk,oo,rr,ii, # 문고리
|
87
|
+
(?<=(nf|ll),-)s0,qq, ss,qq, # 산새, 들새
|
88
|
+
(?<=(nf|ll),-)c0,qq,-c0,uu, cc,qq,c0,uu, # 손재주, 글재주
|
89
|
+
k0,ii,ll,-k0,aa, k0,ii,ll,kk,aa, # 길가
|
90
|
+
mm,uu,ll,-t0,oo,ng,-ii, mm,uu,ll,tt,oo,ng,ii, # 물동이
|
91
|
+
mm,uu,ll,-c0, mm,uu,ll,-cc, # 물증
|
92
|
+
(?<=(nf|ll),-)p0,aa,-t0,aa,kf, pp,aa,t0,aa,kf, # 발바닥, 손바닥
|
93
|
+
(?<=(nf|ll),-)s0,oo,kf, ss,oo,kf, # 굴속, 물속
|
94
|
+
(?<=s0,uu,ll,-)(c|p|t)0, \1\1, # 술잔, 술독, 술병, 술자리
|
95
|
+
k0,aa,ng,-k0,aa, k0,aa,ng,kk,aa, # 강가
|
96
|
+
(?<=(ng|mf),-)t0,aa,ll, tt,aa,ll, # 초승달
|
97
|
+
t0,xx,ng,-p0,uu,ll, t0,xx,ng,pp,uu,ll, # 등불
|
98
|
+
ch,aa,ng,-s0,aa,ll, ch,aa,ng,ss,aa,ll, # 창살
|
99
|
+
(?<=(ll|ng),-)c0,uu,ll,-k0,ii, k0,aa,ng,cc,uu,ll,k0,ii, # 강줄기, 물줄기
|
100
|
+
aa,nf,-k0,oo, aa,nf,kk,oo, # 안고
|
101
|
+
(?<=kk,yv,-aa,nf,-)(t|c)0, \1\1, # 껴안지, 껴안다
|
102
|
+
ii,-c0,uu,kf,-ii,-c0,uu,kf, ii,c0,uu,ng,nn,ii,c0,uu,kf, # 이죽이죽
|
103
|
+
ya,-k0,xx,mf,-ya,-k0,xx,mf, ya,k0,xx,mf,nn,ya,k0,xx,mf, # 야금야금
|
104
|
+
p0,ee,-k0,qq,s0,-ii,s0, p0,ee,k0,qq,nf,nn,ii,tf, # 베갯잇
|
105
|
+
kk,qq,s0,-ii,ph, kk,qq,nf,nn,ii,pf, # 깻잎
|
106
|
+
nn,aa,-mm,uu,s0,-ii,ph, nn,aa,mm,uu,nf,nn,ii,pf, # 나뭇잎
|
107
|
+
qq,s0,-yv,ll, qq,nf,nn,yv,ll, # 도리깻열
|
108
|
+
t0,wi,s0,-(?=[aeqiouyvwx]) t0,wi,nf,-nn, # 뒷윷, 뒷얘기
|
109
|
+
nn,xx,c0,-yv,-rr,xx,mf, nn,xx,tf,nn,yv,rr,xx,mf, # 늦여름
|
110
|
+
t0,ii,-k0,xx,tf,-(ii|xx|ee), t0,ii,k0,xx,s0,\1, # 디귿에, 디귿이
|
111
|
+
(c0|ch|th|h0),ii,-xx,(c0|ch|th|h0),-(ii|xx|ee), \1,ii,xx,s0,\3, # 치읓이, 지읒에
|
112
|
+
ph,ii,-xx,ph,-(ii|xx|ee), ph,ii,xx,p0,\1, # 피읖에
|
113
|
+
kh,ii,-xx,kh,-(ii|xx|ee), kh,ii,xx,k0,\1, # 키읔이
|
114
|
+
### 유기음화 (겹받침)
|
115
|
+
l(b|p),-h0, ll,-ph,
|
116
|
+
nh,-(c|k|t)0, nf,-\1h,
|
117
|
+
lh,-(c|k|t)0, ll,-\1h,
|
118
|
+
lk,-h0, ll,-kh,
|
119
|
+
nc,-h0, nf,-ch,
|
120
|
+
### 겹받침 규칙 (ㄹㄱ)
|
121
|
+
(k0,aa,|k0,uu,|k0,vv,|oo,|p0,aa,|nn,aa,|nn,xx,|p0,uu,|^ii,|-,ii,mm,aa,|mm,uu,|(^|-,)vv,)lk,-(t0|c0|s0), \1kf,-\3,
|
122
|
+
(k0,aa,|k0,uu,|k0,vv,|vv,|oo,|mm,aa,|p0,aa,|nn,aa,|nn,xx,|mm,uu,|p0,uu,|^ii,|-,ii,)lk,-k0, \1ll,-kk,
|
123
|
+
### 겹받침 규칙 (ㄴㅎ)
|
124
|
+
nh,-(k|t|c)0, nf,-\1h,
|
125
|
+
nh,-s0, nf,-ss,
|
126
|
+
nh,-nn, nf,-nn,
|
127
|
+
nh,-(?=[aeqiouyvwx]) -nn,
|
128
|
+
### 겹받침 규칙 (ㄹㅎ)
|
129
|
+
lh,-nn, ll,-rr,
|
130
|
+
lh,-(k|t|c)0, ll,-\1h,
|
131
|
+
lh,-s0, ll,-ss,
|
132
|
+
lh,-(?=[aeqiouyvwx]) -rr,
|
133
|
+
### 겹받침 규칙 (ㄴㅈ)
|
134
|
+
nc,-([ktsc])0, nf,-\1\1,
|
135
|
+
### 겹받침 규칙 (ㄹㅁ)
|
136
|
+
(c0,vv,|c0,ii,|k0,uu,|t0,aa,|(^|-,)oo,|k0,oo,)lm,-([ktsc])0, \1mf,-\3\3,
|
137
|
+
### 겹받침 규칙 (ㄹㅂ)
|
138
|
+
(p0,aa,|tt,vv,|(^|-,)yv,|nn,vv,|(^|-,)ya,|cc,aa,)lb,-([ktsc])0, \1ll,-\4\4,
|
139
|
+
### 겹받침 규칙 (ㄹㅌ)
|
140
|
+
h0,(aa|uu),lt,-nn, h0,\1,ll,-ll,
|
141
|
+
h0,(aa|uu),lt,-([ktsc])0, h0,\1,ll,-\2\2,
|
142
|
+
### 경음화
|
143
|
+
lk,-(c|k|p|s|t)0, kf,-\1\1,
|
144
|
+
l(b|p),-p0, pf,-pp,
|
145
|
+
s0,-p0, tf,-pp,
|
146
|
+
l(b|t),-(c|k|s|t|p)0, ll,-\2\2,
|
147
|
+
lp,-(c|k|s|t)0, pf,-\1\1,
|
148
|
+
(c[h0]|s[s0]|t[fh]),-(c|k|s|t)0, tf,-\2\2,
|
149
|
+
k[fhks],-(c|k|p|s|t)0, kf,-\1\1,
|
150
|
+
p[sfh],-(c|k|p|s|t)0, pf,-\1\1,
|
151
|
+
(?<=(kf|kh|ks|ss|c0|ch|tf|th),-)p0, pp,
|
152
|
+
h0,-s0, -ss,
|
153
|
+
nh,-s0, nf,-ss,
|
154
|
+
lh,-s0, ll,-ss,
|
155
|
+
### 겹받침 단순화: 어말 또는 자음 앞
|
156
|
+
(ks|lk),(?=(#|$|-[ptkshcmnr])) kf,
|
157
|
+
n[ch],(?=(#|$|-[ptkshcmnr])) nf,
|
158
|
+
l[bsth],(?=(#|$|-[ptkshcmnr])) ll,
|
159
|
+
lm,(?=(#|$|-[ptkshcmnr])) mf,
|
160
|
+
(ps|lp),(?=(#|$|-[ptkshcmnr])) pf,
|
161
|
+
### 겹받침 단순화: 모음 앞
|
162
|
+
([kp])s,-(?=[aeqiouyvwx]) \1f,-ss,
|
163
|
+
ls,-(?=[aeqiouyvwx]) ll,-ss,
|
164
|
+
nc,-(?=[aeqiouyvwx]) nf,-c0,
|
165
|
+
lk,-(?=[aeqiouyvwx]) ll,-k0,
|
166
|
+
lm,-(?=[aeqiouyvwx]) ll,-mm,
|
167
|
+
lb,-(?=[aeqiouyvwx]) ll,-p0,
|
168
|
+
l([tp]),-(?=[aeqiouyvwx]) ll,-\1h,
|
169
|
+
### 비음화
|
170
|
+
(?<=[pk])0,-rr, f,-nn,
|
171
|
+
(c0|ch|s0|ss|tf|nh|h0),-nn, nf,-nn,
|
172
|
+
nc,-(p|t|k)0, nf,-\1\1,
|
173
|
+
nc,(?=-[ptkshcmnr]) nf,
|
174
|
+
lm,-k0, mf,-kk,
|
175
|
+
lm,(?=-[ptkshcmnr]) mf,
|
176
|
+
k[fhks],(?=-(nn|mm),) ng,
|
177
|
+
lk,(?=-(nn|mm),) ng,
|
178
|
+
p[sfh],(?=-(nn|mm),) mf,
|
179
|
+
l[bp],(?=-(nn|mm),) mf,
|
180
|
+
(?<=(mf|ng|pf|kf),-)rr, nn,
|
181
|
+
(c0|ch|s0|ss|tf|nh|h0),(?=-mm,) nf,
|
182
|
+
### 리을 재음절화
|
183
|
+
ll,-(?=y) -rr,
|
184
|
+
### 유음화
|
185
|
+
(nf|ll),-rr, ll,-rr,
|
186
|
+
l[lht],-nn, ll,-rr,
|
187
|
+
### 구개음화
|
188
|
+
tf,-(?=[iy]) -c0,
|
189
|
+
th,-(?=[iy]) -ch,
|
190
|
+
tf,-h0,(?=[iy]) -ch,
|
191
|
+
### 유기음화 (홑받침)
|
192
|
+
(p|k)f,-h0, -\1h,
|
193
|
+
h0,-(c|k|t)0, -\1h,
|
194
|
+
(tf|th|s0),(-|#)h0, -th,
|
195
|
+
### 연음규칙
|
196
|
+
(s0|ss|kk|p0|ph|pp|t0|th|tt|c0|ch|kh|kk|k0|mm|nn),-(?=[aeqiouyvwx]) -\1,
|
197
|
+
nh,-(?=[aeqiouyvwx]) -nn,
|
198
|
+
(s0|ss|c0|ch|th),(?=-[ptkshcmnr]) tf,
|
199
|
+
h0,-(?=[aeqiouyvwx]) -
|
200
|
+
lh,-?(?=[aeqiouyvwx]) -rr,
|
201
|
+
(p|t|k)f,-?(?=[aeqiouyvwx]) -\g<1>0,
|
202
|
+
(m|n)f,-?(?=[aeqiouyvwx]) -\1\1,
|
203
|
+
### 종성규칙
|
204
|
+
(s0|ss|c0|ch|th),(?=-|#|$) tf,
|
205
|
+
(kh|kk|ks|lk),(?=-|#|$|[ptkshcmnr]) kf, # (ks|lk),(?=-[ptkshcmnr]) kf,
|
206
|
+
(ph|lp|ps),(?=-|#|$|[ptkshcmnr]) pf,
|
207
|
+
(?<=[ptkshcmnr].),-(?=[aeqiouyvwx]) ,
|
208
|
+
l[bhstp],(?=-|#|$|[ptkshcmnr]) ll, # l[bt],(?=-[ptkshcmnr]) ll,
|
209
|
+
nh,(?=-|#|$|[ptkshcmnr]) nf,-
|
210
|
+
### 리을 재음절화
|
211
|
+
(?<=[aeqiouyvwx].,)ll,-(?=[aeqiouyvwx]) -rr,
|
212
|
+
ll,-ll, ll,-rr,
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from phoonnx.thirdparty.mantoq.buck import symbols
|
2
|
+
from phoonnx.thirdparty.mantoq.buck.tokenization import (arabic_to_phonemes, phon_to_id_,
|
3
|
+
phonemes_to_tokens, simplify_phonemes)
|
4
|
+
from phoonnx.thirdparty.mantoq.buck.tokenization import tokens_to_ids as _tokens_to_id
|
5
|
+
from phoonnx.thirdparty.mantoq.num2words import num2words
|
6
|
+
import warnings
|
7
|
+
from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
|
8
|
+
# Probe for onnxruntime; tashkeel() returns its input unchanged when the
# resulting flag is False.
# NOTE(review): TashkeelDiacritizer is imported unconditionally above this
# guard. If that module itself imports onnxruntime at import time, the
# ImportError would be raised there and this fallback would never be reached
# -- confirm against phoonnx/thirdparty/tashkeel/__init__.py.
try:
    import onnxruntime

    _TASHKEEL_AVAILABLE = True
except ImportError:
    _TASHKEEL_AVAILABLE = False

# Cached TashkeelDiacritizer instance; created on first use by tashkeel().
_DIACRITIZER_INST = None

# Plain-dict copy of ``phon_to_id_`` from the Buckwalter tokenization module.
MANTOQ_SYMBOLS = dict(phon_to_id_)
# ``phon_to_id_`` entries for the padding and end-of-sequence tokens.
MANTOQ_SPECIAL_SYMBOLS = dict(
    pad=phon_to_id_[symbols.PADDING_TOKEN],
    eos=phon_to_id_[symbols.EOS_TOKEN],
)
# Maps Arabic-specific punctuation marks to their English equivalents
AR_SPECIAL_PUNCS_TABLE = str.maketrans("،؟؛", ",?;")
# Translation table that deletes "+", "=", "<" and ">".
# NOTE(review): not referenced by any function visible in this module.
OMITTED_SYMBOLS = str.maketrans("", "", "+=<>")

# Quotes
QUOTES = '“”„«»'
# Every typographic quote in QUOTES is mapped to a plain ASCII double quote.
QUOTES_TABLE = str.maketrans(QUOTES, '"' * len(QUOTES))
# Square and curly brackets are mapped to parentheses.
BRACKETS_TABLE = str.maketrans("[]{}", "()()")
|
30
|
+
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
def tashkeel(text: str) -> str:
    """Return ``text`` diacritized by the shared ``TashkeelDiacritizer``.

    If the module-level ``_TASHKEEL_AVAILABLE`` flag is false, a
    ``UserWarning`` is issued and ``text`` is returned unchanged. Otherwise
    one ``TashkeelDiacritizer`` is created on the first call, stored in the
    module-level ``_DIACRITIZER_INST`` and reused by later calls.
    """
    global _DIACRITIZER_INST

    if _TASHKEEL_AVAILABLE:
        # Build the diacritizer lazily so importing this module stays cheap.
        if _DIACRITIZER_INST is None:
            _DIACRITIZER_INST = TashkeelDiacritizer()
        diacritizer = _DIACRITIZER_INST
        return diacritizer.diacritize(text)

    # Best-effort fallback: warn and hand the input back untouched.
    message = (
        "Warning: The Tashkeel feature will not be available. "
        "Please re-install with the `libtashkeel` extra."
    )
    warnings.warn(message, UserWarning)
    return text
|
45
|
+
|
46
|
+
def g2p(
    text: str,
    add_tashkeel: bool = True,
    process_numbers: bool = True,
    append_eos: bool = False,
) -> tuple[str, list[str]]:
    """Normalize ``text`` and convert it to phoneme tokens.

    Processing order: Arabic punctuation, typographic quotes and
    square/curly brackets are translated to their ASCII counterparts; the
    text is optionally diacritized; numbers are optionally passed through
    ``num2words``; the result is phonemized, simplified and tokenized.

    Args:
        text: Input text.
        add_tashkeel: When true, run ``tashkeel`` on the text first.
        process_numbers: When true, pass the text through ``num2words``.
        append_eos: When false, the last token returned by
            ``phonemes_to_tokens`` is dropped.

    Returns:
        A ``(normalized_text, tokens)`` tuple: the text as it was handed to
        the phonemizer, and the resulting token list.
    """
    text = (
        text.translate(AR_SPECIAL_PUNCS_TABLE)
        .translate(QUOTES_TABLE)
        .translate(BRACKETS_TABLE)
    )
    if add_tashkeel:
        text = tashkeel(text)
    if process_numbers:
        text = num2words(text)
    normalized_text = text

    phones = simplify_phonemes(arabic_to_phonemes(text))
    tokens = phonemes_to_tokens(phones)
    if not append_eos:
        # NOTE(review): presumably phonemes_to_tokens always ends the list
        # with the EOS token, which is what gets stripped here -- confirm.
        tokens = tokens[:-1]
    return normalized_text, tokens
|
64
|
+
|
65
|
+
|
66
|
+
def tokens2ids(tokens: list[str]) -> list[int]:
    """Convert ``tokens`` to ids via the Buckwalter ``tokens_to_ids`` helper."""
    ids = _tokens_to_id(tokens)
    return ids
|
File without changes
|