sigilyph-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigilyph/__init__.py +18 -0
- sigilyph/core/__init__.py +0 -0
- sigilyph/core/bert_align.py +163 -0
- sigilyph/core/g2p_func.py +47 -0
- sigilyph/core/norm_func_bk.py +98 -0
- sigilyph/core/predict.py +64 -0
- sigilyph/core/preprocess.py +16 -0
- sigilyph/core/py2phone.dict +2165 -0
- sigilyph/core/sigilyph_class.py +246 -0
- sigilyph/core/special_dict.json +26 -0
- sigilyph/core/symbols.py +445 -0
- sigilyph/core/text_process.py +328 -0
- sigilyph/fst_tool/__init__.py +0 -0
- sigilyph/fst_tool/infer_normalizer.py +49 -0
- sigilyph/fst_tool/processor.py +122 -0
- sigilyph/fst_tool/token_parser.py +159 -0
- sigilyph/text_norm/__init__.py +0 -0
- sigilyph/text_norm/norm_func.py +155 -0
- sigilyph/text_norm/norm_func_new.py +89 -0
- sigilyph/text_norm/sigilyph_norm.py +179 -0
- sigilyph-0.5.2.dist-info/METADATA +24 -0
- sigilyph-0.5.2.dist-info/RECORD +24 -0
- sigilyph-0.5.2.dist-info/WHEEL +5 -0
- sigilyph-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,246 @@ sigilyph/core/sigilyph_class.py
'''
FilePath: /python-Sigilyph/sigilyph/core/sigilyph_class.py
Description:
Author: Yixiang Chen
version:
Date: 2025-08-12 14:42:50
LastEditors: Yixiang Chen
LastEditTime: 2026-01-16 18:45:50
'''

import langid
import re
import json

import jieba
import jieba.posseg

from sigilyph.core.g2p_func import g2p_en, g2p_cn
from sigilyph.text_norm.norm_func import preprocess_first, post_process, text_norm_en, text_norm_cn
from sigilyph.core.symbols import punctuation
from sigilyph.core.predict import before_replace_dict, special_word_dict, special_phrase

norm_func_dict = {
    'en': text_norm_en,
    'zh': text_norm_cn
}

g2p_func_dict = {
    'en': g2p_en,
    'zh': g2p_cn
}


class Sigilyph:
    def __init__(self, before_dict_path=None, special_dict_path=None):
        self.sil1symbol = '-'
        self.punctuation = punctuation

        self.before_replace_dict = before_replace_dict
        if before_dict_path:
            with open(before_dict_path, 'r', encoding="utf-8") as obdp:
                extra_before_dict = json.load(obdp)
            self.before_replace_dict.update(extra_before_dict)

        self.special_word_dict = special_word_dict
        if special_dict_path:
            with open(special_dict_path, 'r', encoding="utf-8") as obdp:
                extra_special_dict = json.load(obdp)
            self.special_word_dict.update(extra_special_dict)

        self.special_phrase = special_phrase

    def forward(self, text, lang, spflag=False, norm_use_lang='zh'):
        phones = self.text_process(text, lang, spflag, norm_use_lang)
        phones = self.replace_sil2label(phones)
        return phones

    def text_process(self, text, lang, spflag=False, norm_use_lang='zh'):
        text = preprocess_first(text, self.before_replace_dict, special_word_dict, norm_use_lang=norm_use_lang)
        text = self.text_norm(text, norm_use_lang)
        text = post_process(text, special_word_dict)

        multi_lang_text_list = self.text_split_lang(text, lang)

        all_phone = []
        for text_split_dict in multi_lang_text_list:
            use_lang = text_split_dict['lang']
            use_text = text_split_dict['text_split']
            if use_lang == 'phone':
                phonelist = use_text.split()
                all_phone.extend(phonelist)
            else:
                if use_lang not in norm_func_dict.keys():
                    use_lang = 'zh'
                #use_text = self.text_norm(use_text, use_lang)
                phone_list = self.g2p(use_text, use_lang)
                #all_phone.append('sil')
                all_phone.append('sil_lang')
                all_phone.append('<sp>')
                all_phone.extend(phone_list)
        #all_phone = postprocess(all_phone)
        all_phone = self.postprocess_tts(all_phone)
        if not spflag:
            while '<sp>' in all_phone:
                all_phone.remove('<sp>')
        return all_phone

    def text_process_old(self, text, lang, spflag=False, norm_use_lang='zh'):
        text = preprocess_first(text, self.before_replace_dict, special_word_dict, norm_use_lang=norm_use_lang)

        multi_lang_text_list = self.text_split_lang(text, lang)

        all_phone = []
        for text_split_dict in multi_lang_text_list:
            use_lang = text_split_dict['lang']
            use_text = text_split_dict['text_split']
            if use_lang == 'phone':
                phonelist = use_text.split()
                all_phone.extend(phonelist)
            else:
                if use_lang not in norm_func_dict.keys():
                    use_lang = 'zh'
                use_text = self.text_norm(use_text, use_lang)
                phone_list = self.g2p(use_text, use_lang)
                #all_phone.append('sil')
                all_phone.append('sil_lang')
                all_phone.append('<sp>')
                all_phone.extend(phone_list)
        #all_phone = postprocess(all_phone)
        all_phone = self.postprocess_tts(all_phone)
        if not spflag:
            while '<sp>' in all_phone:
                all_phone.remove('<sp>')
        return all_phone

    ############### split text in line with lang ##############
    def text_split_lang(self, text, lang):
        if lang == 'ZH' or lang == 'zh':
            multi_lang_text_list = [{'lang': 'zh', 'text_split': text}]
        elif lang == 'en':
            multi_lang_text_list = [{'lang': 'en', 'text_split': text}]
        else:
            pretext_split = re.split(r"(\[.*?\])", text, flags=re.I | re.M)
            multi_lang_text_list = []
            pretext_split = list(filter(None, pretext_split))
            for utext in pretext_split:
                if utext[0] != '[':
                    pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
                    text_split = re.findall(pattern, utext)
                    #print(text_split)
                    for idx in range(len(text_split)):
                        tmpts = text_split[idx]
                        tmp_lang = langid.classify(tmpts)[0]
                        if len(tmpts) > 20:
                            if not self.has_punc(tmpts[:-1]):
                                tmpts = self.add_pause(tmpts, 'p')
                            if not self.has_punc(tmpts[:-1]):
                                tmpts = self.add_pause(tmpts, 'v')
                        if tmpts in self.special_phrase:
                            tmpts = tmpts + self.sil1symbol
                        if tmp_lang in ['zh', 'jp', 'ja']:
                            tmp_lang = 'zh'
                            tmpts = tmpts.replace(' ', self.sil1symbol)
                        else:
                            tmp_lang = 'en'
                        if not tmpts.isspace():
                            multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
                else:
                    phones = utext[1:-1]
                    multi_lang_text_list.append({'lang': 'phone', 'text_split': phones})
        return multi_lang_text_list

    ########## add pause ###############
    def has_punc(self, text):
        for char in text:
            if char in [',', '.', '!', '?', ',', '。', '?', '!', self.sil1symbol]:
                return True
        return False

    def add_pause(self, text, tf='v'):
        segment = jieba.posseg.cut(text.strip())
        wlist = []
        flist = []
        for x in segment:
            wlist.append(x.word)
            flist.append(x.flag)
        idx = self.search_ele_mid(flist, tf)
        if idx != len(flist) - 1:
            wlist.insert(idx, self.sil1symbol)
        outtext = ''.join(wlist)
        return outtext

    def search_ele_mid(self, flaglist, tf='v'):
        nowidx = -1
        halflen = (len(flaglist)) // 2
        for gap in range(len(flaglist) - halflen):
            nowidx = halflen - gap
            if flaglist[nowidx] == tf:
                return nowidx
            nowidx = halflen + gap
            if flaglist[nowidx] == tf:
                return nowidx
        return nowidx

    ######## text norm #########
    def text_norm(self, text, lang):
        outtext = norm_func_dict[lang](text)
        return outtext

    ############ g2p ################
    def g2p(self, text, lang):
        phoneme_list = g2p_func_dict[lang](text)
        return phoneme_list

    ############# post process #############
    def postprocess_tts(self, phonelist):
        #outlist = ['sil', '<sp>']
        outlist = []
        #print(phonelist)
        for idx in range(len(phonelist)):
            pm = phonelist[idx]
            if pm not in self.punctuation:
                outlist.append(pm)
            elif pm == self.sil1symbol:
                outlist.append('sil_1')
            else:
                #outlist.append('sil')
                outlist.append('sil_punc')
                #outlist.append('<sp>')
        #if outlist[-1] == 'sil':
        #    outlist.append('<sp>')
        #elif outlist[-2] != 'sil':
        #    outlist.append('sil')
        #    outlist.append('<sp>')
        if phonelist[-2] not in self.punctuation and outlist[-1].split('_')[0] != 'sil':
            #outlist.append('sil')
            outlist.append('sil_end')
            outlist.append('<sp>')
        return outlist

    ########## replace silence token ###############
    def replace_sil2label(self, phones):
        #phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
        phones = ['' if xx == 'sil_lang' else xx for xx in phones]
        phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
        phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
        phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
        phones = list(filter(None, phones))
        #outphones = []
        outphones = ['sil_1']
        for ele in phones:
            if outphones == []:
                outphones.append(ele)
            else:
                if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                    outphones[-1] = 'sil_2'
                    #outphones[-1] = 'sil_1'
                else:
                    outphones.append(ele)
        #if outphones[-1].split('_')[0] == 'sil':
        #    outphones = outphones[:-1]
        return outphones
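For orientation, a minimal usage sketch of the Sigilyph class above (an illustration, not code shipped in the package; it assumes sigilyph and its jieba/langid dependencies are installed, and the sample sentence is made up). Passing any lang value other than 'zh'/'ZH'/'en' routes the text through the per-segment language detection in text_split_lang:

    from sigilyph.core.sigilyph_class import Sigilyph

    # Optional JSON paths can extend the built-in replacement dictionaries,
    # e.g. a file shaped like special_dict.json below.
    frontend = Sigilyph()

    text = "今天天气不错, let's take a walk."
    # forward() runs text_process() (normalize, split by language, g2p per segment)
    # and then replace_sil2label(), which drops the sil_lang markers and collapses
    # sil_punc / sil_end into sil_1 / sil_2 labels.
    phones = frontend.forward(text, lang='mix', spflag=False)
    print(phones)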
@@ -0,0 +1,26 @@ sigilyph/core/special_dict.json
{
    "甄嬛传": "[zh en1 h uan2 zh uan4]",
    "藏海传": "[z ang4 h ai3 zh uan4]",
    "藏海": "[z ang4 h ai3]",
    "井柏然": "[j ing3 b o2 r an2]",
    "Alibaba": "[AA a1 l i3 b a1 b a1]",
    "TFBOYS": "[T IY1 EH1 F B OY1 Z]",
    "肖战": "[x iao1 zh an4]",
    "肖申克": "[x iao1 sh en1 k e4]",
    "肖像": "[x iao1 x iang4]",
    "戛纳": "[g a1 n a4]",
    "爆肚": "[b ao4 d u3 EE]",
    "花呗": "[h ua1 b ei4]",
    "肚儿": "[d u3 er5]",
    "种了": "[zh ong4 l e5]",
    "仇老五": "[q iu2 l ao3 w u3]",
    "彩云曲": "[c ai3 y vn2 q v3]",
    "雷洛传": "[l ei2 l uo4 zh uan4]",
    "朝雪录": "[zh ao1 x ve3 l u4]",
    "凡人修仙传": "[f an2 r en2 x iu1 x ian1 zh uan4]",
    "+86": "[b a1 l iu]",
    "(正八十六)": "[b a1 l iu]",
    "八 a.m.": "[sh ang4 w u3 b a1 d ian3]",
    "F.I.R.": "[EH1 F sil_1 AY2 sil_1 AA0 R sil_1]",
    "S.C.I": "[sil_2 EH2 S sil_1 S IY1 sil_1 AY1 sil_1]"
}
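The bracketed values above use the phone-passthrough notation handled by Sigilyph.text_split_lang: a [...] span survives language splitting, is stripped of its brackets, and is emitted as whitespace-separated phone tokens instead of going through g2p. A small standalone sketch of just that step (illustrative, not code from the package):

    import re

    value = "[x iao1 zh an4]"           # the special_dict.json entry for "肖战"
    text = "我喜欢" + value + "的表演"
    for chunk in filter(None, re.split(r"(\[.*?\])", text)):
        if chunk.startswith("["):
            print(chunk[1:-1].split())  # ['x', 'iao1', 'zh', 'an4'], emitted as phones
        else:
            print(chunk)                # plain text, classified with langid and sent to g2p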