sigilyph-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sigilyph might be problematic.
- sigilyph/__init__.py +11 -0
- sigilyph/core/__init__.py +0 -0
- sigilyph/core/g2p_func.py +47 -0
- sigilyph/core/norm_func.py +85 -0
- sigilyph/core/predict.py +66 -0
- sigilyph/core/preprocess.py +16 -0
- sigilyph/core/sigilyph_class.py +215 -0
- sigilyph/core/symbols.py +444 -0
- sigilyph/core/text_process.py +327 -0
- sigilyph-0.0.1.dist-info/METADATA +23 -0
- sigilyph-0.0.1.dist-info/RECORD +13 -0
- sigilyph-0.0.1.dist-info/WHEEL +5 -0
- sigilyph-0.0.1.dist-info/top_level.txt +1 -0
sigilyph/core/text_process.py
@@ -0,0 +1,327 @@
'''
FilePath: /python-Sigilyph/sigilyph/core/text_process.py
Descripttion:
Author: Yixiang Chen
version:
Date: 2025-03-31 16:31:26
LastEditors: Yixiang Chen
LastEditTime: 2025-08-12 14:40:42
'''


import langid
import re

import jieba
import jieba.posseg

from sigilyph.core.g2p_func import g2p_en, g2p_cn
from sigilyph.core.norm_func import preprocess_first, text_norm_en, text_norm_cn

from sigilyph.core.symbols import base_phone_set, cn_phone_set, en_phone_set, punctuation, special_phrase

#all_phone_set = [] + sorted(set(base_phone_set + cn_phone_set + en_phone_set))
#all_phone_set = [] + list(set(base_phone_set)) + list(set(cn_phone_set + en_phone_set))
all_phone_set = [] + sorted(set(base_phone_set)) + sorted(set(cn_phone_set)) + sorted(set(en_phone_set))
all_phone_dict = {xx:idx for idx, xx in enumerate(all_phone_set)}

norm_func_dict = {
    'en': text_norm_en,
    'zh': text_norm_cn
}

g2p_func_dict = {
    'en': g2p_en,
    'zh': g2p_cn
}

sil1symbol='-'

def text_split_lang_old(text, lang):
    if lang == 'ZH' or lang == 'zh':
        multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
    elif lang == 'en':
        multi_lang_text_list = [{'lang':'en', 'text_split': text}]
    else:
        pattern = r'([a-zA-Z ,.\!\?\"]+|[\u4e00-\u9fa5 ,。\!\?\“\”]+)'
        text_split = re.findall(pattern, text)
        multi_lang_text_list = []
        for idx in range(len(text_split)):
            tmpts = text_split[idx]
            tmp_lang = langid.classify(tmpts)[0]
            multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
    return multi_lang_text_list

def text_split_lang_bk0724(text, lang):
    if lang == 'ZH' or lang == 'zh':
        multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
    elif lang == 'en':
        multi_lang_text_list = [{'lang':'en', 'text_split': text}]
    else:
        pretext_split = re.split("(\[.*?\])", text, re.I|re.M)
        multi_lang_text_list = []
        pretext_split = list(filter(None, pretext_split))
        for utext in pretext_split:
            if utext[0] != '[':
                pattern = r'([a-zA-Z ,.\!\?\"]+|[\u4e00-\u9fa5 ,。,.\t \!\?]+)'
                text_split = re.findall(pattern, utext)
                for idx in range(len(text_split)):
                    tmpts = text_split[idx]
                    tmp_lang = langid.classify(tmpts)[0]
                    if tmp_lang in ['zh', 'jp', 'ja']:
                        tmp_lang = 'zh'
                    else:
                        tmp_lang = 'en'
                    if not tmpts.isspace():
                        multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
            else:
                phones = utext[1:-1]
                multi_lang_text_list.append({'lang':'phone', 'text_split': phones})
    return multi_lang_text_list


def search_ele_mid(flaglist, tf = 'v'):
    nowidx = -1
    halflen = (len(flaglist))//2
    for gap in range(len(flaglist)-halflen):
        nowidx = halflen - gap
        if flaglist[nowidx]==tf:
            return nowidx
        nowidx = halflen + gap
        if flaglist[nowidx]==tf:
            return nowidx
    return nowidx

def add_pause(text, tf='v'):
    segment = jieba.posseg.cut(text.strip())
    wlist = []
    flist = []
    for x in segment:
        wlist.append(x.word)
        flist.append(x.flag)
    idx = search_ele_mid(flist, tf)
    if idx != len(flist)-1:
        wlist.insert(idx, sil1symbol)
    outtext = ''.join(wlist)
    return outtext

def has_punc(text):
    for char in text:
        if char in [',', '.', '!', '?', ',','。','?','!', sil1symbol]:
            return True
    return False

def text_split_lang(text, lang):
    if lang == 'ZH' or lang == 'zh':
        multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
    elif lang == 'en':
        multi_lang_text_list = [{'lang':'en', 'text_split': text}]
    else:
        pretext_split = re.split("(\[.*?\])", text, re.I|re.M)
        multi_lang_text_list = []
        pretext_split = list(filter(None, pretext_split))
        for utext in pretext_split:
            if utext[0] != '[':
                pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
                text_split = re.findall(pattern, utext)
                print(text_split)
                for idx in range(len(text_split)):
                    tmpts = text_split[idx]
                    tmp_lang = langid.classify(tmpts)[0]
                    if len(tmpts)>20:
                        if not has_punc(tmpts[:-1]):
                            tmpts = add_pause(tmpts, 'p')
                        if not has_punc(tmpts[:-1]):
                            tmpts = add_pause(tmpts, 'v')
                    if tmpts in special_phrase:
                        tmpts = tmpts+sil1symbol
                    if tmp_lang in ['zh', 'jp', 'ja']:
                        tmp_lang = 'zh'
                        tmpts = tmpts.replace(' ', sil1symbol)
                    else:
                        tmp_lang = 'en'
                    if not tmpts.isspace():
                        multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
            else:
                phones = utext[1:-1]
                multi_lang_text_list.append({'lang':'phone', 'text_split': phones})
    return multi_lang_text_list

def text_norm(text, lang):
    outtext = norm_func_dict[lang](text)
    return outtext

def g2p(text, lang):
    phoneme_list = g2p_func_dict[lang](text)
    return phoneme_list

def tokenizer(phoneme_list):
    #token_list = [all_phone_dict[pho] for pho in phoneme_list]
    token_list = [all_phone_dict[pho] if pho in all_phone_dict.keys() else all_phone_dict['sil'] for pho in phoneme_list]
    return token_list

def postprocess(phonelist):
    outlist = [xx if xx not in punctuation else 'sil' for xx in phonelist]
    return outlist

def postprocess_tts(phonelist):
    #outlist = ['sil', '<sp>']
    outlist = []
    print(phonelist)
    for idx in range(len(phonelist)):
        pm = phonelist[idx]
        if pm not in punctuation:
            outlist.append(pm)
        elif pm == sil1symbol:
            outlist.append('sil_1')
        else:
            #outlist.append('sil')
            outlist.append('sil_punc')
            #outlist.append('<sp>')
    #if outlist[-1] == 'sil':
    #    outlist.append('<sp>')
    #elif outlist[-2] != 'sil':
    #    outlist.append('sil')
    #    outlist.append('<sp>')
    if phonelist[-2] not in punctuation and outlist[-1].split('_')[0] != 'sil':
        #outlist.append('sil')
        outlist.append('sil_end')
        outlist.append('<sp>')
    return outlist

def text_process_old(text, lang, spflag=True):
    multi_lang_text_list = text_split_lang(text, lang)

    all_phone = []
    for text_split_dict in multi_lang_text_list:
        use_lang = text_split_dict['lang']
        if use_lang not in norm_func_dict.keys():
            use_lang = 'zh'
        use_text = text_split_dict['text_split']
        use_text = text_norm(use_text, use_lang)
        phone_list = g2p(use_text, use_lang)
        #all_phone.append('sil')
        all_phone.append('sil_lang')
        all_phone.append('<sp>')
        all_phone.extend(phone_list)
    #all_phone = postprocess(all_phone)
    all_phone = postprocess_tts(all_phone)
    if not spflag:
        while '<sp>' in all_phone:
            all_phone.remove('<sp>')
    return all_phone

def text_process(text, lang, spflag=True, use_lang='zh'):
    text = preprocess_first(text, use_lang=use_lang)

    multi_lang_text_list = text_split_lang(text, lang)

    all_phone = []
    for text_split_dict in multi_lang_text_list:
        use_lang = text_split_dict['lang']
        use_text = text_split_dict['text_split']
        if use_lang == 'phone':
            phonelist = use_text.split()
            all_phone.extend(phonelist)
        else:
            if use_lang not in norm_func_dict.keys():
                use_lang = 'zh'
            use_text = text_norm(use_text, use_lang)
            phone_list = g2p(use_text, use_lang)
            #all_phone.append('sil')
            all_phone.append('sil_lang')
            all_phone.append('<sp>')
            all_phone.extend(phone_list)
    #all_phone = postprocess(all_phone)
    all_phone = postprocess_tts(all_phone)
    if not spflag:
        while '<sp>' in all_phone:
            all_phone.remove('<sp>')
    return all_phone

def replace_sil2label_old(phones):
    phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
    phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
    outphones = []
    for ele in phones:
        if outphones == []:
            outphones.append(ele)
        else:
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                #outphones[-1] = 'sil_2'
                outphones[-1] = 'sil_1'
            else:
                outphones.append(ele)
    if outphones[-1].split('_')[0] == 'sil':
        outphones = outphones[:-1]
    return outphones

def replace_sil2label_0808(phones):
    #phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
    phones = ['' if xx == 'sil_lang' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
    phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
    phones = list(filter(None, phones))
    #outphones = []
    outphones = ['sil_1']
    for ele in phones:
        if outphones == []:
            outphones.append(ele)
        else:
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                #outphones[-1] = 'sil_2'
                outphones[-1] = 'sil_1'
            else:
                outphones.append(ele)
    if outphones[-1].split('_')[0] == 'sil':
        outphones = outphones[:-1]
    return outphones

def replace_sil2label(phones):
    #phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
    phones = ['' if xx == 'sil_lang' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
    phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
    phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
    phones = list(filter(None, phones))
    #outphones = []
    outphones = ['sil_1']
    for ele in phones:
        if outphones == []:
            outphones.append(ele)
        else:
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                outphones[-1] = 'sil_2'
                #outphones[-1] = 'sil_1'
            else:
                outphones.append(ele)
    #if outphones[-1].split('_')[0] == 'sil':
    #    outphones = outphones[:-1]
    return outphones


def text_process_asr(text, lang):
    multi_lang_text_list = text_split_lang(text, lang)

    all_phone = []
    for text_split_dict in multi_lang_text_list:
        use_lang = text_split_dict['lang']
        use_text = text_split_dict['text_split']
        use_text = text_norm(use_text, use_lang)
        phone_list = g2p(use_text, use_lang)
        phone_list_new = []
        for idx in range(len(phone_list)):
            tmpp = phone_list[idx]
            if tmpp != '<sp>':
                phone_list_new.append(tmpp)
        all_phone.extend(phone_list_new)
    all_phone = postprocess(all_phone)
    if all_phone[0] != 'sil':
        all_phone = ['sil'] + all_phone
    if all_phone[-1] != 'sil':
        all_phone = all_phone + ['sil']

    return all_phone
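For orientation, the sketch below shows how the pipeline defined in text_process.py might be driven end to end. It is not part of the package or its documentation: the sample sentence, the lang='mix' value (any value other than 'zh'/'en' falls through to per-segment language detection), and the assumption that 'sil', 'sil_1', and 'sil_2' appear in the phone inventory defined in the unshown symbols.py are illustrative guesses.

# Hypothetical usage sketch; assumes sigilyph 0.0.1 is installed and that the
# phone set built in sigilyph/core/symbols.py contains 'sil', 'sil_1' and 'sil_2'.
from sigilyph.core.text_process import text_process, replace_sil2label, tokenizer

# A lang value other than 'zh'/'en' routes the input through per-segment
# language detection (langid); bracketed spans like "[ ... ]" are passed
# through as raw phone strings instead of being normalized and run through g2p.
phones = text_process("今天天气不错, good morning.", lang='mix', spflag=True)

# Collapse the intermediate silence markers (sil_lang / sil_punc / sil_end)
# into sil_1 / sil_2 labels, then map each phone to its integer id;
# phones missing from all_phone_dict fall back to the id of 'sil'.
labels = replace_sil2label(phones)
tokens = tokenizer(labels)
print(labels)
print(tokens)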
sigilyph-0.0.1.dist-info/METADATA
@@ -0,0 +1,23 @@
Metadata-Version: 2.1
Name: sigilyph
Version: 0.0.1
Summary: Text Front for TTS
Home-page: https://github.com/yixiangchen1995/python-Sigilyph
Author: Yixiang Chen
Author-email: <yixiangchen1995@gmail.com>
License: MIT
Keywords: python,first package
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.10
Description-Content-Type: text/markdown
Requires-Dist: g2p-en
Requires-Dist: jieba
Requires-Dist: jieba-fast
Requires-Dist: pypinyin
Requires-Dist: WeTextProcessing ==1.0.3
Requires-Dist: langid

# python-Sigilyph
The TTS Text Frontend for the use of own
sigilyph-0.0.1.dist-info/RECORD
@@ -0,0 +1,13 @@
sigilyph/__init__.py,sha256=LzNFnxQwBgEn9J_Xigs2Al22xXisVlLW_J_Ujp3yx3E,238
sigilyph/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
sigilyph/core/g2p_func.py,sha256=ccwnMA9VgK0xTWKPVR1Pwb5Gop-CNH5_ipHH8SupNUk,1235
sigilyph/core/norm_func.py,sha256=K9NwNf8reiiUfcuKFhte9X5mTkZJm5YBnJB9vxeTY5o,2709
sigilyph/core/predict.py,sha256=Vn7A5TXONqhBe-lJ7uB6Kln9ia1zzYtNzJx5mq_KEBI,1810
sigilyph/core/preprocess.py,sha256=l55mqh6TK2cuFQhcl8ouys1-U9XfgNFwXLpFeKI17xk,316
sigilyph/core/sigilyph_class.py,sha256=wLmykl7qq_KBfuvWazVqku9WYL_XrhDr7n9dQ-pcGbc,7965
sigilyph/core/symbols.py,sha256=YRTynsUkLlnmGUvVT9yoowJTIx4APfA0Xta1KbfgokQ,53728
sigilyph/core/text_process.py,sha256=NAexBQxTD33pJegQwRPgvun-h6rcJc9Ikx8ZMlsIsNw,11734
sigilyph-0.0.1.dist-info/METADATA,sha256=rCVwCkLulNyvAN7rm8IX7SD5sKXMx4MF7w13HrGD-zc,686
sigilyph-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
sigilyph-0.0.1.dist-info/top_level.txt,sha256=caZwwDzakMbaNNk9MOK172HjSUj1HmJr3oK4iOGUyTo,9
sigilyph-0.0.1.dist-info/RECORD,,
sigilyph-0.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
sigilyph