sigilyph 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sigilyph might be problematic.

@@ -0,0 +1,327 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/text_process.py
+ Descripttion:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 16:31:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-08-12 14:40:42
+ '''
+
+
+ import langid
+ import re
+
+ import jieba
+ import jieba.posseg
+
+ from sigilyph.core.g2p_func import g2p_en, g2p_cn
+ from sigilyph.core.norm_func import preprocess_first, text_norm_en, text_norm_cn
+
+ from sigilyph.core.symbols import base_phone_set, cn_phone_set, en_phone_set, punctuation, special_phrase
+
+ #all_phone_set = [] + sorted(set(base_phone_set + cn_phone_set + en_phone_set))
+ #all_phone_set = [] + list(set(base_phone_set)) + list(set(cn_phone_set + en_phone_set))
+ all_phone_set = [] + sorted(set(base_phone_set)) + sorted(set(cn_phone_set)) + sorted(set(en_phone_set))
+ all_phone_dict = {xx:idx for idx, xx in enumerate(all_phone_set)}
+
+ norm_func_dict = {
+     'en': text_norm_en,
+     'zh': text_norm_cn
+ }
+
+ g2p_func_dict = {
+     'en': g2p_en,
+     'zh': g2p_cn
+ }
+
+ sil1symbol='-'
+
+ def text_split_lang_old(text, lang):
+     if lang == 'ZH' or lang == 'zh':
+         multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
+     elif lang == 'en':
+         multi_lang_text_list = [{'lang':'en', 'text_split': text}]
+     else:
+         pattern = r'([a-zA-Z ,.\!\?\"]+|[\u4e00-\u9fa5 ,。\!\?\“\”]+)'
+         text_split = re.findall(pattern, text)
+         multi_lang_text_list = []
+         for idx in range(len(text_split)):
+             tmpts = text_split[idx]
+             tmp_lang = langid.classify(tmpts)[0]
+             multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
+     return multi_lang_text_list
+
+ def text_split_lang_bk0724(text, lang):
+     if lang == 'ZH' or lang == 'zh':
+         multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
+     elif lang == 'en':
+         multi_lang_text_list = [{'lang':'en', 'text_split': text}]
+     else:
+         pretext_split = re.split("(\[.*?\])", text, re.I|re.M)
+         multi_lang_text_list = []
+         pretext_split = list(filter(None, pretext_split))
+         for utext in pretext_split:
+             if utext[0] != '[':
+                 pattern = r'([a-zA-Z ,.\!\?\"]+|[\u4e00-\u9fa5 ,。,.\t \!\?]+)'
+                 text_split = re.findall(pattern, utext)
+                 for idx in range(len(text_split)):
+                     tmpts = text_split[idx]
+                     tmp_lang = langid.classify(tmpts)[0]
+                     if tmp_lang in ['zh', 'jp', 'ja']:
+                         tmp_lang = 'zh'
+                     else:
+                         tmp_lang = 'en'
+                     if not tmpts.isspace():
+                         multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
+             else:
+                 phones = utext[1:-1]
+                 multi_lang_text_list.append({'lang':'phone', 'text_split': phones})
+     return multi_lang_text_list
+
+
+ def search_ele_mid(flaglist, tf = 'v'):
+     nowidx = -1
+     halflen = (len(flaglist))//2
+     for gap in range(len(flaglist)-halflen):
+         nowidx = halflen - gap
+         if flaglist[nowidx]==tf:
+             return nowidx
+         nowidx = halflen + gap
+         if flaglist[nowidx]==tf:
+             return nowidx
+     return nowidx
+
+ def add_pause(text, tf='v'):
+     segment = jieba.posseg.cut(text.strip())
+     wlist = []
+     flist = []
+     for x in segment:
+         wlist.append(x.word)
+         flist.append(x.flag)
+     idx = search_ele_mid(flist, tf)
+     if idx != len(flist)-1:
+         wlist.insert(idx, sil1symbol)
+     outtext = ''.join(wlist)
+     return outtext
+
+ def has_punc(text):
+     for char in text:
+         if char in [',', '.', '!', '?', ',','。','?','!', sil1symbol]:
+             return True
+     return False
+
+ def text_split_lang(text, lang):
+     if lang == 'ZH' or lang == 'zh':
+         multi_lang_text_list = [{'lang':'zh', 'text_split': text}]
+     elif lang == 'en':
+         multi_lang_text_list = [{'lang':'en', 'text_split': text}]
+     else:
+         pretext_split = re.split("(\[.*?\])", text, re.I|re.M)
+         multi_lang_text_list = []
+         pretext_split = list(filter(None, pretext_split))
+         for utext in pretext_split:
+             if utext[0] != '[':
+                 pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
+                 text_split = re.findall(pattern, utext)
+                 print(text_split)
+                 for idx in range(len(text_split)):
+                     tmpts = text_split[idx]
+                     tmp_lang = langid.classify(tmpts)[0]
+                     if len(tmpts)>20:
+                         if not has_punc(tmpts[:-1]):
+                             tmpts = add_pause(tmpts, 'p')
+                         if not has_punc(tmpts[:-1]):
+                             tmpts = add_pause(tmpts, 'v')
+                     if tmpts in special_phrase:
+                         tmpts = tmpts+sil1symbol
+                     if tmp_lang in ['zh', 'jp', 'ja']:
+                         tmp_lang = 'zh'
+                         tmpts = tmpts.replace(' ', sil1symbol)
+                     else:
+                         tmp_lang = 'en'
+                     if not tmpts.isspace():
+                         multi_lang_text_list.append({'lang':tmp_lang, 'text_split': tmpts})
+             else:
+                 phones = utext[1:-1]
+                 multi_lang_text_list.append({'lang':'phone', 'text_split': phones})
+     return multi_lang_text_list
+
+ def text_norm(text, lang):
+     outtext = norm_func_dict[lang](text)
+     return outtext
+
+ def g2p(text, lang):
+     phoneme_list = g2p_func_dict[lang](text)
+     return phoneme_list
+
+ def tokenizer(phoneme_list):
+     #token_list = [all_phone_dict[pho] for pho in phoneme_list]
+     token_list = [all_phone_dict[pho] if pho in all_phone_dict.keys() else all_phone_dict['sil'] for pho in phoneme_list]
+     return token_list
+
+ def postprocess(phonelist):
+     outlist = [xx if xx not in punctuation else 'sil' for xx in phonelist]
+     return outlist
+
+ def postprocess_tts(phonelist):
+     #outlist = ['sil', '<sp>']
+     outlist = []
+     print(phonelist)
+     for idx in range(len(phonelist)):
+         pm = phonelist[idx]
+         if pm not in punctuation:
+             outlist.append(pm)
+         elif pm == sil1symbol:
+             outlist.append('sil_1')
+         else:
+             #outlist.append('sil')
+             outlist.append('sil_punc')
+             #outlist.append('<sp>')
+         #if outlist[-1] == 'sil':
+         #    outlist.append('<sp>')
+         #elif outlist[-2] != 'sil':
+         #    outlist.append('sil')
+         #    outlist.append('<sp>')
+     if phonelist[-2] not in punctuation and outlist[-1].split('_')[0] != 'sil':
+         #outlist.append('sil')
+         outlist.append('sil_end')
+     outlist.append('<sp>')
+     return outlist
+
+ def text_process_old(text, lang, spflag=True):
+     multi_lang_text_list = text_split_lang(text, lang)
+
+     all_phone = []
+     for text_split_dict in multi_lang_text_list:
+         use_lang = text_split_dict['lang']
+         if use_lang not in norm_func_dict.keys():
+             use_lang = 'zh'
+         use_text = text_split_dict['text_split']
+         use_text = text_norm(use_text, use_lang)
+         phone_list = g2p(use_text, use_lang)
+         #all_phone.append('sil')
+         all_phone.append('sil_lang')
+         all_phone.append('<sp>')
+         all_phone.extend(phone_list)
+     #all_phone = postprocess(all_phone)
+     all_phone = postprocess_tts(all_phone)
+     if not spflag:
+         while '<sp>' in all_phone:
+             all_phone.remove('<sp>')
+     return all_phone
+
+ def text_process(text, lang, spflag=True, use_lang='zh'):
+     text = preprocess_first(text, use_lang=use_lang)
+
+     multi_lang_text_list = text_split_lang(text, lang)
+
+     all_phone = []
+     for text_split_dict in multi_lang_text_list:
+         use_lang = text_split_dict['lang']
+         use_text = text_split_dict['text_split']
+         if use_lang == 'phone':
+             phonelist = use_text.split()
+             all_phone.extend(phonelist)
+         else:
+             if use_lang not in norm_func_dict.keys():
+                 use_lang = 'zh'
+             use_text = text_norm(use_text, use_lang)
+             phone_list = g2p(use_text, use_lang)
+             #all_phone.append('sil')
+             all_phone.append('sil_lang')
+             all_phone.append('<sp>')
+             all_phone.extend(phone_list)
+     #all_phone = postprocess(all_phone)
+     all_phone = postprocess_tts(all_phone)
+     if not spflag:
+         while '<sp>' in all_phone:
+             all_phone.remove('<sp>')
+     return all_phone
+
+ def replace_sil2label_old(phones):
+     phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
+     phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
+     outphones = []
+     for ele in phones:
+         if outphones == []:
+             outphones.append(ele)
+         else:
+             if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
+                 #outphones[-1] = 'sil_2'
+                 outphones[-1] = 'sil_1'
+             else:
+                 outphones.append(ele)
+     if outphones[-1].split('_')[0] == 'sil':
+         outphones = outphones[:-1]
+     return outphones
+
+ def replace_sil2label_0808(phones):
+     #phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
+     phones = ['' if xx == 'sil_lang' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
+     phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
+     phones = list(filter(None, phones))
+     #outphones = []
+     outphones = ['sil_1']
+     for ele in phones:
+         if outphones == []:
+             outphones.append(ele)
+         else:
+             if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
+                 #outphones[-1] = 'sil_2'
+                 outphones[-1] = 'sil_1'
+             else:
+                 outphones.append(ele)
+     if outphones[-1].split('_')[0] == 'sil':
+         outphones = outphones[:-1]
+     return outphones
+
+ def replace_sil2label(phones):
+     #phones = ['sil_1' if xx == 'sil_lang' else xx for xx in phones]
+     phones = ['' if xx == 'sil_lang' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_punc' else xx for xx in phones]
+     phones = ['sil_2' if xx == 'sil_end' else xx for xx in phones]
+     phones = ['sil_1' if xx == 'sil' else xx for xx in phones]
+     phones = list(filter(None, phones))
+     #outphones = []
+     outphones = ['sil_1']
+     for ele in phones:
+         if outphones == []:
+             outphones.append(ele)
+         else:
+             if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
+                 outphones[-1] = 'sil_2'
+                 #outphones[-1] = 'sil_1'
+             else:
+                 outphones.append(ele)
+     #if outphones[-1].split('_')[0] == 'sil':
+     #    outphones = outphones[:-1]
+     return outphones
+
+
+ def text_process_asr(text, lang):
+     multi_lang_text_list = text_split_lang(text, lang)
+
+     all_phone = []
+     for text_split_dict in multi_lang_text_list:
+         use_lang = text_split_dict['lang']
+         use_text = text_split_dict['text_split']
+         use_text = text_norm(use_text, use_lang)
+         phone_list = g2p(use_text, use_lang)
+         phone_list_new = []
+         for idx in range(len(phone_list)):
+             tmpp = phone_list[idx]
+             if tmpp != '<sp>':
+                 phone_list_new.append(tmpp)
+         all_phone.extend(phone_list_new)
+     all_phone = postprocess(all_phone)
+     if all_phone[0] != 'sil':
+         all_phone = ['sil'] + all_phone
+     if all_phone[-1] != 'sil':
+         all_phone = all_phone + ['sil']
+
+     return all_phone
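
Taken together, text_process chains preprocess_first, text_split_lang, per-segment normalization (text_norm) and grapheme-to-phoneme conversion (g2p), then postprocess_tts; replace_sil2label and tokenizer map the resulting silence markers and phones to integer ids. The sketch below is not part of the wheel: it assumes the package and its dependencies (jieba, langid, g2p-en, pypinyin, WeTextProcessing) import cleanly, that any lang value other than 'zh'/'en' (here the arbitrary 'mix') selects the mixed-language split path, and that 'sil' and '<sp>' appear in the phone inventory defined in sigilyph.core.symbols.

    # Hypothetical usage of sigilyph.core.text_process; not shipped with the package.
    from sigilyph.core.text_process import text_process, replace_sil2label, tokenizer

    # Any lang other than 'zh'/'en' triggers the per-segment language split path.
    phones = text_process("今天天气不错, today is a good day.", lang='mix')
    labeled = replace_sil2label(phones)   # map sil_lang/sil_punc/sil_end to sil_1/sil_2
    token_ids = tokenizer(labeled)        # phone strings -> ids via all_phone_dict ('sil' fallback)
    print(labeled)
    print(token_ids)
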
@@ -0,0 +1,23 @@
+ Metadata-Version: 2.1
+ Name: sigilyph
+ Version: 0.0.1
+ Summary: Text Front for TTS
+ Home-page: https://github.com/yixiangchen1995/python-Sigilyph
+ Author: Yixiang Chen
+ Author-email: <yixiangchen1995@gmail.com>
+ License: MIT
+ Keywords: python,first package
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: g2p-en
+ Requires-Dist: jieba
+ Requires-Dist: jieba-fast
+ Requires-Dist: pypinyin
+ Requires-Dist: WeTextProcessing ==1.0.3
+ Requires-Dist: langid
+
+ # python-Sigilyph
+ The TTS Text Frontend for the use of own
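
The metadata pins WeTextProcessing to 1.0.3 and leaves the other dependency versions open. A hedged post-install check, assuming the wheel has been installed into the current environment, echoes the fields recorded above:

    # Post-install sanity check; assumes sigilyph 0.0.1 is installed (e.g. via pip).
    from importlib.metadata import version, requires

    print(version("sigilyph"))   # expected: 0.0.1
    print(requires("sigilyph"))  # expected to echo the Requires-Dist entries above
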
@@ -0,0 +1,13 @@
+ sigilyph/__init__.py,sha256=LzNFnxQwBgEn9J_Xigs2Al22xXisVlLW_J_Ujp3yx3E,238
+ sigilyph/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sigilyph/core/g2p_func.py,sha256=ccwnMA9VgK0xTWKPVR1Pwb5Gop-CNH5_ipHH8SupNUk,1235
+ sigilyph/core/norm_func.py,sha256=K9NwNf8reiiUfcuKFhte9X5mTkZJm5YBnJB9vxeTY5o,2709
+ sigilyph/core/predict.py,sha256=Vn7A5TXONqhBe-lJ7uB6Kln9ia1zzYtNzJx5mq_KEBI,1810
+ sigilyph/core/preprocess.py,sha256=l55mqh6TK2cuFQhcl8ouys1-U9XfgNFwXLpFeKI17xk,316
+ sigilyph/core/sigilyph_class.py,sha256=wLmykl7qq_KBfuvWazVqku9WYL_XrhDr7n9dQ-pcGbc,7965
+ sigilyph/core/symbols.py,sha256=YRTynsUkLlnmGUvVT9yoowJTIx4APfA0Xta1KbfgokQ,53728
+ sigilyph/core/text_process.py,sha256=NAexBQxTD33pJegQwRPgvun-h6rcJc9Ikx8ZMlsIsNw,11734
+ sigilyph-0.0.1.dist-info/METADATA,sha256=rCVwCkLulNyvAN7rm8IX7SD5sKXMx4MF7w13HrGD-zc,686
+ sigilyph-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ sigilyph-0.0.1.dist-info/top_level.txt,sha256=caZwwDzakMbaNNk9MOK172HjSUj1HmJr3oK4iOGUyTo,9
+ sigilyph-0.0.1.dist-info/RECORD,,
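
Each RECORD row is `path,sha256=<digest>,size`, where the digest is a SHA-256 hash encoded as URL-safe base64 with the trailing padding stripped. A small sketch for recomputing a digest to compare against a row above; the path is illustrative and assumes an unpacked copy of the wheel in the working directory:

    import base64
    import hashlib
    from pathlib import Path

    def record_digest(path):
        # SHA-256, URL-safe base64, '=' padding stripped -- the encoding RECORD files use
        digest = hashlib.sha256(Path(path).read_bytes()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

    print(record_digest("sigilyph/core/text_process.py"))  # compare with the RECORD row above
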
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: bdist_wheel (0.43.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1 @@
+ sigilyph