sigilyph 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/sigilyph_class.py
3
+ Description:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-08-12 14:42:50
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2026-01-16 18:45:50
9
+ '''
10
+
11
+ import langid
12
+ import re
13
+ import json
14
+
15
+ import jieba
16
+ import jieba.posseg
17
+
18
+ from sigilyph.core.g2p_func import g2p_en, g2p_cn
19
+ from sigilyph.text_norm.norm_func import preprocess_first, post_process, text_norm_en, text_norm_cn
20
+ from sigilyph.core.symbols import punctuation
21
+ from sigilyph.core.predict import before_replace_dict, special_word_dict, special_phrase
22
+
23
# Text normalisers keyed by language code ('en' / 'zh').
norm_func_dict = dict(en=text_norm_en, zh=text_norm_cn)

# Grapheme-to-phoneme converters keyed by the same language codes.
g2p_func_dict = dict(en=g2p_en, zh=g2p_cn)
32
+
33
class Sigilyph:
    """Text front-end that converts raw text into a list of phone/silence tokens.

    The pipeline is: dictionary-based pre-replacement -> text normalisation ->
    post-processing -> language splitting -> per-language G2P -> silence
    token post-processing.  ``forward`` is the main entry point.
    """

    def __init__(self, before_dict_path=None, special_dict_path=None):
        """Set up replacement dictionaries, optionally extended from JSON files.

        Args:
            before_dict_path: optional path to a UTF-8 JSON object whose
                entries are merged into the "before replace" dictionary.
            special_dict_path: optional path to a UTF-8 JSON object whose
                entries are merged into the special-word dictionary.
        """
        self.sil1symbol = '-'
        self.punctuation = punctuation

        # Work on copies: updating the imported module-level dicts in place
        # would leak one instance's extra entries into every other instance.
        self.before_replace_dict = dict(before_replace_dict)
        if before_dict_path:
            with open(before_dict_path, 'r', encoding="utf-8") as fh:
                self.before_replace_dict.update(json.load(fh))

        self.special_word_dict = dict(special_word_dict)
        if special_dict_path:
            with open(special_dict_path, 'r', encoding="utf-8") as fh:
                self.special_word_dict.update(json.load(fh))

        self.special_phrase = special_phrase

    def forward(self, text, lang, spflag=False, norm_use_lang='zh'):
        """Run the full pipeline and return the final phone list.

        Equivalent to ``replace_sil2label(text_process(...))``.
        """
        phones = self.text_process(text, lang, spflag, norm_use_lang)
        return self.replace_sil2label(phones)

    def text_process(self, text, lang, spflag=False, norm_use_lang='zh'):
        """Normalise the whole text once, then split by language and run G2P.

        Args:
            text: input text; may contain ``[...]`` spans of literal phones.
            lang: ``'zh'``/``'ZH'``/``'en'`` to force one language, anything
                else to auto-split (see ``text_split_lang``).
            spflag: keep ``'<sp>'`` markers in the result when true.
            norm_use_lang: key into ``norm_func_dict`` used for normalisation.

        Returns:
            List of phone tokens with intermediate silence labels
            (``sil_lang``, ``sil_punc``, ``sil_end``, ``sil_1``).

        Raises:
            KeyError: if ``norm_use_lang`` is not a key of ``norm_func_dict``.
        """
        text = preprocess_first(text, self.before_replace_dict, self.special_word_dict, norm_use_lang=norm_use_lang)
        # Normalise the pre-processed text itself (the per-segment variable
        # does not exist yet at this point).
        text = self.text_norm(text, norm_use_lang)
        text = post_process(text, self.special_word_dict)

        segments = self.text_split_lang(text, lang)
        return self._segments_to_phones(segments, spflag, normalize=False)

    def text_process_old(self, text, lang, spflag=False, norm_use_lang='zh'):
        """Legacy variant: split by language first, then normalise each segment.

        Takes the same arguments and returns the same kind of list as
        ``text_process``; it skips the ``post_process`` step.
        """
        text = preprocess_first(text, self.before_replace_dict, self.special_word_dict, norm_use_lang=norm_use_lang)

        segments = self.text_split_lang(text, lang)
        return self._segments_to_phones(segments, spflag, normalize=True)

    def _segments_to_phones(self, segments, spflag, normalize):
        """Convert language-tagged segments into one post-processed phone list."""
        all_phone = []
        for segment in segments:
            use_lang = segment['lang']
            use_text = segment['text_split']
            if use_lang == 'phone':
                # Literal phones supplied inline: pass through untouched.
                all_phone.extend(use_text.split())
                continue
            if use_lang not in norm_func_dict:
                use_lang = 'zh'
            if normalize:
                use_text = self.text_norm(use_text, use_lang)
            phone_list = self.g2p(use_text, use_lang)
            # Every G2P segment is introduced by a language-boundary silence.
            all_phone.append('sil_lang')
            all_phone.append('<sp>')
            all_phone.extend(phone_list)
        all_phone = self.postprocess_tts(all_phone)
        if not spflag:
            all_phone = [phone for phone in all_phone if phone != '<sp>']
        return all_phone

    ############### split text in line with lang ##############
    def text_split_lang(self, text, lang):
        """Split ``text`` into ``{'lang': ..., 'text_split': ...}`` segments.

        With ``lang`` equal to ``'zh'``, ``'ZH'`` or ``'en'`` the whole text is
        returned as a single segment of that language.  Otherwise ``[...]``
        spans become ``'phone'`` segments (brackets stripped) and the rest is
        cut into letter/CJK runs that are classified with ``langid``.
        """
        if lang == 'ZH' or lang == 'zh':
            return [{'lang': 'zh', 'text_split': text}]
        if lang == 'en':
            return [{'lang': 'en', 'text_split': text}]

        pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
        multi_lang_text_list = []
        # No third positional argument here: re.split's third parameter is
        # maxsplit, so passing regex flags there silently caps the split count.
        for utext in filter(None, re.split(r"(\[.*?\])", text)):
            if utext[0] == '[':
                multi_lang_text_list.append({'lang': 'phone', 'text_split': utext[1:-1]})
                continue
            for tmpts in re.findall(pattern, utext):
                tmp_lang = langid.classify(tmpts)[0]
                if len(tmpts) > 20:
                    # Long run without any pause mark: try to insert one.
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'p')
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'v')
                if tmpts in self.special_phrase:
                    tmpts = tmpts + self.sil1symbol
                if tmp_lang in ['zh', 'jp', 'ja']:
                    tmp_lang = 'zh'
                    tmpts = tmpts.replace(' ', self.sil1symbol)
                else:
                    tmp_lang = 'en'
                if not tmpts.isspace():
                    multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
        return multi_lang_text_list

    ########## add pause ###############
    def has_punc(self, text):
        """Return True if ``text`` contains a pause mark or the short-pause symbol."""
        marks = {',', '.', '!', '?', '。', self.sil1symbol}
        return any(char in marks for char in text)

    def add_pause(self, text, tf='v'):
        """Insert the short-pause symbol before a word tagged ``tf`` near the middle.

        The stripped text is segmented with ``jieba.posseg``; the word index is
        chosen by ``search_ele_mid``.  Nothing is inserted when that index is
        the last word.  Returns the re-joined words.
        """
        wlist = []
        flist = []
        for pair in jieba.posseg.cut(text.strip()):
            wlist.append(pair.word)
            flist.append(pair.flag)
        idx = self.search_ele_mid(flist, tf)
        if idx != len(flist) - 1:
            wlist.insert(idx, self.sil1symbol)
        return ''.join(wlist)

    def search_ele_mid(self, flaglist, tf='v'):
        """Search outward from the middle of ``flaglist`` for an entry equal to ``tf``.

        Returns the index of the first match, otherwise the last index probed
        (``-1`` for an empty list).
        """
        nowidx = -1
        halflen = len(flaglist) // 2
        for gap in range(len(flaglist) - halflen):
            # Probe left of the midpoint first, then right, widening each step.
            for nowidx in (halflen - gap, halflen + gap):
                if flaglist[nowidx] == tf:
                    return nowidx
        return nowidx

    ######## text norm #########
    def text_norm(self, text, lang):
        """Normalise ``text`` with the function registered for ``lang``.

        Raises:
            KeyError: if ``lang`` is not a key of ``norm_func_dict``.
        """
        return norm_func_dict[lang](text)

    ############ g2p ################
    def g2p(self, text, lang):
        """Convert ``text`` to phonemes with the function registered for ``lang``.

        Raises:
            KeyError: if ``lang`` is not a key of ``g2p_func_dict``.
        """
        return g2p_func_dict[lang](text)

    ############# post process #############
    def postprocess_tts(self, phonelist):
        """Replace punctuation tokens with silence labels and close the utterance.

        Tokens in ``self.punctuation`` become ``'sil_1'`` (the short-pause
        symbol) or ``'sil_punc'`` (anything else).  ``'sil_end'`` is appended
        unless the second-to-last input token is punctuation or the output
        already ends in a ``sil*`` token; ``'<sp>'`` is always appended.
        Empty and single-token inputs are handled without indexing errors.
        """
        outlist = []
        for pm in phonelist:
            if pm not in self.punctuation:
                outlist.append(pm)
            elif pm == self.sil1symbol:
                outlist.append('sil_1')
            else:
                outlist.append('sil_punc')

        # Guard the look-behinds so short inputs do not raise IndexError.
        penultimate_is_punc = len(phonelist) >= 2 and phonelist[-2] in self.punctuation
        ends_with_sil = bool(outlist) and outlist[-1].split('_')[0] == 'sil'
        if not penultimate_is_punc and not ends_with_sil:
            outlist.append('sil_end')
        outlist.append('<sp>')
        return outlist

    ########## replace silence token ###############
    def replace_sil2label(self, phones):
        """Map intermediate silence labels to final ones and merge adjacent silences.

        ``sil_lang`` is dropped, ``sil_punc``/``sil_end`` become ``sil_2`` and
        ``sil`` becomes ``sil_1``.  The result always starts with a silence
        token; a run of consecutive ``sil*`` tokens collapses into one
        ``sil_2``.
        """
        relabel = {'sil_lang': '', 'sil_punc': 'sil_2', 'sil_end': 'sil_2', 'sil': 'sil_1'}
        mapped = [relabel.get(phone, phone) for phone in phones]

        outphones = ['sil_1']
        for ele in mapped:
            if not ele:
                # Dropped label (or otherwise empty token).
                continue
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                outphones[-1] = 'sil_2'
            else:
                outphones.append(ele)
        return outphones
243
+
244
+
245
+
246
+
@@ -0,0 +1,26 @@
1
+ {
2
+ "甄嬛传": "[zh en1 h uan2 zh uan4]",
3
+ "藏海传": "[z ang4 h ai3 zh uan4]",
4
+ "藏海": "[z ang4 h ai3]",
5
+ "井柏然": "[j ing3 b o2 r an2]",
6
+ "Alibaba": "[AA a1 l i3 b a1 b a1]",
7
+ "TFBOYS": "[T IY1 EH1 F B OY1 Z]",
8
+ "肖战": "[x iao1 zh an4]",
9
+ "肖申克": "[x iao1 sh en1 k e4]",
10
+ "肖像": "[x iao1 x iang4]",
11
+ "戛纳": "[g a1 n a4]",
12
+ "爆肚": "[b ao4 d u3 EE]",
13
+ "花呗": "[h ua1 b ei4]",
14
+ "肚儿": "[d u3 er5]",
15
+ "种了": "[zh ong4 l e5]",
16
+ "仇老五": "[q iu2 l ao3 w u3]",
17
+ "彩云曲": "[c ai3 y vn2 q v3]",
18
+ "雷洛传": "[l ei2 l uo4 zh uan4]",
19
+ "朝雪录": "[zh ao1 x ve3 l u4]",
20
+ "凡人修仙传": "[f an2 r en2 x iu1 x ian1 zh uan4]",
21
+ "+86": "[b a1 l iu]",
22
+ "(正八十六)": "[b a1 l iu]",
23
+ "八 a.m.": "[sh ang4 w u3 b a1 d ian3]",
24
+ "F.I.R.": "[EH1 F sil_1 AY2 sil_1 AA0 R sil_1]",
25
+ "S.C.I": "[sil_2 EH2 S sil_1 S IY1 sil_1 AY1 sil_1]"
26
+ }