sigilyph 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sigilyph might be problematic. Click here for more details.

sigilyph/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/__init__.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-05-13 11:01:07
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 16:39:15
9
+ '''
10
+
11
+ from sigilyph.core.sigilyph_class import Sigilyph
File without changes
@@ -0,0 +1,47 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/g2p_func.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-03-31 16:55:51
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 14:42:02
9
+ '''
10
+
11
from g2p_en import G2p

# Module-level singleton: constructing G2p loads model resources, so it is
# built once at import time and reused by g2p_en() below.
_g2p_en = G2p()
14
def g2p_en(text, sp_sign='<sp>'):
    """Convert English text to a phone list with the g2p_en model.

    Word-boundary spaces produced by the model are replaced with
    ``sp_sign``, and a trailing ``sp_sign`` is ensured for any output
    longer than one phone.
    """
    raw_phones = _g2p_en(text)
    phones = []
    for token in raw_phones:
        phones.append(sp_sign if token == " " else token)
    # Guarantee a closing pause marker on non-trivial outputs.
    if len(phones) > 1 and phones[-1] != sp_sign:
        phones.append(sp_sign)
    return phones
20
+
21
+
22
from pypinyin import lazy_pinyin, Style

from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict

# Make every punctuation mark map to itself in the pinyin->phone table so
# g2p_cn() can pass punctuation through unchanged.
for punc in punctuation:
    cn_word2phone_dict[punc] = punc
27
+
28
def g2p_cn(text):
    """Convert Chinese text to a phone list via pypinyin + the project's
    pinyin->phone table.

    Each syllable's phones are followed by a '<sp>' short-pause marker.
    NOTE(review): assumes cn_word2phone_dict values are iterables of phone
    strings — confirm against sigilyph.core.symbols.
    """
    # TONE3 puts the tone digit after the syllable; neutral tone becomes '5'.
    pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
    outlist = []
    for pp in pinyinlist:
        if pp in cn_word2phone_dict.keys():
            outlist.extend(cn_word2phone_dict[pp])
            outlist.append('<sp>')
        else:
            # Unknown syllable: fall back to a per-character lookup.
            # Raises KeyError for characters missing from the table.
            for ch in pp:
                outlist.extend(cn_word2phone_dict[ch])
                outlist.append('<sp>')
    # Drop a duplicated trailing "sil <sp>" pair, if present.
    if len(outlist) > 4:
        if outlist[-2] == 'sil' and outlist[-4] == 'sil':
            outlist = outlist[:-2]
    return outlist
43
+
44
+
45
+
46
+
47
+
@@ -0,0 +1,85 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/norm_func.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-03-31 17:50:26
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 15:42:55
9
+ '''
10
+
11
+
12
import re

from sigilyph.core.symbols import punctuation, punc_map_ch

from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

#zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
#en_tn_model = EnNormalizer()
# WeTextProcessing normalizers, built once at import time.
# NOTE(review): cache_dir is cwd-relative — this only works when the process
# is started from the project root; confirm or make it package-relative.
zh_tn_model = ZhNormalizer(cache_dir='./sigilyph/core/cache_dir', remove_erhua=False, full_to_half=False)
en_tn_model = EnNormalizer(cache_dir='./sigilyph/core/cache_dir')

import json
#import sys
#sys.path.append('text_front')
#with open('./special_dict.json', 'r', encoding="utf-8") as infi:
#with open('./text_front/special_dict.json', 'r', encoding="utf-8") as infi:
# Pronunciation-override table loaded at import time.
# NOTE(review): same cwd-relative path caveat as cache_dir above.
with open('./sigilyph/core/special_dict.json', 'r', encoding="utf-8") as infi:
    special_dict = json.load(infi)
31
+
32
def pro_norm(text, use_lang='zh'):
    """Run text normalization: the Chinese TN model for 'zh', the English
    model for anything else."""
    model = zh_tn_model if use_lang == 'zh' else en_tn_model
    return model.normalize(text)
40
+
41
+
42
def replace_with_dict(text, replace_dict):
    """Apply every old->new replacement sequentially, in dict insertion
    order (plain str.replace, so later rules see earlier rules' output)."""
    result = text
    for source, target in replace_dict.items():
        result = result.replace(source, target)
    return result
46
def replace_with_dict_re(text, replace_dict):
    """Single-pass replacement using one alternation regex.

    Unlike replace_with_dict(), each input position is replaced at most
    once, so replacement values are never themselves re-replaced.

    Args:
        text: input string.
        replace_dict: mapping of literal substrings to replacements.

    Returns:
        `text` with every occurrence of a key substituted.
    """
    # Bug fix: an empty dict would compile the empty pattern, which matches
    # at every position and then raises KeyError inside the callback.
    if not replace_dict:
        return text
    pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
    return pattern.sub(lambda m: replace_dict[m.group(0)], text)
49
+
50
# Hard-coded pre-normalization replacements used by the legacy entry point.
pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}

def preprocess_first_old(text, use_lang='zh'):
    """Legacy pipeline: pre-replace -> TN normalize -> special-word replace."""
    staged = replace_with_dict(text, pre_replace_dict)
    norm_text = pro_norm(staged, use_lang)
    print(norm_text)  # debug trace kept for behavioural parity
    return replace_with_dict(norm_text, special_dict)
57
+
58
def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
    """Pre-replacement -> text normalization -> special-word replacement.

    Args:
        text: raw input text.
        before_replace_dict: literal replacements applied before TN.
        special_word_dict: pronunciation/word overrides applied after TN.
        norm_use_lang: language passed to the text normalizer.
    """
    staged = replace_with_dict(text, before_replace_dict)
    normalized = pro_norm(staged, norm_use_lang)
    print(normalized)  # debug trace kept for behavioural parity
    return replace_with_dict(normalized, special_word_dict)
64
+
65
+
66
def normalizer(text):
    """Placeholder normalization hook; currently the identity function."""
    return text
68
+
69
def replace_punc(text):
    """Map punctuation through punc_map_ch, then drop every character that
    is neither a CJK ideograph nor whitelisted punctuation."""
    #text = text.replace("嗯", "恩").replace("呣", "母")
    mapping_pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
    mapped = mapping_pattern.sub(lambda x: punc_map_ch[x.group()], text)
    keep_chars = "".join(punctuation)
    return re.sub(r"[^\u4e00-\u9fa5" + keep_chars + r"]+", "", mapped)
77
+
78
def text_norm_cn(text):
    """Chinese text normalization: normalizer() then punctuation cleanup."""
    return replace_punc(normalizer(text))
82
+
83
def text_norm_en(text):
    """English text normalization; currently a no-op pass-through."""
    return text
@@ -0,0 +1,66 @@
1
+
2
+
3
+
4
# Literal substring replacements applied to raw text BEFORE normalization
# (consumed by preprocess_first in norm_func.py).
before_replace_dict = {
    "AlphaFold-Plus": "AlphaFold Plus"
}

# Phrases that get an extra short-pause symbol appended when they match a
# whole language segment (see Sigilyph.text_split_lang).
special_phrase = ['据了解']

# Pronunciation overrides applied after normalization. Values are phone
# sequences in bracket notation "[P1 P2 ...]" mixing ARPAbet-style English
# phones (e.g. AY1), pinyin initial/final+tone phones (e.g. sh ir2), and
# silence markers (sil / sil_1).
# NOTE(review): some punctuation keys appear in both ASCII and full-width
# variants ('(' vs '（', ':' vs '：') — confirm both forms are intended.
special_word_dict = {
    "iPhone": "[AY1 F OW0 N]",
    "IOS": "[AY1 OW1 AE1 S]",
    "A十七": "[EY1 sh ir2 q i1]",
    "A seventeen": "[EY1 S EH1 V AH0 N T IY1 N]",
    "CEO": "[S IY1 IY1 OW1]",
    "AI": "[EY1 AY1]",
    "ID": "[AY1 D IY1]",
    "ABC": "[EY1 B IY1 S IY1]",
    "VIP": "[V IY1 AY1 P IY1]",
    "PDF": "[P IY1 D IY1 AE1 F]",
    "NLP": "[EH1 NG EH2 L P IY1]",
    "API": "[EY1 P IY1 AY1]",
    "GPU": "[JH IY1 P IY1 Y UW1]",
    "WeChat": "[W IY1 CH AE1 T]",
    "PPT": "[P IY1 P IY1 T IY1]",
    "CA": "[S IY1 EY1]",
    ".com": "[d ian3 K AA1 M]",
    ".zhang": "[ZH AA1 NG]",
    "live": "[L AY0 V]",
    "@": "[sil_1 AE1 T sil_1]",
    "睡不着觉": "[sh ui4 b u4 zh e5 j iao4]",
    "月经不调": "[y ve4 j ing1 b u4 t iao2]",
    "长护险": "[ch ang2 h u4 x ian3]",
    "长时间": "[ch ang2 sh ir2 j ian1]",
    "长住外地": "[ch ang2 zh u4 w ai4 d i4]",
    "长按": "[ch ang2 AA an4]",
    "喉咙干疼": "[h ou2 l ong2 g an1 t eng2]",
    "死对头": "[s ii3 d ui4 t ou5]",
    "成名曲": "[ch eng2 m ing2 q v3]",
    "古朴": "[g u3 p u3]",
    "啊": "[AA a1]",
    "sinα": "[S AY1 N AH2 AE1 L F a3]",
    "cosα": "[K OW0 S AY1 N AH2 AE1 L F a3]",
    "tanα": "[T AE1 N JH AH0 N T AH2 AE1 L F a3]",
    "α": "[AE1 L F a3]",
    "Ⅰ": "[y i1]",
    "Ⅱ": "[EE er4]",
    "qq": "[K Y UW1 K Y UW1]",
    "≠": "[b u4 d eng3 y v2]",
    "Beijing": "[b ei3 j ing1]",
    "<sil>": "[sil_1]",
    "--": "[sil]",
    "=-=": "[sil]",
    ":": "[sil_1]",
    "-": "[sil]",
    "(": "[sil]",
    ")": "[sil]",
    "“": "[sil_1]",
    "”": "[sil_1]",
    "《": "[sil_1]",
    "》": "[sil_1]",
    "（": "[sil]",
    "）": "[sil]",
    "：": "[sil_1]",
    }
66
+
@@ -0,0 +1,16 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/preprocess.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-05-13 11:01:26
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-05-14 20:26:27
9
+ '''
10
+
11
def replace_proper(text, namedict):
    """Replace each proper-noun key in `namedict` with its value, applied
    sequentially in dict insertion order."""
    result = text
    for name, replacement in namedict.items():
        result = result.replace(name, replacement)
    return result
15
+
16
+
@@ -0,0 +1,215 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/sigilyph_class.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-08-12 14:42:50
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 15:41:33
9
+ '''
10
+
11
+ import langid
12
+ import re
13
+ import json
14
+
15
+ import jieba
16
+ import jieba.posseg
17
+
18
+ from sigilyph.core.g2p_func import g2p_en, g2p_cn
19
+ from sigilyph.core.norm_func import preprocess_first, text_norm_en, text_norm_cn
20
+ from sigilyph.core.symbols import punctuation
21
+ from sigilyph.core.predict import before_replace_dict, special_word_dict, special_phrase
22
+
23
# Per-language dispatch table for text normalization ('en'/'zh').
norm_func_dict = {
    'en': text_norm_en,
    'zh': text_norm_cn
}

# Per-language dispatch table for grapheme-to-phoneme conversion.
g2p_func_dict = {
    'en': g2p_en,
    'zh': g2p_cn
}
32
+
33
class Sigilyph:
    """TTS text front-end: normalization, language-aware splitting, G2P,
    and silence-token post-processing.

    text_process() emits intermediate markers — 'sil_lang' at language
    segment boundaries, 'sil_punc' for punctuation, 'sil_end' at utterance
    end, '<sp>' for short pauses — which replace_sil2label() folds into the
    final 'sil_1'/'sil_2' labels.
    """

    def __init__(self, before_dict_path=None, special_dict_path=None):
        """Optionally extend the built-in replacement tables from JSON files.

        Args:
            before_dict_path: JSON file with extra pre-normalization replacements.
            special_dict_path: JSON file with extra pronunciation overrides.
        """
        self.sil1symbol = '-'
        self.punctuation = punctuation

        # Bug fix: copy the module-level dicts so the updates below cannot
        # mutate shared module state for every other Sigilyph instance.
        self.before_replace_dict = dict(before_replace_dict)
        if before_dict_path:
            with open(before_dict_path, 'r', encoding="utf-8") as obdp:
                self.before_replace_dict.update(json.load(obdp))

        self.special_word_dict = dict(special_word_dict)
        if special_dict_path:
            with open(special_dict_path, 'r', encoding="utf-8") as obdp:
                self.special_word_dict.update(json.load(obdp))

        self.special_phrase = special_phrase

    def forward(self, text, lang):
        """Run the full pipeline: text -> final phone/silence label list."""
        phones = self.text_process(text, lang)
        phones = self.replace_sil2label(phones)
        return phones

    def text_process(self, text, lang, spflag=True, use_lang='zh'):
        """Normalize `text`, split it by language, and run G2P per segment.

        Args:
            text: raw input; may embed literal phone spans as "[P1 P2 ...]".
            lang: 'zh'/'ZH', 'en', or anything else for mixed auto-detection.
            spflag: when False, strip all '<sp>' tokens from the result.
            use_lang: kept for backward compatibility; overwritten per segment.

        Returns:
            List of phone / silence-marker strings.
        """
        # Bug fix: use the instance dict (which includes any user-supplied
        # JSON entries) instead of the module-level special_word_dict.
        text = preprocess_first(text, self.before_replace_dict,
                                self.special_word_dict, norm_use_lang='zh')

        multi_lang_text_list = self.text_split_lang(text, lang)

        all_phone = []
        for text_split_dict in multi_lang_text_list:
            use_lang = text_split_dict['lang']
            use_text = text_split_dict['text_split']
            if use_lang == 'phone':
                # Inline "[...]" span: phones are passed through verbatim.
                all_phone.extend(use_text.split())
            else:
                if use_lang not in norm_func_dict:
                    use_lang = 'zh'  # unknown languages default to Chinese
                use_text = self.text_norm(use_text, use_lang)
                phone_list = self.g2p(use_text, use_lang)
                all_phone.append('sil_lang')
                all_phone.append('<sp>')
                all_phone.extend(phone_list)
        all_phone = self.postprocess_tts(all_phone)
        if not spflag:
            all_phone = [p for p in all_phone if p != '<sp>']
        return all_phone

    ############### split text in line with lang ##############
    def text_split_lang(self, text, lang):
        """Split `text` into [{'lang': ..., 'text_split': ...}] segments.

        For lang 'zh'/'ZH' or 'en' the whole text is one segment; otherwise
        inline "[...]" phone spans are cut out and each remaining chunk is
        language-identified with langid.
        """
        if lang in ('ZH', 'zh'):
            return [{'lang': 'zh', 'text_split': text}]
        if lang == 'en':
            return [{'lang': 'en', 'text_split': text}]

        # Bug fix: re.split's third positional argument is maxsplit, not
        # flags; the original passed re.I|re.M (== 10) there by mistake.
        pretext_split = re.split(r"(\[.*?\])", text, flags=re.I | re.M)
        multi_lang_text_list = []
        for utext in filter(None, pretext_split):
            if utext[0] == '[':
                # Literal phone span: strip the brackets and pass through.
                multi_lang_text_list.append({'lang': 'phone', 'text_split': utext[1:-1]})
                continue
            pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
            for tmpts in re.findall(pattern, utext):
                tmp_lang = langid.classify(tmpts)[0]
                if len(tmpts) > 20:
                    # Long chunk with no internal punctuation: insert a
                    # pause near its middle — try a preposition, then a verb.
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'p')
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'v')
                if tmpts in self.special_phrase:
                    tmpts = tmpts + self.sil1symbol
                if tmp_lang in ['zh', 'jp', 'ja']:
                    tmp_lang = 'zh'
                    # In Chinese chunks, spaces become explicit short pauses.
                    tmpts = tmpts.replace(' ', self.sil1symbol)
                else:
                    tmp_lang = 'en'
                if not tmpts.isspace():
                    multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
        return multi_lang_text_list

    ########## add parse ###############
    def has_punc(self, text):
        """Return True if `text` contains an ASCII/CJK pause punctuation
        mark or the internal short-silence symbol."""
        for char in text:
            if char in [',', '.', '!', '?', ',', '。', '?', '!', self.sil1symbol]:
                return True
        return False

    def add_pause(self, text, tf='v'):
        """Insert the short-silence symbol before the jieba POS word with
        flag `tf` ('v' verb, 'p' preposition) closest to the middle of
        `text`; no-op when no such word is found."""
        wlist = []
        flist = []
        for seg in jieba.posseg.cut(text.strip()):
            wlist.append(seg.word)
            flist.append(seg.flag)
        idx = self.search_ele_mid(flist, tf)
        # search_ele_mid returns the last probed index (len-1) when nothing
        # matches; that value is treated here as "not found".
        if idx != len(flist) - 1:
            wlist.insert(idx, self.sil1symbol)
        return ''.join(wlist)

    def search_ele_mid(self, flaglist, tf='v'):
        """Return the index of the element equal to `tf` nearest the middle
        of `flaglist`, scanning outwards from the midpoint (the earlier
        index is checked first at each distance). When no element matches,
        returns the last probed index (len-1), or -1 for an empty list."""
        nowidx = -1
        halflen = len(flaglist) // 2
        for gap in range(len(flaglist) - halflen):
            nowidx = halflen - gap
            if flaglist[nowidx] == tf:
                return nowidx
            nowidx = halflen + gap
            if flaglist[nowidx] == tf:
                return nowidx
        return nowidx

    ######## text norm #########
    def text_norm(self, text, lang):
        """Dispatch to the per-language text normalizer."""
        return norm_func_dict[lang](text)

    ############ g2p ################
    def g2p(self, text, lang):
        """Dispatch to the per-language grapheme-to-phoneme converter."""
        return g2p_func_dict[lang](text)

    ############# post process #############
    def postprocess_tts(self, phonelist):
        """Replace punctuation phones with silence markers and ensure the
        sequence ends with a silence marker followed by '<sp>'."""
        outlist = []
        for pm in phonelist:
            if pm not in self.punctuation:
                outlist.append(pm)
            elif pm == self.sil1symbol:
                # NOTE(review): reached only if sil1symbol ('-') is listed
                # in self.punctuation — confirm against symbols.punctuation.
                outlist.append('sil_1')
            else:
                outlist.append('sil_punc')
        # Bug fix: guard short/empty sequences before indexing [-2]/[-1],
        # which raised IndexError in the original.
        if (len(phonelist) >= 2 and outlist
                and phonelist[-2] not in self.punctuation
                and outlist[-1].split('_')[0] != 'sil'):
            outlist.append('sil_end')
            outlist.append('<sp>')
        return outlist

    ########## replace silence token ###############
    def replace_sil2label(self, phones):
        """Map intermediate silence markers to final labels and collapse
        consecutive silences into a single 'sil_2'.

        Bug fix: the original definition omitted `self`, so the instance
        call in forward() raised TypeError.
        """
        relabel = {'sil_lang': '', 'sil_punc': 'sil_2', 'sil_end': 'sil_2', 'sil': 'sil_1'}
        phones = [relabel.get(xx, xx) for xx in phones]
        phones = [xx for xx in phones if xx]  # drop the emptied sil_lang slots
        outphones = ['sil_1']  # utterances always start with a short silence
        for ele in phones:
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                outphones[-1] = 'sil_2'  # merge adjacent silences
            else:
                outphones.append(ele)
        return outphones
212
+
213
+
214
+
215
+