sigilyph 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigilyph/__init__.py ADDED
@@ -0,0 +1,18 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/__init__.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:07
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-16 18:48:51
+ '''
+
+ from sigilyph.core.sigilyph_class import Sigilyph
+
+ from sigilyph.core.symbols import all_phone_dict
+
+ from sigilyph.core.bert_align import AlignBert
+
+ from sigilyph.text_norm.sigilyph_norm import SigilyphNormalizer
+
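The four names above are the package's public entry points. A minimal import sketch (hedged: only AlignBert's constructor appears later in this diff, so the arguments below are illustrative, and Sigilyph/SigilyphNormalizer construction is not shown):

    from sigilyph import Sigilyph, AlignBert, SigilyphNormalizer, all_phone_dict

    # AlignBert's signature is defined in core/bert_align.py below;
    # both paths here are hypothetical.
    aligner = AlignBert(tkn_cache_dir="./tmp/", vocab_file="./tmp/vocab.txt")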
sigilyph/core/bert_align.py ADDED
@@ -0,0 +1,163 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/bert_align.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-09-24 15:13:38
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-16 18:53:54
+ '''
+
+ import torch
+ from transformers import BertTokenizer, BertModel
+
+ import langid
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.text_norm.norm_func import text_norm_cn
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ # Let punctuation marks pass through g2p unchanged.
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def _g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict:
+             outlist.extend(cn_word2phone_dict[pp])
+         else:
+             for ch in pp:
+                 if ch in cn_word2phone_dict:
+                     outlist.extend(cn_word2phone_dict[ch])
+                 else:
+                     # append, not extend: extend('sil') would add 's', 'i', 'l'
+                     outlist.append('sil')
+     return outlist
+
+ def g2p_word(word):
+     tmp_lang = langid.classify(word)[0]
+     if tmp_lang in ['zh', 'jp', 'ja']:
+         tmp_lang = 'zh'
+     else:
+         tmp_lang = 'en'
+     if tmp_lang == 'zh':
+         return _g2p_cn(word)
+     else:
+         return _g2p_en(word)
+
+ class AlignBert():
+     def __init__(self, tkn_cache_dir, vocab_file):
+         self.vocab_dict = self.load_vocab(vocab_file)
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir=tkn_cache_dir)
+         self.g2p_word = g2p_word
+         self.symbol_list = ['[CLS]', '[SEP]', '?', '!', '.', ',', ',', '。']
+         self.empty_bert = torch.zeros([768])
+
+     def gen_seqbert(self, mfa_phones, text, bert):
+         norm_text = text_norm_cn(text)
+         midph_list, midph2bertid_dict = self.get_midph(norm_text)
+         phoneme2midph_dict = self.get_phoneme2midph_dict(mfa_phones, midph_list)
+         phoneme2bertid_dict = self.get_phoneme2bertid_dict(midph2bertid_dict, phoneme2midph_dict)
+         seqbert = []
+         for idx in range(len(phoneme2bertid_dict)):
+             bertid = phoneme2bertid_dict[idx]
+             if bertid >= 0:
+                 seqbert.append(bert[bertid])
+             else:
+                 seqbert.append(self.empty_bert)
+         seqbert = torch.stack(seqbert)
+         return seqbert
+
+     def load_vocab(self, vocab_file):
+         vocab_dict = {}
+         with open(vocab_file, 'r') as ovf:
+             lines = ovf.readlines()
+         for idx in range(len(lines)):
+             vocab_dict[idx] = lines[idx].strip()
+         return vocab_dict
+
+     def id2text(self, idlist):
+         outlist = []
+         for idx in range(len(idlist)):
+             outlist.append(self.vocab_dict[int(idlist[idx])])
+         return outlist
+
+     def get_midph(self, text):
+         encoded_input = self.tokenizer(text, return_tensors='pt')
+         ret = self.id2text(encoded_input['input_ids'][0])
+         wordph_list = []
+         for word in ret[1:-1]:
+             word_phoneme = self.g2p_word(word)
+             wordph_list.append(word_phoneme)
+         # midph_list interleaves [CLS], each token's phonemes, and [SEP];
+         # midph2bertid_dict maps every position back to its BERT token index.
+         midph_list = []
+         midph2bertid_dict = {}
+         midph_list.append(ret[0])
+         midph2bertid_dict[0] = 0
+         for widx in range(len(wordph_list)):
+             for phidx in range(len(wordph_list[widx])):
+                 phoneme = wordph_list[widx][phidx]
+                 midph_list.append(phoneme)
+                 midph2bertid_dict[len(midph_list)-1] = widx + 1
+         midph_list.append(ret[-1])
+         midph2bertid_dict[len(midph_list)-1] = len(ret) - 1
+         return midph_list, midph2bertid_dict
+
+     def get_phoneme2midph_dict(self, mfa_phones, midph_list):
+         fixed_midph_list = []
+         for idx in range(len(midph_list)):
+             if midph_list[idx] in self.symbol_list:
+                 fixed_midph_list.append('sil')
+             else:
+                 fixed_midph_list.append(midph_list[idx])
+         phoneme2midph_dict = self.lcs(mfa_phones, fixed_midph_list)
+         return phoneme2midph_dict
+
+     def lcs(self, mfa_phones, midph_list):
+         # Longest common subsequence between the MFA phones and the
+         # tokenizer-derived phones; unmatched MFA phones stay at -1.
+         n, m = len(midph_list), len(mfa_phones)
+         dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+         phoneme2midph_dict = {}
+         for idx in range(1, m + 1):
+             phoneme2midph_dict[idx-1] = -1
+
+         for idx in range(1, m + 1):
+             for midph_id in range(1, n + 1):
+                 curr_ph = mfa_phones[idx-1]
+                 if curr_ph == midph_list[midph_id-1]:
+                     dp[idx][midph_id] = dp[idx-1][midph_id-1] + 1
+                 else:
+                     dp[idx][midph_id] = max(dp[idx-1][midph_id], dp[idx][midph_id-1])
+
+         # Backtrack through the dp table to recover the matched positions.
+         n, m = len(midph_list), len(mfa_phones)
+         while m > 0 and n > 0:
+             if mfa_phones[m-1] == midph_list[n-1] and dp[m][n] == dp[m-1][n-1] + 1:
+                 phoneme2midph_dict[m-1] = n-1
+                 m, n = m-1, n-1
+                 continue
+             if dp[m][n] == dp[m-1][n]:
+                 m, n = m-1, n
+                 continue
+             if dp[m][n] == dp[m][n-1]:
+                 m, n = m, n-1
+                 continue
+         return phoneme2midph_dict
+
+     def get_phoneme2bertid_dict(self, midph2bertid_dict, phoneme2midph_dict):
+         phoneme2bertid_dict = {}
+         for idx in range(len(phoneme2midph_dict)):
+             midph_id = phoneme2midph_dict[idx]
+             if midph_id == -1:
+                 phoneme2bertid_dict[idx] = -1
+             else:
+                 phoneme2bertid_dict[idx] = midph2bertid_dict[midph_id]
+         return phoneme2bertid_dict
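gen_seqbert aligns an MFA phone sequence to BERT token embeddings and returns one 768-dim vector per phoneme, zeros where no token matches. A hedged usage sketch (paths and the phone list are illustrative; the checkpoint matches the tokenizer hard-coded above):

    import torch
    from transformers import BertModel
    from sigilyph import AlignBert
    from sigilyph.text_norm.norm_func import text_norm_cn

    aligner = AlignBert(tkn_cache_dir="./tmp/", vocab_file="./tmp/vocab.txt")  # hypothetical paths
    model = BertModel.from_pretrained('bert-base-multilingual-cased', cache_dir="./tmp/")

    text = "今天天气很好。"
    # Tokenize the normalized text so the embeddings line up with what
    # gen_seqbert tokenizes internally.
    encoded = aligner.tokenizer(text_norm_cn(text), return_tensors='pt')
    with torch.no_grad():
        bert = model(**encoded).last_hidden_state[0]       # [num_tokens, 768]

    mfa_phones = ['j', 'in1', 't', 'ian1', 'sil']          # illustrative MFA phone sequence
    seqbert = aligner.gen_seqbert(mfa_phones, text, bert)  # [len(mfa_phones), 768]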
sigilyph/core/g2p_func.py ADDED
@@ -0,0 +1,47 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/g2p_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 16:55:51
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-08-12 14:42:02
+ '''
+
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ def g2p_en(text, sp_sign='<sp>'):
+     phone_list = _g2p_en(text)
+     phone_list = [sp_sign if xx == " " else xx for xx in phone_list]
+     if len(phone_list) > 1 and phone_list[-1] != sp_sign:
+         phone_list.append(sp_sign)
+     return phone_list
+
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict:
+             outlist.extend(cn_word2phone_dict[pp])
+             outlist.append('<sp>')
+         else:
+             for ch in pp:
+                 if ch in cn_word2phone_dict:
+                     outlist.extend(cn_word2phone_dict[ch])
+                 else:
+                     # Guard added to match _g2p_cn in bert_align.py; the
+                     # original indexed the dict directly and could KeyError
+                     # on characters missing from cn_word2phone_dict.
+                     outlist.append('sil')
+             outlist.append('<sp>')
+     # Trim a duplicated trailing silence word.
+     if len(outlist) > 4:
+         if outlist[-2] == 'sil' and outlist[-4] == 'sil':
+             outlist = outlist[:-2]
+     return outlist
+
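A quick sketch of the two converters. English output is ARPAbet from g2p_en; Chinese output depends on cn_word2phone_dict, which this diff does not include, so that line is illustrative:

    from sigilyph.core.g2p_func import g2p_en, g2p_cn

    g2p_en("hello world")
    # -> ['HH', 'AH0', 'L', 'OW1', '<sp>', 'W', 'ER1', 'L', 'D', '<sp>']
    g2p_cn("你好")
    # -> pinyin-derived phones, e.g. ['n', 'i3', '<sp>', 'h', 'ao3', '<sp>'] (illustrative)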
sigilyph/core/norm_func.py ADDED
@@ -0,0 +1,98 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/norm_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 17:50:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-13 17:44:02
+ '''
+
+ import re
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch
+
+ # Bundled FST normalizers, used in place of WeTextProcessing's:
+ #from tn.chinese.normalizer import Normalizer as ZhNormalizer
+ #from tn.english.normalizer import Normalizer as EnNormalizer
+ from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+ import os
+ from importlib_resources import files
+ basedir = files('sigilyph')
+
+ zh_tn_model = ZhNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'), remove_erhua=False, full_to_half=False)
+ en_tn_model = EnNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'))
+
+ import json
+ with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+     special_dict = json.load(infi)
+
+ def pro_norm(text, use_lang='zh'):
+     if use_lang == 'zh':
+         norm_text = zh_tn_model.normalize(text)
+     else:
+         norm_text = en_tn_model.normalize(text)
+     return norm_text
+
+ def replace_with_dict(text, replace_dict):
+     for old, new in replace_dict.items():
+         text = text.replace(old, new)
+     return text
+
+ def replace_with_dict_re(text, replace_dict):
+     pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+     return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+ pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}
+ def preprocess_first_old(text, use_lang='zh'):
+     text = replace_with_dict(text, pre_replace_dict)
+     norm_text = pro_norm(text, use_lang)
+     rep_text = replace_with_dict(norm_text, special_dict)
+     return rep_text
+
+ def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     rep_text = replace_with_dict(norm_text, special_word_dict)
+     return rep_text
+
+ def preprocess_first_for_norm(text, before_replace_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     return norm_text
+
+ def normalizer(text):
+     return text
+
+ def replace_punc(text):
+     # Map punctuation variants via punc_map_ch, then drop everything that
+     # is neither a CJK character nor an allowed punctuation mark.
+     pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+     replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+     replaced_text = re.sub(
+         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+     )
+     return replaced_text
+
+ def text_norm_cn(text):
+     text = normalizer(text)
+     text = replace_punc(text)
+     return text
+
+ def text_norm_en(text):
+     return text
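preprocess_first chains three steps: literal pre-replacement, FST number/date normalization, and special-word substitution. A hedged sketch (the FST grammars live outside this diff, so the exact normalized output is not shown):

    from sigilyph.core.norm_func import preprocess_first, text_norm_cn

    raw = "iPhone于2025年5月13日发布"
    norm = preprocess_first(raw, {"AlphaFold-Plus": "AlphaFold Plus"}, {"iPhone": "[AY1 F OW0 N]"})
    # The FST normalizer spells out the date; the special dict swaps in phonemes.

    text_norm_cn("Hello,世界!")
    # -> approximately ',世界!': non-CJK characters outside `punctuation` are removed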
@@ -0,0 +1,64 @@
+
+ before_replace_dict = {
+     "AlphaFold-Plus": "AlphaFold Plus"
+ }
+
+ special_phrase = ['据了解']
+
+ special_word_dict = {
+     "iPhone": "[AY1 F OW0 N]",
+     "IOS": "[AY1 OW1 AE1 S]",
+     "A十七": "[EY1 sh ir2 q i1]",
+     "A seventeen": "[EY1 S EH1 V AH0 N T IY1 N]",
+     "CEO": "[S IY1 IY1 OW1]",
+     "AI": "[EY1 AY1]",
+     "ID": "[AY1 D IY1]",
+     "ABC": "[EY1 B IY1 S IY1]",
+     "VIP": "[V IY1 AY1 P IY1]",
+     "PDF": "[P IY1 D IY1 AE1 F]",
+     "NLP": "[EH1 NG EH2 L P IY1]",
+     "API": "[EY1 P IY1 AY1]",
+     "GPU": "[JH IY1 P IY1 Y UW1]",
+     "WeChat": "[W IY1 CH AE1 T]",
+     "PPT": "[P IY1 P IY1 T IY1]",
+     "CA": "[S IY1 EY1]",
+     ".com": "[d ian3 K AA1 M]",
+     ".zhang": "[ZH AA1 NG]",
+     "live": "[L AY0 V]",
+     "@": "[sil_1 AE1 T sil_1]",
+     "睡不着觉": "[sh ui4 b u4 zh e5 j iao4]",
+     "月经不调": "[y ve4 j ing1 b u4 t iao2]",
+     "长护险": "[ch ang2 h u4 x ian3]",
+     "长时间": "[ch ang2 sh ir2 j ian1]",
+     "长住外地": "[ch ang2 zh u4 w ai4 d i4]",
+     "长按": "[ch ang2 AA an4]",
+     "喉咙干疼": "[h ou2 l ong2 g an1 t eng2]",
+     "死对头": "[s ii3 d ui4 t ou5]",
+     "成名曲": "[ch eng2 m ing2 q v3]",
+     "古朴": "[g u3 p u3]",
+     "啊": "[AA a1]",
+     "sinα": "[S AY1 N AH2 AE1 L F a3]",
+     "cosα": "[K OW0 S AY1 N AH2 AE1 L F a3]",
+     "tanα": "[T AE1 N JH AH0 N T AH2 AE1 L F a3]",
+     "α": "[AE1 L F a3]",
+     "Ⅰ": "[y i1]",
+     "Ⅱ": "[EE er4]",
+     "qq": "[K Y UW1 K Y UW1]",
+     "≠": "[b u4 d eng3 y v2]",
+     "Beijing": "[b ei3 j ing1]",
+     "<sil>": "[sil_1]",
+     "--": "[sil]",
+     "=-=": "[sil]",
+     ":": "[sil_1]",
+     "-": "[sil]",
+     "(": "[sil]",
+     ")": "[sil]",
+     "“": "[sil_1]",
+     "”": "[sil_1]",
+     "《": "[sil_1]",
+     "》": "[sil_1]",
+     "(": "[sil]",
+     ")": "[sil]",
+     ":": "[sil_1]",
+ }
+
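These tables feed replace_with_dict in norm_func.py; the bracketed values are inline phoneme escapes that later stages pass through verbatim. A sketch using an excerpt, since the diff omits this file's module name and the tables' import path is therefore unknown:

    from sigilyph.core.norm_func import replace_with_dict

    special_word_dict = {"iPhone": "[AY1 F OW0 N]", "ID": "[AY1 D IY1]"}  # excerpt from above
    replace_with_dict("请问iPhone的ID在哪", special_word_dict)
    # -> '请问[AY1 F OW0 N]的[AY1 D IY1]在哪'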
sigilyph/core/preprocess.py ADDED
@@ -0,0 +1,16 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/preprocess.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-05-14 20:26:27
+ '''
+
+ def replace_proper(text, namedict):
+     for k, v in namedict.items():
+         text = text.replace(k, v)
+     return text
+
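replace_proper is a plain insertion-order substitution, the same pattern as replace_with_dict in norm_func.py. A one-line sketch with a hypothetical proper-noun table:

    from sigilyph.core.preprocess import replace_proper

    namedict = {"AF2": "AlphaFold Plus"}   # hypothetical mapping
    replace_proper("AF2模型上线了", namedict)
    # -> 'AlphaFold Plus模型上线了'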