sigilyph 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sigilyph might be problematic. Click here for more details.

sigilyph/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/__init__.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-05-13 11:01:07
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 16:39:15
9
+ '''
10
+
11
+ from sigilyph.core.sigilyph_class import Sigilyph
File without changes
@@ -0,0 +1,47 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/g2p_func.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-03-31 16:55:51
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 14:42:02
9
+ '''
10
+
11
from g2p_en import G2p

# Module-level singleton: constructing G2p loads model resources, so it is
# built once at import time and reused by g2p_en() below.
_g2p_en = G2p()
14
def g2p_en(text, sp_sign='<sp>'):
    """Convert English text to a phone list with the g2p_en model.

    Word-boundary spaces produced by the model are replaced with
    ``sp_sign``, and a trailing ``sp_sign`` is ensured for any output
    longer than one phone.
    """
    raw_phones = _g2p_en(text)
    phones = []
    for token in raw_phones:
        phones.append(sp_sign if token == " " else token)
    # Guarantee a closing pause marker on non-trivial outputs.
    if len(phones) > 1 and phones[-1] != sp_sign:
        phones.append(sp_sign)
    return phones
20
+
21
+
22
from pypinyin import lazy_pinyin, Style

from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict

# Make every punctuation mark map to itself in the pinyin->phone table so
# g2p_cn() can pass punctuation through unchanged.
for punc in punctuation:
    cn_word2phone_dict[punc] = punc
27
+
28
def g2p_cn(text):
    """Convert Chinese text to a phone list via pypinyin + the project's
    pinyin->phone table.

    Each syllable's phones are followed by a '<sp>' short-pause marker.
    NOTE(review): assumes cn_word2phone_dict values are iterables of phone
    strings — confirm against sigilyph.core.symbols.
    """
    # TONE3 puts the tone digit after the syllable; neutral tone becomes '5'.
    pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
    outlist = []
    for pp in pinyinlist:
        if pp in cn_word2phone_dict.keys():
            outlist.extend(cn_word2phone_dict[pp])
            outlist.append('<sp>')
        else:
            # Unknown syllable: fall back to a per-character lookup.
            # Raises KeyError for characters missing from the table.
            for ch in pp:
                outlist.extend(cn_word2phone_dict[ch])
                outlist.append('<sp>')
    # Drop a duplicated trailing "sil <sp>" pair, if present.
    if len(outlist) > 4:
        if outlist[-2] == 'sil' and outlist[-4] == 'sil':
            outlist = outlist[:-2]
    return outlist
43
+
44
+
45
+
46
+
47
+
@@ -0,0 +1,85 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/norm_func.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-03-31 17:50:26
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 15:42:55
9
+ '''
10
+
11
+
12
import re

from sigilyph.core.symbols import punctuation, punc_map_ch

from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer

#zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
#en_tn_model = EnNormalizer()
# WeTextProcessing normalizers, built once at import time.
# NOTE(review): cache_dir is cwd-relative — this only works when the process
# is started from the project root; confirm or make it package-relative.
zh_tn_model = ZhNormalizer(cache_dir='./sigilyph/core/cache_dir', remove_erhua=False, full_to_half=False)
en_tn_model = EnNormalizer(cache_dir='./sigilyph/core/cache_dir')

import json
#import sys
#sys.path.append('text_front')
#with open('./special_dict.json', 'r', encoding="utf-8") as infi:
#with open('./text_front/special_dict.json', 'r', encoding="utf-8") as infi:
# Pronunciation-override table loaded at import time.
# NOTE(review): same cwd-relative path caveat as cache_dir above.
with open('./sigilyph/core/special_dict.json', 'r', encoding="utf-8") as infi:
    special_dict = json.load(infi)
31
+
32
def pro_norm(text, use_lang='zh'):
    """Run text normalization: the Chinese TN model for 'zh', the English
    model for anything else."""
    model = zh_tn_model if use_lang == 'zh' else en_tn_model
    return model.normalize(text)
40
+
41
+
42
def replace_with_dict(text, replace_dict):
    """Apply every old->new replacement sequentially, in dict insertion
    order (plain str.replace, so later rules see earlier rules' output)."""
    result = text
    for source, target in replace_dict.items():
        result = result.replace(source, target)
    return result
46
def replace_with_dict_re(text, replace_dict):
    """Single-pass replacement using one alternation regex.

    Unlike replace_with_dict(), each input position is replaced at most
    once, so replacement values are never themselves re-replaced.

    Args:
        text: input string.
        replace_dict: mapping of literal substrings to replacements.

    Returns:
        `text` with every occurrence of a key substituted.
    """
    # Bug fix: an empty dict would compile the empty pattern, which matches
    # at every position and then raises KeyError inside the callback.
    if not replace_dict:
        return text
    pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
    return pattern.sub(lambda m: replace_dict[m.group(0)], text)
49
+
50
# Hard-coded pre-normalization replacements used by the legacy entry point.
pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}

def preprocess_first_old(text, use_lang='zh'):
    """Legacy pipeline: pre-replace -> TN normalize -> special-word replace."""
    staged = replace_with_dict(text, pre_replace_dict)
    norm_text = pro_norm(staged, use_lang)
    print(norm_text)  # debug trace kept for behavioural parity
    return replace_with_dict(norm_text, special_dict)
57
+
58
def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
    """Pre-replacement -> text normalization -> special-word replacement.

    Args:
        text: raw input text.
        before_replace_dict: literal replacements applied before TN.
        special_word_dict: pronunciation/word overrides applied after TN.
        norm_use_lang: language passed to the text normalizer.
    """
    staged = replace_with_dict(text, before_replace_dict)
    normalized = pro_norm(staged, norm_use_lang)
    print(normalized)  # debug trace kept for behavioural parity
    return replace_with_dict(normalized, special_word_dict)
64
+
65
+
66
def normalizer(text):
    """Placeholder normalization hook; currently the identity function."""
    return text
68
+
69
def replace_punc(text):
    """Map punctuation through punc_map_ch, then drop every character that
    is neither a CJK ideograph nor whitelisted punctuation."""
    #text = text.replace("嗯", "恩").replace("呣", "母")
    mapping_pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
    mapped = mapping_pattern.sub(lambda x: punc_map_ch[x.group()], text)
    keep_chars = "".join(punctuation)
    return re.sub(r"[^\u4e00-\u9fa5" + keep_chars + r"]+", "", mapped)
77
+
78
def text_norm_cn(text):
    """Chinese text normalization: normalizer() then punctuation cleanup."""
    return replace_punc(normalizer(text))
82
+
83
def text_norm_en(text):
    """English text normalization; currently a no-op pass-through."""
    return text
@@ -0,0 +1,66 @@
1
+
2
+
3
+
4
# Literal substring replacements applied to raw text BEFORE normalization
# (consumed by preprocess_first in norm_func.py).
before_replace_dict = {
    "AlphaFold-Plus": "AlphaFold Plus"
}

# Phrases that get an extra short-pause symbol appended when they match a
# whole language segment (see Sigilyph.text_split_lang).
special_phrase = ['据了解']

# Pronunciation overrides applied after normalization. Values are phone
# sequences in bracket notation "[P1 P2 ...]" mixing ARPAbet-style English
# phones (e.g. AY1), pinyin initial/final+tone phones (e.g. sh ir2), and
# silence markers (sil / sil_1).
# NOTE(review): some punctuation keys appear in both ASCII and full-width
# variants ('(' vs '（', ':' vs '：') — confirm both forms are intended.
special_word_dict = {
    "iPhone": "[AY1 F OW0 N]",
    "IOS": "[AY1 OW1 AE1 S]",
    "A十七": "[EY1 sh ir2 q i1]",
    "A seventeen": "[EY1 S EH1 V AH0 N T IY1 N]",
    "CEO": "[S IY1 IY1 OW1]",
    "AI": "[EY1 AY1]",
    "ID": "[AY1 D IY1]",
    "ABC": "[EY1 B IY1 S IY1]",
    "VIP": "[V IY1 AY1 P IY1]",
    "PDF": "[P IY1 D IY1 AE1 F]",
    "NLP": "[EH1 NG EH2 L P IY1]",
    "API": "[EY1 P IY1 AY1]",
    "GPU": "[JH IY1 P IY1 Y UW1]",
    "WeChat": "[W IY1 CH AE1 T]",
    "PPT": "[P IY1 P IY1 T IY1]",
    "CA": "[S IY1 EY1]",
    ".com": "[d ian3 K AA1 M]",
    ".zhang": "[ZH AA1 NG]",
    "live": "[L AY0 V]",
    "@": "[sil_1 AE1 T sil_1]",
    "睡不着觉": "[sh ui4 b u4 zh e5 j iao4]",
    "月经不调": "[y ve4 j ing1 b u4 t iao2]",
    "长护险": "[ch ang2 h u4 x ian3]",
    "长时间": "[ch ang2 sh ir2 j ian1]",
    "长住外地": "[ch ang2 zh u4 w ai4 d i4]",
    "长按": "[ch ang2 AA an4]",
    "喉咙干疼": "[h ou2 l ong2 g an1 t eng2]",
    "死对头": "[s ii3 d ui4 t ou5]",
    "成名曲": "[ch eng2 m ing2 q v3]",
    "古朴": "[g u3 p u3]",
    "啊": "[AA a1]",
    "sinα": "[S AY1 N AH2 AE1 L F a3]",
    "cosα": "[K OW0 S AY1 N AH2 AE1 L F a3]",
    "tanα": "[T AE1 N JH AH0 N T AH2 AE1 L F a3]",
    "α": "[AE1 L F a3]",
    "Ⅰ": "[y i1]",
    "Ⅱ": "[EE er4]",
    "qq": "[K Y UW1 K Y UW1]",
    "≠": "[b u4 d eng3 y v2]",
    "Beijing": "[b ei3 j ing1]",
    "<sil>": "[sil_1]",
    "--": "[sil]",
    "=-=": "[sil]",
    ":": "[sil_1]",
    "-": "[sil]",
    "(": "[sil]",
    ")": "[sil]",
    "“": "[sil_1]",
    "”": "[sil_1]",
    "《": "[sil_1]",
    "》": "[sil_1]",
    "（": "[sil]",
    "）": "[sil]",
    "：": "[sil_1]",
    }
66
+
@@ -0,0 +1,16 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/preprocess.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-05-13 11:01:26
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-05-14 20:26:27
9
+ '''
10
+
11
def replace_proper(text, namedict):
    """Replace each proper-noun key in `namedict` with its value, applied
    sequentially in dict insertion order."""
    result = text
    for name, replacement in namedict.items():
        result = result.replace(name, replacement)
    return result
15
+
16
+
@@ -0,0 +1,215 @@
1
+ '''
2
+ FilePath: /python-Sigilyph/sigilyph/core/sigilyph_class.py
3
+ Descripttion:
4
+ Author: Yixiang Chen
5
+ version:
6
+ Date: 2025-08-12 14:42:50
7
+ LastEditors: Yixiang Chen
8
+ LastEditTime: 2025-08-12 15:41:33
9
+ '''
10
+
11
+ import langid
12
+ import re
13
+ import json
14
+
15
+ import jieba
16
+ import jieba.posseg
17
+
18
+ from sigilyph.core.g2p_func import g2p_en, g2p_cn
19
+ from sigilyph.core.norm_func import preprocess_first, text_norm_en, text_norm_cn
20
+ from sigilyph.core.symbols import punctuation
21
+ from sigilyph.core.predict import before_replace_dict, special_word_dict, special_phrase
22
+
23
# Per-language dispatch table for text normalization ('en'/'zh').
norm_func_dict = {
    'en': text_norm_en,
    'zh': text_norm_cn
}

# Per-language dispatch table for grapheme-to-phoneme conversion.
g2p_func_dict = {
    'en': g2p_en,
    'zh': g2p_cn
}
32
+
33
class Sigilyph:
    """TTS text front-end: normalization, language-aware splitting, G2P,
    and silence-token post-processing.

    text_process() emits intermediate markers — 'sil_lang' at language
    segment boundaries, 'sil_punc' for punctuation, 'sil_end' at utterance
    end, '<sp>' for short pauses — which replace_sil2label() folds into the
    final 'sil_1'/'sil_2' labels.
    """

    def __init__(self, before_dict_path=None, special_dict_path=None):
        """Optionally extend the built-in replacement tables from JSON files.

        Args:
            before_dict_path: JSON file with extra pre-normalization replacements.
            special_dict_path: JSON file with extra pronunciation overrides.
        """
        self.sil1symbol = '-'
        self.punctuation = punctuation

        # Bug fix: copy the module-level dicts so the updates below cannot
        # mutate shared module state for every other Sigilyph instance.
        self.before_replace_dict = dict(before_replace_dict)
        if before_dict_path:
            with open(before_dict_path, 'r', encoding="utf-8") as obdp:
                self.before_replace_dict.update(json.load(obdp))

        self.special_word_dict = dict(special_word_dict)
        if special_dict_path:
            with open(special_dict_path, 'r', encoding="utf-8") as obdp:
                self.special_word_dict.update(json.load(obdp))

        self.special_phrase = special_phrase

    def forward(self, text, lang):
        """Run the full pipeline: text -> final phone/silence label list."""
        phones = self.text_process(text, lang)
        phones = self.replace_sil2label(phones)
        return phones

    def text_process(self, text, lang, spflag=True, use_lang='zh'):
        """Normalize `text`, split it by language, and run G2P per segment.

        Args:
            text: raw input; may embed literal phone spans as "[P1 P2 ...]".
            lang: 'zh'/'ZH', 'en', or anything else for mixed auto-detection.
            spflag: when False, strip all '<sp>' tokens from the result.
            use_lang: kept for backward compatibility; overwritten per segment.

        Returns:
            List of phone / silence-marker strings.
        """
        # Bug fix: use the instance dict (which includes any user-supplied
        # JSON entries) instead of the module-level special_word_dict.
        text = preprocess_first(text, self.before_replace_dict,
                                self.special_word_dict, norm_use_lang='zh')

        multi_lang_text_list = self.text_split_lang(text, lang)

        all_phone = []
        for text_split_dict in multi_lang_text_list:
            use_lang = text_split_dict['lang']
            use_text = text_split_dict['text_split']
            if use_lang == 'phone':
                # Inline "[...]" span: phones are passed through verbatim.
                all_phone.extend(use_text.split())
            else:
                if use_lang not in norm_func_dict:
                    use_lang = 'zh'  # unknown languages default to Chinese
                use_text = self.text_norm(use_text, use_lang)
                phone_list = self.g2p(use_text, use_lang)
                all_phone.append('sil_lang')
                all_phone.append('<sp>')
                all_phone.extend(phone_list)
        all_phone = self.postprocess_tts(all_phone)
        if not spflag:
            all_phone = [p for p in all_phone if p != '<sp>']
        return all_phone

    ############### split text in line with lang ##############
    def text_split_lang(self, text, lang):
        """Split `text` into [{'lang': ..., 'text_split': ...}] segments.

        For lang 'zh'/'ZH' or 'en' the whole text is one segment; otherwise
        inline "[...]" phone spans are cut out and each remaining chunk is
        language-identified with langid.
        """
        if lang in ('ZH', 'zh'):
            return [{'lang': 'zh', 'text_split': text}]
        if lang == 'en':
            return [{'lang': 'en', 'text_split': text}]

        # Bug fix: re.split's third positional argument is maxsplit, not
        # flags; the original passed re.I|re.M (== 10) there by mistake.
        pretext_split = re.split(r"(\[.*?\])", text, flags=re.I | re.M)
        multi_lang_text_list = []
        for utext in filter(None, pretext_split):
            if utext[0] == '[':
                # Literal phone span: strip the brackets and pass through.
                multi_lang_text_list.append({'lang': 'phone', 'text_split': utext[1:-1]})
                continue
            pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
            for tmpts in re.findall(pattern, utext):
                tmp_lang = langid.classify(tmpts)[0]
                if len(tmpts) > 20:
                    # Long chunk with no internal punctuation: insert a
                    # pause near its middle — try a preposition, then a verb.
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'p')
                    if not self.has_punc(tmpts[:-1]):
                        tmpts = self.add_pause(tmpts, 'v')
                if tmpts in self.special_phrase:
                    tmpts = tmpts + self.sil1symbol
                if tmp_lang in ['zh', 'jp', 'ja']:
                    tmp_lang = 'zh'
                    # In Chinese chunks, spaces become explicit short pauses.
                    tmpts = tmpts.replace(' ', self.sil1symbol)
                else:
                    tmp_lang = 'en'
                if not tmpts.isspace():
                    multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
        return multi_lang_text_list

    ########## add parse ###############
    def has_punc(self, text):
        """Return True if `text` contains an ASCII/CJK pause punctuation
        mark or the internal short-silence symbol."""
        for char in text:
            if char in [',', '.', '!', '?', ',', '。', '?', '!', self.sil1symbol]:
                return True
        return False

    def add_pause(self, text, tf='v'):
        """Insert the short-silence symbol before the jieba POS word with
        flag `tf` ('v' verb, 'p' preposition) closest to the middle of
        `text`; no-op when no such word is found."""
        wlist = []
        flist = []
        for seg in jieba.posseg.cut(text.strip()):
            wlist.append(seg.word)
            flist.append(seg.flag)
        idx = self.search_ele_mid(flist, tf)
        # search_ele_mid returns the last probed index (len-1) when nothing
        # matches; that value is treated here as "not found".
        if idx != len(flist) - 1:
            wlist.insert(idx, self.sil1symbol)
        return ''.join(wlist)

    def search_ele_mid(self, flaglist, tf='v'):
        """Return the index of the element equal to `tf` nearest the middle
        of `flaglist`, scanning outwards from the midpoint (the earlier
        index is checked first at each distance). When no element matches,
        returns the last probed index (len-1), or -1 for an empty list."""
        nowidx = -1
        halflen = len(flaglist) // 2
        for gap in range(len(flaglist) - halflen):
            nowidx = halflen - gap
            if flaglist[nowidx] == tf:
                return nowidx
            nowidx = halflen + gap
            if flaglist[nowidx] == tf:
                return nowidx
        return nowidx

    ######## text norm #########
    def text_norm(self, text, lang):
        """Dispatch to the per-language text normalizer."""
        return norm_func_dict[lang](text)

    ############ g2p ################
    def g2p(self, text, lang):
        """Dispatch to the per-language grapheme-to-phoneme converter."""
        return g2p_func_dict[lang](text)

    ############# post process #############
    def postprocess_tts(self, phonelist):
        """Replace punctuation phones with silence markers and ensure the
        sequence ends with a silence marker followed by '<sp>'."""
        outlist = []
        for pm in phonelist:
            if pm not in self.punctuation:
                outlist.append(pm)
            elif pm == self.sil1symbol:
                # NOTE(review): reached only if sil1symbol ('-') is listed
                # in self.punctuation — confirm against symbols.punctuation.
                outlist.append('sil_1')
            else:
                outlist.append('sil_punc')
        # Bug fix: guard short/empty sequences before indexing [-2]/[-1],
        # which raised IndexError in the original.
        if (len(phonelist) >= 2 and outlist
                and phonelist[-2] not in self.punctuation
                and outlist[-1].split('_')[0] != 'sil'):
            outlist.append('sil_end')
            outlist.append('<sp>')
        return outlist

    ########## replace silence token ###############
    def replace_sil2label(self, phones):
        """Map intermediate silence markers to final labels and collapse
        consecutive silences into a single 'sil_2'.

        Bug fix: the original definition omitted `self`, so the instance
        call in forward() raised TypeError.
        """
        relabel = {'sil_lang': '', 'sil_punc': 'sil_2', 'sil_end': 'sil_2', 'sil': 'sil_1'}
        phones = [relabel.get(xx, xx) for xx in phones]
        phones = [xx for xx in phones if xx]  # drop the emptied sil_lang slots
        outphones = ['sil_1']  # utterances always start with a short silence
        for ele in phones:
            if ele.split('_')[0] == 'sil' and outphones[-1].split('_')[0] == 'sil':
                outphones[-1] = 'sil_2'  # merge adjacent silences
            else:
                outphones.append(ele)
        return outphones
212
+
213
+
214
+
215
+