sigilyph 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sigilyph/__init__.py ADDED
@@ -0,0 +1,18 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/__init__.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:07
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-16 18:48:51
+ '''
+
+ from sigilyph.core.sigilyph_class import Sigilyph
+
+ from sigilyph.core.symbols import all_phone_dict
+
+ from sigilyph.core.bert_align import AlignBert
+
+ from sigilyph.text_norm.sigilyph_norm import SigilyphNormalizer
+
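The four names above are the package's public entry points. A minimal import sketch (hedged: only AlignBert's constructor appears later in this diff, so the arguments below are illustrative, and Sigilyph/SigilyphNormalizer construction is not shown):

    from sigilyph import Sigilyph, AlignBert, SigilyphNormalizer, all_phone_dict

    # AlignBert's signature is defined in core/bert_align.py below;
    # both paths here are hypothetical.
    aligner = AlignBert(tkn_cache_dir="./tmp/", vocab_file="./tmp/vocab.txt")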
sigilyph/core/bert_align.py ADDED
@@ -0,0 +1,163 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/bert_align.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-09-24 15:13:38
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-16 18:53:54
+ '''
+
+ import torch
+ from transformers import BertTokenizer, BertModel
+
+ import langid
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.text_norm.norm_func import text_norm_cn
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ # Let punctuation marks pass through g2p unchanged.
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def _g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict:
+             outlist.extend(cn_word2phone_dict[pp])
+         else:
+             for ch in pp:
+                 if ch in cn_word2phone_dict:
+                     outlist.extend(cn_word2phone_dict[ch])
+                 else:
+                     # append, not extend: extend('sil') would add 's', 'i', 'l'
+                     outlist.append('sil')
+     return outlist
+
+ def g2p_word(word):
+     tmp_lang = langid.classify(word)[0]
+     if tmp_lang in ['zh', 'jp', 'ja']:
+         tmp_lang = 'zh'
+     else:
+         tmp_lang = 'en'
+     if tmp_lang == 'zh':
+         return _g2p_cn(word)
+     else:
+         return _g2p_en(word)
+
+ class AlignBert():
+     def __init__(self, tkn_cache_dir, vocab_file):
+         self.vocab_dict = self.load_vocab(vocab_file)
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir=tkn_cache_dir)
+         self.g2p_word = g2p_word
+         self.symbol_list = ['[CLS]', '[SEP]', '?', '!', '.', ',', ',', '。']
+         self.empty_bert = torch.zeros([768])
+
+     def gen_seqbert(self, mfa_phones, text, bert):
+         norm_text = text_norm_cn(text)
+         midph_list, midph2bertid_dict = self.get_midph(norm_text)
+         phoneme2midph_dict = self.get_phoneme2midph_dict(mfa_phones, midph_list)
+         phoneme2bertid_dict = self.get_phoneme2bertid_dict(midph2bertid_dict, phoneme2midph_dict)
+         seqbert = []
+         for idx in range(len(phoneme2bertid_dict)):
+             bertid = phoneme2bertid_dict[idx]
+             if bertid >= 0:
+                 seqbert.append(bert[bertid])
+             else:
+                 seqbert.append(self.empty_bert)
+         seqbert = torch.stack(seqbert)
+         return seqbert
+
+     def load_vocab(self, vocab_file):
+         vocab_dict = {}
+         with open(vocab_file, 'r') as ovf:
+             lines = ovf.readlines()
+         for idx in range(len(lines)):
+             vocab_dict[idx] = lines[idx].strip()
+         return vocab_dict
+
+     def id2text(self, idlist):
+         outlist = []
+         for idx in range(len(idlist)):
+             outlist.append(self.vocab_dict[int(idlist[idx])])
+         return outlist
+
+     def get_midph(self, text):
+         encoded_input = self.tokenizer(text, return_tensors='pt')
+         ret = self.id2text(encoded_input['input_ids'][0])
+         wordph_list = []
+         for word in ret[1:-1]:
+             word_phoneme = self.g2p_word(word)
+             wordph_list.append(word_phoneme)
+         # midph_list interleaves [CLS], each token's phonemes, and [SEP];
+         # midph2bertid_dict maps every position back to its BERT token index.
+         midph_list = []
+         midph2bertid_dict = {}
+         midph_list.append(ret[0])
+         midph2bertid_dict[0] = 0
+         for widx in range(len(wordph_list)):
+             for phidx in range(len(wordph_list[widx])):
+                 phoneme = wordph_list[widx][phidx]
+                 midph_list.append(phoneme)
+                 midph2bertid_dict[len(midph_list)-1] = widx + 1
+         midph_list.append(ret[-1])
+         midph2bertid_dict[len(midph_list)-1] = len(ret) - 1
+         return midph_list, midph2bertid_dict
+
+     def get_phoneme2midph_dict(self, mfa_phones, midph_list):
+         fixed_midph_list = []
+         for idx in range(len(midph_list)):
+             if midph_list[idx] in self.symbol_list:
+                 fixed_midph_list.append('sil')
+             else:
+                 fixed_midph_list.append(midph_list[idx])
+         phoneme2midph_dict = self.lcs(mfa_phones, fixed_midph_list)
+         return phoneme2midph_dict
+
+     def lcs(self, mfa_phones, midph_list):
+         # Longest common subsequence between the MFA phones and the
+         # tokenizer-derived phones; unmatched MFA phones stay at -1.
+         n, m = len(midph_list), len(mfa_phones)
+         dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+         phoneme2midph_dict = {}
+         for idx in range(1, m + 1):
+             phoneme2midph_dict[idx-1] = -1
+
+         for idx in range(1, m + 1):
+             for midph_id in range(1, n + 1):
+                 curr_ph = mfa_phones[idx-1]
+                 if curr_ph == midph_list[midph_id-1]:
+                     dp[idx][midph_id] = dp[idx-1][midph_id-1] + 1
+                 else:
+                     dp[idx][midph_id] = max(dp[idx-1][midph_id], dp[idx][midph_id-1])
+
+         # Backtrack through the dp table to recover the matched positions.
+         n, m = len(midph_list), len(mfa_phones)
+         while m > 0 and n > 0:
+             if mfa_phones[m-1] == midph_list[n-1] and dp[m][n] == dp[m-1][n-1] + 1:
+                 phoneme2midph_dict[m-1] = n-1
+                 m, n = m-1, n-1
+                 continue
+             if dp[m][n] == dp[m-1][n]:
+                 m, n = m-1, n
+                 continue
+             if dp[m][n] == dp[m][n-1]:
+                 m, n = m, n-1
+                 continue
+         return phoneme2midph_dict
+
+     def get_phoneme2bertid_dict(self, midph2bertid_dict, phoneme2midph_dict):
+         phoneme2bertid_dict = {}
+         for idx in range(len(phoneme2midph_dict)):
+             midph_id = phoneme2midph_dict[idx]
+             if midph_id == -1:
+                 phoneme2bertid_dict[idx] = -1
+             else:
+                 phoneme2bertid_dict[idx] = midph2bertid_dict[midph_id]
+         return phoneme2bertid_dict
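gen_seqbert aligns an MFA phone sequence to BERT token embeddings and returns one 768-dim vector per phoneme, zeros where no token matches. A hedged usage sketch (paths and the phone list are illustrative; the checkpoint matches the tokenizer hard-coded above):

    import torch
    from transformers import BertModel
    from sigilyph import AlignBert
    from sigilyph.text_norm.norm_func import text_norm_cn

    aligner = AlignBert(tkn_cache_dir="./tmp/", vocab_file="./tmp/vocab.txt")  # hypothetical paths
    model = BertModel.from_pretrained('bert-base-multilingual-cased', cache_dir="./tmp/")

    text = "今天天气很好。"
    # Tokenize the normalized text so the embeddings line up with what
    # gen_seqbert tokenizes internally.
    encoded = aligner.tokenizer(text_norm_cn(text), return_tensors='pt')
    with torch.no_grad():
        bert = model(**encoded).last_hidden_state[0]       # [num_tokens, 768]

    mfa_phones = ['j', 'in1', 't', 'ian1', 'sil']          # illustrative MFA phone sequence
    seqbert = aligner.gen_seqbert(mfa_phones, text, bert)  # [len(mfa_phones), 768]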
sigilyph/core/g2p_func.py ADDED
@@ -0,0 +1,47 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/g2p_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 16:55:51
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-08-12 14:42:02
+ '''
+
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ def g2p_en(text, sp_sign='<sp>'):
+     phone_list = _g2p_en(text)
+     phone_list = [sp_sign if xx == " " else xx for xx in phone_list]
+     if len(phone_list) > 1 and phone_list[-1] != sp_sign:
+         phone_list.append(sp_sign)
+     return phone_list
+
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict:
+             outlist.extend(cn_word2phone_dict[pp])
+             outlist.append('<sp>')
+         else:
+             for ch in pp:
+                 if ch in cn_word2phone_dict:
+                     outlist.extend(cn_word2phone_dict[ch])
+                 else:
+                     # Guard added to match _g2p_cn in bert_align.py; the
+                     # original indexed the dict directly and could KeyError
+                     # on characters missing from cn_word2phone_dict.
+                     outlist.append('sil')
+             outlist.append('<sp>')
+     # Trim a duplicated trailing silence word.
+     if len(outlist) > 4:
+         if outlist[-2] == 'sil' and outlist[-4] == 'sil':
+             outlist = outlist[:-2]
+     return outlist
+
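A quick sketch of the two converters. English output is ARPAbet from g2p_en; Chinese output depends on cn_word2phone_dict, which this diff does not include, so that line is illustrative:

    from sigilyph.core.g2p_func import g2p_en, g2p_cn

    g2p_en("hello world")
    # -> ['HH', 'AH0', 'L', 'OW1', '<sp>', 'W', 'ER1', 'L', 'D', '<sp>']
    g2p_cn("你好")
    # -> pinyin-derived phones, e.g. ['n', 'i3', '<sp>', 'h', 'ao3', '<sp>'] (illustrative)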
sigilyph/core/norm_func.py ADDED
@@ -0,0 +1,98 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/norm_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 17:50:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-13 17:44:02
+ '''
+
+ import re
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch
+
+ # Bundled FST normalizers, used in place of WeTextProcessing's:
+ #from tn.chinese.normalizer import Normalizer as ZhNormalizer
+ #from tn.english.normalizer import Normalizer as EnNormalizer
+ from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+ import os
+ from importlib_resources import files
+ basedir = files('sigilyph')
+
+ zh_tn_model = ZhNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'), remove_erhua=False, full_to_half=False)
+ en_tn_model = EnNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'))
+
+ import json
+ with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+     special_dict = json.load(infi)
+
+ def pro_norm(text, use_lang='zh'):
+     if use_lang == 'zh':
+         norm_text = zh_tn_model.normalize(text)
+     else:
+         norm_text = en_tn_model.normalize(text)
+     return norm_text
+
+ def replace_with_dict(text, replace_dict):
+     for old, new in replace_dict.items():
+         text = text.replace(old, new)
+     return text
+
+ def replace_with_dict_re(text, replace_dict):
+     pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+     return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+ pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}
+ def preprocess_first_old(text, use_lang='zh'):
+     text = replace_with_dict(text, pre_replace_dict)
+     norm_text = pro_norm(text, use_lang)
+     rep_text = replace_with_dict(norm_text, special_dict)
+     return rep_text
+
+ def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     rep_text = replace_with_dict(norm_text, special_word_dict)
+     return rep_text
+
+ def preprocess_first_for_norm(text, before_replace_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     return norm_text
+
+ def normalizer(text):
+     return text
+
+ def replace_punc(text):
+     # Map punctuation variants via punc_map_ch, then drop everything that
+     # is neither a CJK character nor an allowed punctuation mark.
+     pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+     replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+     replaced_text = re.sub(
+         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+     )
+     return replaced_text
+
+ def text_norm_cn(text):
+     text = normalizer(text)
+     text = replace_punc(text)
+     return text
+
+ def text_norm_en(text):
+     return text
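preprocess_first chains three steps: literal pre-replacement, FST number/date normalization, and special-word substitution. A hedged sketch (the FST grammars live outside this diff, so the exact normalized output is not shown):

    from sigilyph.core.norm_func import preprocess_first, text_norm_cn

    raw = "iPhone于2025年5月13日发布"
    norm = preprocess_first(raw, {"AlphaFold-Plus": "AlphaFold Plus"}, {"iPhone": "[AY1 F OW0 N]"})
    # The FST normalizer spells out the date; the special dict swaps in phonemes.

    text_norm_cn("Hello,世界!")
    # -> approximately ',世界!': non-CJK characters outside `punctuation` are removed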
@@ -0,0 +1,64 @@
+
+ before_replace_dict = {
+     "AlphaFold-Plus": "AlphaFold Plus"
+ }
+
+ special_phrase = ['据了解']
+
+ special_word_dict = {
+     "iPhone": "[AY1 F OW0 N]",
+     "IOS": "[AY1 OW1 AE1 S]",
+     "A十七": "[EY1 sh ir2 q i1]",
+     "A seventeen": "[EY1 S EH1 V AH0 N T IY1 N]",
+     "CEO": "[S IY1 IY1 OW1]",
+     "AI": "[EY1 AY1]",
+     "ID": "[AY1 D IY1]",
+     "ABC": "[EY1 B IY1 S IY1]",
+     "VIP": "[V IY1 AY1 P IY1]",
+     "PDF": "[P IY1 D IY1 AE1 F]",
+     "NLP": "[EH1 NG EH2 L P IY1]",
+     "API": "[EY1 P IY1 AY1]",
+     "GPU": "[JH IY1 P IY1 Y UW1]",
+     "WeChat": "[W IY1 CH AE1 T]",
+     "PPT": "[P IY1 P IY1 T IY1]",
+     "CA": "[S IY1 EY1]",
+     ".com": "[d ian3 K AA1 M]",
+     ".zhang": "[ZH AA1 NG]",
+     "live": "[L AY0 V]",
+     "@": "[sil_1 AE1 T sil_1]",
+     "睡不着觉": "[sh ui4 b u4 zh e5 j iao4]",
+     "月经不调": "[y ve4 j ing1 b u4 t iao2]",
+     "长护险": "[ch ang2 h u4 x ian3]",
+     "长时间": "[ch ang2 sh ir2 j ian1]",
+     "长住外地": "[ch ang2 zh u4 w ai4 d i4]",
+     "长按": "[ch ang2 AA an4]",
+     "喉咙干疼": "[h ou2 l ong2 g an1 t eng2]",
+     "死对头": "[s ii3 d ui4 t ou5]",
+     "成名曲": "[ch eng2 m ing2 q v3]",
+     "古朴": "[g u3 p u3]",
+     "啊": "[AA a1]",
+     "sinα": "[S AY1 N AH2 AE1 L F a3]",
+     "cosα": "[K OW0 S AY1 N AH2 AE1 L F a3]",
+     "tanα": "[T AE1 N JH AH0 N T AH2 AE1 L F a3]",
+     "α": "[AE1 L F a3]",
+     "Ⅰ": "[y i1]",
+     "Ⅱ": "[EE er4]",
+     "qq": "[K Y UW1 K Y UW1]",
+     "≠": "[b u4 d eng3 y v2]",
+     "Beijing": "[b ei3 j ing1]",
+     "<sil>": "[sil_1]",
+     "--": "[sil]",
+     "=-=": "[sil]",
+     ":": "[sil_1]",
+     "-": "[sil]",
+     "(": "[sil]",
+     ")": "[sil]",
+     "“": "[sil_1]",
+     "”": "[sil_1]",
+     "《": "[sil_1]",
+     "》": "[sil_1]",
+     "(": "[sil]",
+     ")": "[sil]",
+     ":": "[sil_1]",
+ }
+
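These tables feed replace_with_dict in norm_func.py; the bracketed values are inline phoneme escapes that later stages pass through verbatim. A sketch using an excerpt, since the diff omits this file's module name and the tables' import path is therefore unknown:

    from sigilyph.core.norm_func import replace_with_dict

    special_word_dict = {"iPhone": "[AY1 F OW0 N]", "ID": "[AY1 D IY1]"}  # excerpt from above
    replace_with_dict("请问iPhone的ID在哪", special_word_dict)
    # -> '请问[AY1 F OW0 N]的[AY1 D IY1]在哪'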
sigilyph/core/preprocess.py ADDED
@@ -0,0 +1,16 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/preprocess.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-05-14 20:26:27
+ '''
+
+ def replace_proper(text, namedict):
+     for k, v in namedict.items():
+         text = text.replace(k, v)
+     return text
+
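replace_proper is a plain insertion-order substitution, the same pattern as replace_with_dict in norm_func.py. A one-line sketch with a hypothetical proper-noun table:

    from sigilyph.core.preprocess import replace_proper

    namedict = {"AF2": "AlphaFold Plus"}   # hypothetical mapping
    replace_proper("AF2模型上线了", namedict)
    # -> 'AlphaFold Plus模型上线了'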