sigilyph 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
+ include sigilyph/core/cache_dir/*
+ include sigilyph/core/special_dict.json
+ include sigilyph/core/py2phone.dict
@@ -0,0 +1,23 @@
+ Metadata-Version: 2.1
+ Name: sigilyph
+ Version: 0.3.1
+ Summary: Text Frontend for TTS
+ Home-page: https://github.com/yixiangchen1995/python-Sigilyph
+ Author: Yixiang Chen
+ Author-email: <yixiangchen1995@gmail.com>
+ License: MIT
+ Keywords: python,first package
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: g2p_en
+ Requires-Dist: jieba
+ Requires-Dist: jieba_fast
+ Requires-Dist: pypinyin
+ Requires-Dist: WeTextProcessing==1.0.3
+ Requires-Dist: langid
+
+ # python-Sigilyph
+ A TTS text frontend for personal use
@@ -0,0 +1,2 @@
+ # python-Sigilyph
+ A TTS text frontend for personal use
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,49 @@
+ '''
+ FilePath: /python-Sigilyph/setup.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-24 15:57:41
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-09-26 15:51:52
+ '''
+
+ from setuptools import setup, find_packages
+
+ VERSION = '0.3.1'
+ DESCRIPTION = 'Text Frontend for TTS'
+ #LONG_DESCRIPTION = 'Data Package for TTS with a slightly longer description'
+ LONG_DESCRIPTION = open("README.md", encoding="utf-8").read()
+
+ # Configuration
+ setup(
+     name="sigilyph",
+     version=VERSION,
+     author="Yixiang Chen",
+     author_email="<yixiangchen1995@gmail.com>",
+     license='MIT',
+     description=DESCRIPTION,
+     long_description=LONG_DESCRIPTION,
+     long_description_content_type="text/markdown",
+     url="https://github.com/yixiangchen1995/python-Sigilyph",
+     packages=(
+         find_packages()
+     ),
+     include_package_data=True,
+     install_requires=[
+         'g2p_en',
+         'jieba',
+         'jieba_fast',
+         'pypinyin',
+         'WeTextProcessing==1.0.3',
+         'langid'
+     ],  # runtime dependencies
+
+     keywords=['python', 'first package'],
+     classifiers=[
+         "Development Status :: 3 - Alpha",
+         "Intended Audience :: Developers",
+         'Programming Language :: Python',
+         'Programming Language :: Python :: 3.10',
+     ]
+ )
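The MANIFEST.in entries above, together with include_package_data=True here, are what ship the cache_dir contents, special_dict.json, and py2phone.dict inside the distribution. A minimal sketch to verify that from an installed copy (assuming only that the package installs cleanly):

# Check that the data files declared in MANIFEST.in are present in the
# installed package; importlib_resources is already used by norm_func.py below.
from importlib_resources import files

base = files('sigilyph')
print((base / 'core' / 'special_dict.json').is_file())
print((base / 'core' / 'py2phone.dict').is_file())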
@@ -0,0 +1,16 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/__init__.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:07
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-09-25 17:22:30
+ '''
+
+ from sigilyph.core.sigilyph_class import Sigilyph
+
+ from sigilyph.core.symbols import all_phone_dict
+
+ from sigilyph.core.bert_align import AlignBert
+
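A quick smoke test of the public surface re-exported here (a sketch; Sigilyph's constructor arguments are not visible in this diff, so nothing is instantiated, and importing the package also builds the text normalizers in norm_func.py, which can take a moment):

# Import the three re-exported names and peek at the phone inventory.
from sigilyph import Sigilyph, all_phone_dict, AlignBert

print(Sigilyph, AlignBert)
print(len(all_phone_dict))  # assumed to be a dict of all supported phones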
File without changes
@@ -0,0 +1,163 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/bert_align.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-09-24 15:13:38
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-09-28 11:05:20
+ '''
+
+ import torch
+ from transformers import BertTokenizer, BertModel
+
+ import langid
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.core.norm_func import text_norm_cn
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def _g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict.keys():
+             outlist.extend(cn_word2phone_dict[pp])
+         else:
+             for ch in pp:
+                 if ch in cn_word2phone_dict.keys():
+                     outlist.extend(cn_word2phone_dict[ch])
+                 else:
+                     outlist.append('sil')
+     return outlist
+
+ def g2p_word(word):
+     tmp_lang = langid.classify(word)[0]
+     if tmp_lang in ['zh', 'jp', 'ja']:
+         tmp_lang = 'zh'
+     else:
+         tmp_lang = 'en'
+     if tmp_lang == 'zh':
+         return _g2p_cn(word)
+     else:
+         return _g2p_en(word)
+
+ class AlignBert():
+     def __init__(self, tkn_cache_dir, vocab_file):
+
+         #tkn_cache_dir = "./tmp/"
+         self.vocab_dict = self.load_vocab(vocab_file)
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir=tkn_cache_dir)
+         self.g2p_word = g2p_word
+         self.symbol_list = ['[CLS]', '[SEP]', '?', '!', '.', ',', ',', '。']
+
+         self.empty_bert = torch.zeros([768])
+
+     def gen_seqbert(self, mfa_phones, text, bert):
+         norm_text = text_norm_cn(text)
+         midph_list, midph2bertid_dict = self.get_midph(norm_text)
+         phoneme2midph_dict = self.get_phoneme2midph_dict(mfa_phones, midph_list)
+         phoneme2bertid_dict = self.get_phoneme2bertid_dict(midph2bertid_dict, phoneme2midph_dict)
+         seqbert = []
+         for idx in range(len(phoneme2bertid_dict)):
+             bertid = phoneme2bertid_dict[idx]
+             if bertid >= 0:
+                 seqbert.append(bert[bertid])
+             else:
+                 seqbert.append(self.empty_bert)
+         seqbert = torch.stack(seqbert)
+         return seqbert
+
+     def load_vocab(self, vocab_file):
+         vocab_dict = {}
+         with open(vocab_file, 'r') as ovf:
+             lines = ovf.readlines()
+             for idx in range(len(lines)):
+                 line = lines[idx]
+                 tt = line.strip()
+                 vocab_dict[idx] = tt
+             del lines
+         return vocab_dict
+
+     def id2text(self, idlist):
+         outlist = []
+         for idx in range(len(idlist)):
+             outlist.append(self.vocab_dict[int(idlist[idx])])
+         return outlist
+
+     def get_midph(self, text):
+         encoded_input = self.tokenizer(text, return_tensors='pt')
+         ret = self.id2text(encoded_input['input_ids'][0])
+         wordph_list = []
+         for word in ret[1:-1]:
+             word_phoneme = self.g2p_word(word)
+             wordph_list.append(word_phoneme)
+         midph_list = []
+         midph2bertid_dict = {}
+         midph_list.append(ret[0])
+         midph2bertid_dict[0] = 0
+         for widx in range(len(wordph_list)):
+             for phidx in range(len(wordph_list[widx])):
+                 phoneme = wordph_list[widx][phidx]
+                 midph_list.append(phoneme)
+                 midph2bertid_dict[len(midph_list)-1] = widx+1
+         midph_list.append(ret[-1])
+         midph2bertid_dict[len(midph_list)-1] = len(ret)-1
+         return midph_list, midph2bertid_dict
+
+     def get_phoneme2midph_dict(self, mfa_phones, midph_list):
+         fixed_midph_list = []
+         for idx in range(len(midph_list)):
+             if midph_list[idx] in self.symbol_list:
+                 fixed_midph_list.append('sil')
+             else:
+                 fixed_midph_list.append(midph_list[idx])
+         phoneme2midph_dict = self.lcs(mfa_phones, fixed_midph_list)
+         return phoneme2midph_dict
+
+     def lcs(self, mfa_phones, midph_list):
+         n, m = len(midph_list), len(mfa_phones)
+         dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+         phoneme2midph_dict = {}
+         for idx in range(1, m+1):
+             phoneme2midph_dict[idx-1] = -1
+
+         for idx in range(1, m+1):
+             for midph_id in range(1, n+1):
+                 curr_ph = mfa_phones[idx-1]
+                 if curr_ph == midph_list[midph_id-1]:
+                     dp[idx][midph_id] = dp[idx-1][midph_id-1] + 1
+                 else:
+                     dp[idx][midph_id] = max(dp[idx-1][midph_id], dp[idx][midph_id-1])
+         n, m = len(midph_list), len(mfa_phones)
+         while m > 0 and n > 0:
+             if mfa_phones[m-1] == midph_list[n-1] and dp[m][n] == dp[m-1][n-1] + 1:
+                 phoneme2midph_dict[m-1] = n-1
+                 m, n = m-1, n-1
+                 continue
+             if dp[m][n] == dp[m-1][n]:
+                 m, n = m-1, n
+                 continue
+             if dp[m][n] == dp[m][n-1]:
+                 m, n = m, n-1
+                 continue
+         return phoneme2midph_dict
+
+
+     def get_phoneme2bertid_dict(self, midph2bertid_dict, phoneme2midph_dict):
+         phoneme2bertid_dict = {}
+         for idx in range(len(phoneme2midph_dict)):
+             phoneme_id = idx
+             midph_id = phoneme2midph_dict[phoneme_id]
+             #if midph_id not in midph2bertid_dict.keys():
+             if midph_id == -1:
+                 phoneme2bertid_dict[idx] = -1
+             else:
+                 phoneme2bertid_dict[idx] = midph2bertid_dict[midph_id]
+         return phoneme2bertid_dict
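AlignBert tokenizes the normalized text with the multilingual BERT tokenizer, expands each wordpiece to phones via g2p_word, LCS-aligns those phones against an MFA phone sequence, and then gathers one 768-dim BERT vector per phone (zeros where no match). A usage sketch; the paths are hypothetical, and the MFA phone list below is illustrative (real values come from an MFA alignment that uses the same phone set as cn_word2phone_dict):

# Hypothetical paths: any writable cache dir, plus the tokenizer's vocab.txt
# (one token per line, matching load_vocab's line-number -> token scheme).
import torch
from transformers import BertModel, BertTokenizer
from sigilyph import AlignBert

cache_dir = "./bert_cache"             # hypothetical cache directory
vocab_file = "./bert_cache/vocab.txt"  # hypothetical vocab of bert-base-multilingual-cased
aligner = AlignBert(cache_dir, vocab_file)

text = "你好。"
mfa_phones = ['n', 'i3', 'h', 'ao3', 'sil']  # illustrative MFA phone sequence

# One 768-dim vector per wordpiece token (including [CLS]/[SEP]), row index
# matching the token positions that id2text/get_midph work with.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir=cache_dir)
model = BertModel.from_pretrained('bert-base-multilingual-cased', cache_dir=cache_dir)
with torch.no_grad():
    out = model(**tokenizer(text, return_tensors='pt'))
bert = out.last_hidden_state[0]  # shape: [num_tokens, 768]

seqbert = aligner.gen_seqbert(mfa_phones, text, bert)
print(seqbert.shape)  # [len(mfa_phones), 768]; unmatched phones get zero vectors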
@@ -0,0 +1,47 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/g2p_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 16:55:51
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-08-12 14:42:02
+ '''
+
+ from g2p_en import G2p
+ _g2p_en = G2p()
+
+ def g2p_en(text, sp_sign='<sp>'):
+     phone_list = _g2p_en(text)
+     phone_list = [sp_sign if xx == " " else xx for xx in phone_list]
+     if len(phone_list) > 1 and phone_list[-1] != sp_sign:
+         phone_list.append(sp_sign)
+     return phone_list
+
+
+ from pypinyin import lazy_pinyin, Style
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch, cn_word2phone_dict
+ for punc in punctuation:
+     cn_word2phone_dict[punc] = punc
+
+ def g2p_cn(text):
+     pinyinlist = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
+     outlist = []
+     for pp in pinyinlist:
+         if pp in cn_word2phone_dict.keys():
+             outlist.extend(cn_word2phone_dict[pp])
+             outlist.append('<sp>')
+         else:
+             for ch in pp:
+                 outlist.extend(cn_word2phone_dict.get(ch, ['sil']))  # fall back to 'sil' for unknown chars
+             outlist.append('<sp>')
+     if len(outlist) > 4:
+         if outlist[-2] == 'sil' and outlist[-4] == 'sil':
+             outlist = outlist[:-2]
+     return outlist
+
+
+
+
+
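Both entry points return flat phone lists with '<sp>' as the word separator. A sketch of typical calls (exact phones depend on the g2p_en model and pypinyin, so the lists in the comments are indicative, not guaranteed):

from sigilyph.core.g2p_func import g2p_en, g2p_cn

print(g2p_en("Hello world"))
# e.g. ['HH', 'AH0', 'L', 'OW1', '<sp>', 'W', 'ER1', 'L', 'D', '<sp>']
print(g2p_cn("你好"))
# pinyin ['ni3', 'hao3'] expanded through cn_word2phone_dict, '<sp>'-separated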
@@ -0,0 +1,92 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/norm_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 17:50:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-09-26 14:45:16
+ '''
+
+
+ import re
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch
+
+ from tn.chinese.normalizer import Normalizer as ZhNormalizer
+ from tn.english.normalizer import Normalizer as EnNormalizer
+
+
+ import os
+ from importlib_resources import files
+ basedir = files('sigilyph')
+
+ #zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
+ #en_tn_model = EnNormalizer()
+ #zh_tn_model = ZhNormalizer(cache_dir='./sigilyph/core/cache_dir', remove_erhua=False, full_to_half=False)
+ #en_tn_model = EnNormalizer(cache_dir='./sigilyph/core/cache_dir')
+ zh_tn_model = ZhNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'), remove_erhua=False, full_to_half=False)
+ en_tn_model = EnNormalizer(cache_dir=os.path.join(basedir, 'core', 'cache_dir'))
+
+ import json
+ #import sys
+ #sys.path.append('text_front')
+ #with open('./special_dict.json', 'r', encoding="utf-8") as infi:
+ #with open('./text_front/special_dict.json', 'r', encoding="utf-8") as infi:
+ #with open('./sigilyph/core/special_dict.json', 'r', encoding="utf-8") as infi:
+ with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+     special_dict = json.load(infi)
+
+ def pro_norm(text, use_lang='zh'):
+     if use_lang == 'zh':
+         norm_text = zh_tn_model.normalize(text)
+         #print("zh ", norm_text)
+     else:
+         norm_text = en_tn_model.normalize(text)
+         #print("en ", norm_text)
+     return norm_text
+
+ def replace_with_dict(text, replace_dict):
+     for old, new in replace_dict.items():
+         text = text.replace(old, new)
+     return text
+ def replace_with_dict_re(text, replace_dict):
+     pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+     return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+ pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}
+ def preprocess_first_old(text, use_lang='zh'):
+     text = replace_with_dict(text, pre_replace_dict)
+     norm_text = pro_norm(text, use_lang)
+     #print(norm_text)
+     rep_text = replace_with_dict(norm_text, special_dict)
+     return rep_text
+
+ def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     #print(norm_text)
+     rep_text = replace_with_dict(norm_text, special_word_dict)
+     return rep_text
+
+
+ def normalizer(text):
+     return text
+
+ def replace_punc(text):
+     #text = text.replace("嗯", "恩").replace("呣", "母")
+     pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+     replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+     replaced_text = re.sub(
+         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+     )
+     return replaced_text
+
+ def text_norm_cn(text):
+     text = normalizer(text)
+     text = replace_punc(text)
+     return text
+
+ def text_norm_en(text):
+
+     return text
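End to end, the normalization path is: apply the pre-replacement dict, run the WeTextProcessing normalizer, then substitute special_dict entries. A sketch (outputs are indicative; they depend on WeTextProcessing 1.0.3 and the shipped special_dict.json):

from sigilyph.core.norm_func import preprocess_first_old, text_norm_cn

print(preprocess_first_old("价格是95元", use_lang='zh'))
# e.g. '价格是九十五元' after number verbalization and special_dict replacement
print(text_norm_cn("你好, world!"))
# replace_punc maps punctuation via punc_map_ch and strips characters outside
# the CJK range and the punctuation set, so 'world' is removed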
@@ -0,0 +1,66 @@
+
+
+
+ before_replace_dict = {
+     "AlphaFold-Plus": "AlphaFold Plus"
+ }
+
+ special_phrase = ['据了解']
+
+ special_word_dict = {
+     "iPhone": "[AY1 F OW0 N]",
+     "IOS": "[AY1 OW1 AE1 S]",
+     "A十七": "[EY1 sh ir2 q i1]",
+     "A seventeen": "[EY1 S EH1 V AH0 N T IY1 N]",
+     "CEO": "[S IY1 IY1 OW1]",
+     "AI": "[EY1 AY1]",
+     "ID": "[AY1 D IY1]",
+     "ABC": "[EY1 B IY1 S IY1]",
+     "VIP": "[V IY1 AY1 P IY1]",
+     "PDF": "[P IY1 D IY1 AE1 F]",
+     "NLP": "[EH1 NG EH2 L P IY1]",
+     "API": "[EY1 P IY1 AY1]",
+     "GPU": "[JH IY1 P IY1 Y UW1]",
+     "WeChat": "[W IY1 CH AE1 T]",
+     "PPT": "[P IY1 P IY1 T IY1]",
+     "CA": "[S IY1 EY1]",
+     ".com": "[d ian3 K AA1 M]",
+     ".zhang": "[ZH AA1 NG]",
+     "live": "[L AY0 V]",
+     "@": "[sil_1 AE1 T sil_1]",
+     "睡不着觉": "[sh ui4 b u4 zh e5 j iao4]",
+     "月经不调": "[y ve4 j ing1 b u4 t iao2]",
+     "长护险": "[ch ang2 h u4 x ian3]",
+     "长时间": "[ch ang2 sh ir2 j ian1]",
+     "长住外地": "[ch ang2 zh u4 w ai4 d i4]",
+     "长按": "[ch ang2 AA an4]",
+     "喉咙干疼": "[h ou2 l ong2 g an1 t eng2]",
+     "死对头": "[s ii3 d ui4 t ou5]",
+     "成名曲": "[ch eng2 m ing2 q v3]",
+     "古朴": "[g u3 p u3]",
+     "啊": "[AA a1]",
+     "sinα": "[S AY1 N AH2 AE1 L F a3]",
+     "cosα": "[K OW0 S AY1 N AH2 AE1 L F a3]",
+     "tanα": "[T AE1 N JH AH0 N T AH2 AE1 L F a3]",
+     "α": "[AE1 L F a3]",
+     "Ⅰ": "[y i1]",
+     "Ⅱ": "[EE er4]",
+     "qq": "[K Y UW1 K Y UW1]",
+     "≠": "[b u4 d eng3 y v2]",
+     "Beijing": "[b ei3 j ing1]",
+     "<sil>": "[sil_1]",
+     "--": "[sil]",
+     "=-=": "[sil]",
+     ":": "[sil_1]",
+     "-": "[sil]",
+     "(": "[sil]",
+     ")": "[sil]",
+     "“": "[sil_1]",
+     "”": "[sil_1]",
+     "《": "[sil_1]",
+     "》": "[sil_1]",
+     "(": "[sil]",
+     ")": "[sil]",
+     ":": "[sil_1]",
+ }
+
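These tables plug into preprocess_first from norm_func.py. Note that replace_with_dict_re tries alternatives in dict insertion order, so "IOS" (listed first) wins over the shorter "ID" entry at the same position. A sketch, assuming this module's special_word_dict is in scope:

from sigilyph.core.norm_func import replace_with_dict_re

text = "请用iPhone登录IOS的ID"
print(replace_with_dict_re(text, special_word_dict))
# -> '请用[AY1 F OW0 N]登录[AY1 OW1 AE1 S]的[AY1 D IY1]'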
@@ -0,0 +1,16 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/core/preprocess.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-05-13 11:01:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2025-05-14 20:26:27
+ '''
+
+ def replace_proper(text, namedict):
+     for k, v in namedict.items():
+         text = text.replace(k, v)
+     return text
+
+
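replace_proper is a thin wrapper over str.replace that lets callers force fixed pronunciations for proper nouns. A sketch with a hypothetical mapping:

from sigilyph.core.preprocess import replace_proper

namedict = {"Beijing": "[b ei3 j ing1]"}  # hypothetical proper-noun mapping
print(replace_proper("Welcome to Beijing", namedict))
# -> 'Welcome to [b ei3 j ing1]'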