sigilyph 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/text_norm/norm_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 17:50:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-19 20:05:35
+ '''
+
+
+ import re
+
+ from sigilyph.core.symbols import punctuation, punc_map_ch
+
+ from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+
+ import os
+ from importlib_resources import files
+ basedir = files('sigilyph')
+
+ zh_tn_model = ZhNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'), remove_erhua=False, full_to_half=False)
+ en_tn_model = EnNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'))
+
+ import json
+ with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+     special_dict = json.load(infi)
+
+ def pro_norm(text, use_lang='zh'):
+     if use_lang == 'zh':
+         norm_text = zh_tn_model.normalize(text)
+         #print("zh ", norm_text)
+     else:
+         norm_text = en_tn_model.normalize(text)
+         #print("en ", norm_text)
+     return norm_text
+
+ def replace_with_dict(text, replace_dict):
+     for old, new in replace_dict.items():
+         text = text.replace(old, new)
+     return text
+
+ def replace_with_dict_re(text, replace_dict):
+     pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+     return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+ def replace_roman_1_to_10(text: str) -> str:
+     """
+     Replace the Roman numeral symbols Ⅰ-Ⅹ in the string with the Chinese numerals 一-十.
+     Every other character (including mojibake and the plain letters I/V/X) is kept unchanged.
+     """
+     roman_to_cn = {
+         'Ⅰ': '一',
+         'Ⅱ': '二',
+         'Ⅲ': '三',
+         'Ⅳ': '四',
+         'Ⅴ': '五',
+         'Ⅵ': '六',
+         'Ⅶ': '七',
+         'Ⅷ': '八',
+         'Ⅸ': '九',
+         'Ⅹ': '十',
+     }
+     # Scan character by character: map what can be mapped, keep the rest as-is.
+     return ''.join(roman_to_cn.get(ch, ch) for ch in text)
+
+ pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}
+ def preprocess_first_old(text, use_lang='zh'):
+     text = replace_with_dict(text, pre_replace_dict)
+     norm_text = pro_norm(text, use_lang)
+     #print(norm_text)
+     rep_text = replace_with_dict(norm_text, special_dict)
+     return rep_text
+
+ def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     #print(norm_text)
+     #rep_text = replace_with_dict(norm_text, special_word_dict)
+     return norm_text
+
+ def post_process(text, special_word_dict):
+     rep_text = replace_with_dict(text, special_word_dict)
+     return rep_text
+
+
+ def preprocess_first_for_norm(text, before_replace_dict, norm_use_lang='zh'):
+     text = replace_roman_1_to_10(text)
+     text = replace_with_dict(text, before_replace_dict)
+     return text
+
+ def normalizer(text):
+     return text
+
+ def replace_punc(text):
+     #text = text.replace("嗯", "恩").replace("呣", "母")
+     pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+     replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+     replaced_text = re.sub(
+         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+     )
+     return replaced_text
+
+ def replace_punc_part(text: str) -> str:
+     """
+     Replace Chinese full stops, commas, semicolons, and quotation marks with ASCII equivalents:
+         。 -> .
+         , -> ,
+         ; -> ;
+         “ -> "
+         ” -> "
+         『 -> "
+         』 -> "
+         「 -> "
+         」 -> "
+     The quote pairs “”, 「」, and 『』 are all unified to the ASCII double quote.
+     Other symbols are left untouched.
+     """
+     # Mapping table
+     mapping = {
+         '。': '.',
+         ',': ',',
+         ';': ';',
+         '“': '"',
+         '”': '"',
+         '「': '"',
+         '」': '"',
+         '『': '"',
+         '』': '"',
+     }
+
+     # Build a regex matching every Chinese punctuation mark to be replaced
+     pattern = re.compile(r'[。,“”;「」『』]')
+
+     # Substitute via re.sub
+     return pattern.sub(lambda m: mapping[m.group(0)], text)
+
+ '''
+ def text_norm_cn(text):
+     text = normalizer(text)
+     text = replace_punc(text)
+     return text
+
+ def text_norm_en(text):
+     return text
+ '''
+
+ def text_norm_cn(text):
+     norm_text = zh_tn_model.normalize(text)
+     norm_text = replace_punc_part(norm_text)
+     return norm_text
+
+ def text_norm_en(text):
+     norm_text = en_tn_model.normalize(text)
+     return norm_text
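
A minimal usage sketch for the module above (the input strings are hypothetical; the functions are the ones defined in this file, and importing the module assumes the v2 normalizer cache directories ship inside the wheel):

    # Sketch: exercising norm_func.py with hypothetical inputs.
    from sigilyph.text_norm.norm_func import (
        replace_roman_1_to_10,
        replace_with_dict,
        text_norm_cn,
    )

    # Roman numeral symbols Ⅰ-Ⅹ map to Chinese numerals; other text is untouched.
    assert replace_roman_1_to_10("第Ⅲ章") == "第三章"

    # Plain sequential string replacement, applied in dict order.
    assert replace_with_dict("AlphaFold-Plus", {"AlphaFold-Plus": "AlphaFold Plus"}) == "AlphaFold Plus"

    # Chinese normalization: FST-based text normalization, then punctuation unification.
    print(text_norm_cn("今天气温2.5℃。"))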
@@ -0,0 +1,89 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/text_norm/norm_func.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2025-03-31 17:50:26
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-19 10:09:56
+ '''
+
+ import os
+ import json
+ import re
+ from importlib_resources import files
+ from sigilyph.core.symbols import punctuation, punc_map_ch
+ from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+ basedir = files('sigilyph')
+ zh_tn_model = ZhNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'), remove_erhua=False, full_to_half=False)
+ en_tn_model = EnNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'))
+
+ #with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+ #    special_dict = json.load(infi)
+
+ def pro_norm(text, use_lang='zh'):
+     """Normalize text based on the specified language."""
+     if use_lang == 'zh':
+         return zh_tn_model.normalize(text)
+     return en_tn_model.normalize(text)
+
+ def replace_with_dict(text, replace_dict):
+     """Replace occurrences of keys in text with their corresponding values from replace_dict, one rule at a time."""
+     for old, new in replace_dict.items():
+         text = text.replace(old, new)
+     return text
+
+ def replace_with_dict_re(text, replace_dict):
+     """Replace occurrences of keys in text in a single pass using one compiled regular expression."""
+     pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+     return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+ def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+     """Preprocess text by replacing specified words and normalizing."""
+     text = replace_with_dict(text, before_replace_dict)
+     norm_text = pro_norm(text, norm_use_lang)
+     return replace_with_dict(norm_text, special_word_dict)
+
+ def post_process(text, special_word_dict):
+     """Post-process text by replacing special words."""
+     return replace_with_dict(text, special_word_dict)
+
+ def preprocess_first_for_norm(text, before_replace_dict):
+     """Preprocess text for normalization."""
+     return replace_with_dict(text, before_replace_dict)
+
+ def normalizer(text):
+     """Placeholder for a normalizer function."""
+     return text
+
+ def replace_punc(text):
+     """Map Chinese punctuation through punc_map_ch, then strip every character that is neither a CJK character nor listed punctuation."""
+     pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+     replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+     return re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
+
+ def replace_punc_part(text: str) -> str:
+     """Replace specific Chinese punctuation with English punctuation."""
+     mapping = {
+         '。': '.',
+         ',': ',',
+         ';': ';',
+         '“': '"',
+         '”': '"',
+         '「': '"',
+         '」': '"',
+         '『': '"',
+         '』': '"',
+     }
+     pattern = re.compile(r'[。,“”;「」『』]')
+     return pattern.sub(lambda m: mapping[m.group(0)], text)
+
+ def text_norm_cn(text):
+     """Normalize Chinese text."""
+     norm_text = zh_tn_model.normalize(text)
+     return replace_punc_part(norm_text)
+
+ def text_norm_en(text):
+     """Normalize English text."""
+     return en_tn_model.normalize(text)
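
One behavioral difference in this tidied copy of the module (per RECORD it most likely ships as sigilyph/text_norm/norm_func_new.py, although its FilePath header still says norm_func.py) is worth spelling out: replace_with_dict applies rules sequentially, so a later rule can rewrite the output of an earlier one, whereas replace_with_dict_re substitutes all keys in a single pass over the original text. A sketch with a hypothetical rule dict:

    # Sketch: order-sensitivity of the two replacement helpers (hypothetical rules;
    # the module path assumes this file is norm_func_new.py as listed in RECORD).
    from sigilyph.text_norm.norm_func_new import replace_with_dict, replace_with_dict_re

    rules = {"a": "b", "b": "c"}
    print(replace_with_dict("a", rules))     # "c": "a" -> "b", then "b" -> "c"
    print(replace_with_dict_re("a", rules))  # "b": one pass, replacements are not re-matched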
@@ -0,0 +1,179 @@
+ '''
+ FilePath: /python-Sigilyph/sigilyph/text_norm/sigilyph_norm.py
+ Description:
+ Author: Yixiang Chen
+ version:
+ Date: 2026-01-07 15:46:04
+ LastEditors: Yixiang Chen
+ LastEditTime: 2026-01-19 19:55:00
+ '''
+
+ import langid
+ import re
+ import jieba
+ import os
+
+ from sigilyph.text_norm.norm_func import preprocess_first_for_norm, text_norm_en, text_norm_cn
+ from sigilyph.core.predict import special_phrase
+
+ norm_func_dict = {
+     'en': text_norm_en,
+     'zh': text_norm_cn
+ }
+
+ import json
+ from importlib_resources import files
+ basedir = files('sigilyph')
+ with open(os.path.join(basedir, 'text_norm', 'dict_special_word_polyphone.json'), 'r', encoding="utf-8") as infi:
+     dict_special_word_polyphone_json = json.load(infi)
+ dict_special_word_polyphone = dict_special_word_polyphone_json['polyphone_config']
+ with open(os.path.join(basedir, 'text_norm', 'dict_special_word_base.json'), 'r', encoding="utf-8") as infib:
+     dict_special_word_base_json = json.load(infib)
+ dict_special_word_base = dict_special_word_base_json['base_config']
+
+ def is_float_strip(s: str) -> bool:
+     s = s.strip()  # strip only leading and trailing whitespace
+     if not s:
+         return False
+     try:
+         float(s)
+         return True
+     except ValueError:
+         return False
+
+ class SigilyphNormalizer:
+     def __init__(self, norm_use_dict) -> None:
+         self.sil1symbol = '-'
+         self.special_phrase = special_phrase
+
+         self.base_replace_dict = dict_special_word_base
+         self.base_replace_dict.update(dict_special_word_polyphone)
+
+         self.before_replace_dict = self.base_replace_dict
+         self.before_replace_dict.update(norm_use_dict)
+
+     def fix_replace_dict(self, new_before_replace_dict):
+         self.before_replace_dict = self.base_replace_dict
+         self.before_replace_dict.update(new_before_replace_dict)
+
+     def normalize(self, text, lang, norm_use_lang='zh'):
+         text = preprocess_first_for_norm(text, self.before_replace_dict, norm_use_lang=norm_use_lang)
+         multi_lang_text_list = self.text_split_lang(text, lang)
+         all_phone = []
+         outtext = ''
+         for text_split_dict in multi_lang_text_list:
+             use_lang = text_split_dict['lang']
+             use_text = text_split_dict['text_split']
+             if use_lang not in norm_func_dict.keys():
+                 use_lang = 'zh'
+             use_text = self.text_norm(use_text, use_lang)
+             outtext += use_text
+         return outtext
+
+     ######## text norm #########
+     def text_norm(self, text, lang):
+         outtext = norm_func_dict[lang](text)
+         return outtext
+
+     def split_with_units(self, text, regex):
+         # Chinese numerals (common formal and colloquial variants)
+         CHINESE_NUM_CHARS = "零一二三四五六七八九十百千万亿两〇壹贰叁肆伍陆柒捌玖拾佰仟萬億"
+         # Unit pattern: extend as needed
+         unit_pattern = re.compile(r'^(\s*)(km/h|km|m/s|m|s|g|h|kg|mm|cm)\b')
+
+         pieces = re.findall(regex, text)
+         merged = []
+
+         for piece in pieces:
+             if merged:
+                 m = unit_pattern.match(piece)
+                 if m:
+                     # Last character of the previous chunk
+                     last_char = merged[-1][-1]
+                     # Condition: the previous chunk ends with a CJK character,
+                     # an Arabic digit, or a Chinese numeral
+                     if (
+                         re.match(r'[\u4e00-\u9fff\u3400-\u4dbf0-9]', last_char)
+                         or last_char in CHINESE_NUM_CHARS
+                     ):
+                         # Merge the unit into the previous chunk
+                         merged[-1] += m.group(1) + m.group(2)
+                         # Keep the remainder of the current chunk (if any) as its own piece
+                         rest = piece[m.end():]
+                         if rest:
+                             merged.append(rest)
+                         continue
+
+             merged.append(piece)
+
+         return merged
+
+     ############### split text in line with lang ##############
+     def text_split_lang(self, text, lang):
+         if lang == 'ZH' or lang == 'zh':
+             multi_lang_text_list = [{'lang': 'zh', 'text_split': text}]
+         elif lang == 'en':
+             multi_lang_text_list = [{'lang': 'en', 'text_split': text}]
+         else:
+             # Spans wrapped in [...] are treated as phoneme sequences
+             pretext_split = re.split(r"(\[.*?\])", text, flags=re.I | re.M)
+             multi_lang_text_list = []
+             pretext_split = list(filter(None, pretext_split))
+             for utext in pretext_split:
+                 if utext[0] != '[':
+                     #pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
+                     #text_split = re.findall(pattern, utext)
+                     # Each character class is kept on a single line: in re.VERBOSE mode,
+                     # whitespace and '#' are literal inside a class, so inline comments
+                     # there would silently become part of the class.
+                     pattern = r'''(
+                         # ---------- Chinese block ----------
+                         # CJK characters, digits, date/time symbols (- : ~ _), Chinese and
+                         # ASCII commas/periods, common Chinese punctuation, full-width and
+                         # ASCII spaces, / and %, ℃, and currency signs
+                         [\u4e00-\u9fff\u3400-\u4dbf0-9\-:~_,。!?:;、…“”‘’「」『』《》.【】()\u3000,\.\x20/%℃$££¥¥฿€₹₽CHFR$]+
+                         |
+                         # ---------- English block ----------
+                         # Letters, English punctuation, and non-space whitespace (tabs, newlines, ...)
+                         [a-zA-Z,\.!?;:'"\-\(\)\[\]/\\_@#\$%&\+\t\r\n\f\v]+
+                         |
+                         # ---------- Other block ----------
+                         # Anything not covered by the two classes above (emoji, special symbols, ...)
+                         [^a-zA-Z0-9\u4e00-\u9fff\u3400-\u4dbf,。!?:;、…“”‘’「」『』《》.【】()\u3000\-:,\.\x20\t\r\n\f\v/%℃]+
+                     )'''
+                     regex = re.compile(pattern, re.VERBOSE)
+                     #text_split = re.findall(regex, utext)
+                     text_split = self.split_with_units(utext, regex)
+                     for idx in range(len(text_split)):
+                         tmpts = text_split[idx]
+                         #if tmpts.strip().isdigit():
+                         if is_float_strip(tmpts):
+                             tmp_lang = 'zh'
+                         else:
+                             tmp_lang = langid.classify(tmpts)[0]
+                         if tmp_lang in ['zh', 'jp', 'ja']:
+                             tmp_lang = 'zh'
+                             #tmpts = tmpts.replace(' ', self.sil1symbol)
+                         else:
+                             tmp_lang = 'en'
+                         multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
+                 else:
+                     phones = utext[1:-1]
+                     multi_lang_text_list.append({'lang': 'phone', 'text_split': phones})
+         return multi_lang_text_list
+
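
A usage sketch for the class above (hypothetical inputs; 'mix' is an arbitrary lang value, since anything other than 'zh'/'ZH'/'en' routes into the language-splitting branch, where [...] spans are passed through as phoneme sequences):

    # Sketch: driving SigilyphNormalizer with hypothetical inputs.
    from sigilyph.text_norm.sigilyph_norm import SigilyphNormalizer

    norm = SigilyphNormalizer(norm_use_dict={})

    # Whole string treated as Chinese: FST normalization plus punctuation mapping.
    print(norm.normalize("时速约120km/h。", lang='zh'))

    # Mixed input: split into per-language chunks, each normalized by its own backend;
    # chunks classified as neither 'zh' nor 'en' fall back to the Chinese normalizer.
    print(norm.normalize("Hello world, 今天气温23℃。", lang='mix'))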
@@ -0,0 +1,24 @@
+ Metadata-Version: 2.1
+ Name: sigilyph
+ Version: 0.5.2
+ Summary: Text Frontend for TTS
+ Home-page: https://github.com/yixiangchen1995/python-Sigilyph
+ Author: Yixiang Chen
+ Author-email: <yixiangchen1995@gmail.com>
+ License: MIT
+ Keywords: python,first package
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: g2p-en
+ Requires-Dist: jieba
+ Requires-Dist: jieba-fast
+ Requires-Dist: pypinyin
+ Requires-Dist: pynini
+ Requires-Dist: importlib-resources
+ Requires-Dist: langid
+
+ # python-Sigilyph
+ A TTS text frontend for personal use
@@ -0,0 +1,24 @@
+ sigilyph/__init__.py,sha256=sUTS6j9YR8pPW0US_QzS-jWYS0ztrt6AiAhkUJARgTo,403
+ sigilyph/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sigilyph/core/bert_align.py,sha256=mIzqch3c-skHU9VmyNLjy-bRUt9eD8SqbAUXOZdaR-I,5741
+ sigilyph/core/g2p_func.py,sha256=ccwnMA9VgK0xTWKPVR1Pwb5Gop-CNH5_ipHH8SupNUk,1235
+ sigilyph/core/norm_func_bk.py,sha256=GRxsloM7mbGFVsLz9gtSA1-W6G9Ij3hzGvcuxe8q6As,3365
+ sigilyph/core/predict.py,sha256=ZQZIjui7eHs8MteqBa5tJMW0nyFRfITPfH_xrmHwQZ0,1808
+ sigilyph/core/preprocess.py,sha256=l55mqh6TK2cuFQhcl8ouys1-U9XfgNFwXLpFeKI17xk,316
+ sigilyph/core/py2phone.dict,sha256=RdafObGTAX9OxvHkijDEzsvyvXnsJuY7aPpBKHgic9g,24894
+ sigilyph/core/sigilyph_class.py,sha256=jmKtJf0QQK2N5_gHjTmV0T8y38EqnSp28ardm4xHl3k,9398
+ sigilyph/core/special_dict.json,sha256=LtFVDr6OnW8aLCs3z3IzU6rs57aoW8QAiOcqc2bXwys,899
+ sigilyph/core/symbols.py,sha256=4IHCCRTiKIaWdMmrUozELlDgkWHAo7l1tPT70Lc_5-w,53898
+ sigilyph/core/text_process.py,sha256=ZmttWo89i43aaq30R6Ww2mqcl6O6EYiVP95Y9zOyr7A,11821
+ sigilyph/fst_tool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sigilyph/fst_tool/infer_normalizer.py,sha256=g8KCFGTwg3AI4gJ7Fb7o3hTStA_0i3q6cqQ6-7-KkAc,2008
+ sigilyph/fst_tool/processor.py,sha256=WtiTB7ZCqeUfTFlEhIK_xfBdfO2TsH5LFr5NIFYSR1Y,4735
+ sigilyph/fst_tool/token_parser.py,sha256=vdI0_QAq4O1tRjsYQhxouehaqK1xt_VL84v2ikc3l5w,4494
+ sigilyph/text_norm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sigilyph/text_norm/norm_func.py,sha256=hBCKBxaKNpKtPw4e3iGvFDweSxVu6M4LHW4c6cwM7Hw,4472
+ sigilyph/text_norm/norm_func_new.py,sha256=K3Xzc9C9FeNc0vWafFbZYo1zYldV9IzlTWwcC9gcprM,3247
+ sigilyph/text_norm/sigilyph_norm.py,sha256=gbRzJu_Pzm6f_IkA6_n0g4K-FaIn_4gY7NhXDAg9JUk,7572
+ sigilyph-0.5.2.dist-info/METADATA,sha256=lNb9ZIPrHXimAru0j4PoROZNGYYUKAojdJ-eJ_vWjW0,703
+ sigilyph-0.5.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ sigilyph-0.5.2.dist-info/top_level.txt,sha256=caZwwDzakMbaNNk9MOK172HjSUj1HmJr3oK4iOGUyTo,9
+ sigilyph-0.5.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: bdist_wheel (0.43.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1 @@
+ sigilyph