sigilyph 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sigilyph/__init__.py +18 -0
- sigilyph/core/__init__.py +0 -0
- sigilyph/core/bert_align.py +163 -0
- sigilyph/core/g2p_func.py +47 -0
- sigilyph/core/norm_func_bk.py +98 -0
- sigilyph/core/predict.py +64 -0
- sigilyph/core/preprocess.py +16 -0
- sigilyph/core/py2phone.dict +2165 -0
- sigilyph/core/sigilyph_class.py +246 -0
- sigilyph/core/special_dict.json +26 -0
- sigilyph/core/symbols.py +445 -0
- sigilyph/core/text_process.py +328 -0
- sigilyph/fst_tool/__init__.py +0 -0
- sigilyph/fst_tool/infer_normalizer.py +49 -0
- sigilyph/fst_tool/processor.py +122 -0
- sigilyph/fst_tool/token_parser.py +159 -0
- sigilyph/text_norm/__init__.py +0 -0
- sigilyph/text_norm/norm_func.py +155 -0
- sigilyph/text_norm/norm_func_new.py +89 -0
- sigilyph/text_norm/sigilyph_norm.py +179 -0
- sigilyph-0.5.2.dist-info/METADATA +24 -0
- sigilyph-0.5.2.dist-info/RECORD +24 -0
- sigilyph-0.5.2.dist-info/WHEEL +5 -0
- sigilyph-0.5.2.dist-info/top_level.txt +1 -0
+++ sigilyph/text_norm/norm_func.py
@@ -0,0 +1,155 @@
+'''
+FilePath: /python-Sigilyph/sigilyph/text_norm/norm_func.py
+Description:
+Author: Yixiang Chen
+version:
+Date: 2025-03-31 17:50:26
+LastEditors: Yixiang Chen
+LastEditTime: 2026-01-19 20:05:35
+'''
+
+
+import re
+
+from sigilyph.core.symbols import punctuation, punc_map_ch
+
+from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+
+import os
+from importlib_resources import files
+basedir = files('sigilyph')
+
+zh_tn_model = ZhNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'), remove_erhua=False, full_to_half=False)
+en_tn_model = EnNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'))
+
+import json
+with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+    special_dict = json.load(infi)
+
+def pro_norm(text, use_lang='zh'):
+    if use_lang == 'zh':
+        norm_text = zh_tn_model.normalize(text)
+        #print("zh ", norm_text)
+    else:
+        norm_text = en_tn_model.normalize(text)
+        #print("en ", norm_text)
+    return norm_text
+
+def replace_with_dict(text, replace_dict):
+    for old, new in replace_dict.items():
+        text = text.replace(old, new)
+    return text
+
+def replace_with_dict_re(text, replace_dict):
+    pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+    return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+def replace_roman_1_to_10(text: str) -> str:
+    """
+    Replace the Roman numeral symbols Ⅰ-Ⅹ in the string with the Chinese numerals 一-十.
+    Every other character (including mojibake and the ASCII letters I/V/X) is left unchanged.
+    """
+    roman_to_cn = {
+        'Ⅰ': '一',
+        'Ⅱ': '二',
+        'Ⅲ': '三',
+        'Ⅳ': '四',
+        'Ⅴ': '五',
+        'Ⅵ': '六',
+        'Ⅶ': '七',
+        'Ⅷ': '八',
+        'Ⅸ': '九',
+        'Ⅹ': '十',
+    }
+    # Scan character by character: map what can be mapped, keep everything else as-is
+    return ''.join(roman_to_cn.get(ch, ch) for ch in text)
+
+pre_replace_dict = {"AlphaFold-Plus": "AlphaFold Plus"}
+def preprocess_first_old(text, use_lang='zh'):
+    text = replace_with_dict(text, pre_replace_dict)
+    norm_text = pro_norm(text, use_lang)
+    #print(norm_text)
+    rep_text = replace_with_dict(norm_text, special_dict)
+    return rep_text
+
+def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+    text = replace_with_dict(text, before_replace_dict)
+    norm_text = pro_norm(text, norm_use_lang)
+    #print(norm_text)
+    #rep_text = replace_with_dict(norm_text, special_word_dict)
+    return norm_text
+
+def post_process(text, special_word_dict):
+    rep_text = replace_with_dict(text, special_word_dict)
+    return rep_text
+
+
+def preprocess_first_for_norm(text, before_replace_dict, norm_use_lang='zh'):
+    text = replace_roman_1_to_10(text)
+    text = replace_with_dict(text, before_replace_dict)
+    return text
+
+def normalizer(text):
+    return text
+
+def replace_punc(text):
+    #text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+    replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+    return replaced_text
+
+def replace_punc_part(text: str) -> str:
+    """
+    Replace Chinese full stops, commas, semicolons and quotation marks
+    with their English counterparts:
+        。 -> .
+        , -> ,
+        ; -> ;
+        “ -> "
+        ” -> "
+        『 -> "
+        』 -> "
+        「 -> "
+        」 -> "
+    『』, 「」 and the like are all unified to the English double quote;
+    all other symbols are left untouched.
+    """
+    # Build the mapping table
+    mapping = {
+        '。': '.',
+        ',': ',',
+        ';': ';',
+        '“': '"',
+        '”': '"',
+        '「': '"',
+        '」': '"',
+        '『': '"',
+        '』': '"',
+    }
+
+    # Build the regex that matches every Chinese punctuation mark to replace
+    pattern = re.compile(r'[。,“”;「」『』]')
+
+    # Substitute with re.sub
+    return pattern.sub(lambda m: mapping[m.group(0)], text)
+
+'''
+def text_norm_cn(text):
+    text = normalizer(text)
+    text = replace_punc(text)
+    return text
+
+def text_norm_en(text):
+    return text
+'''
+
+def text_norm_cn(text):
+    norm_text = zh_tn_model.normalize(text)
+    norm_text = replace_punc_part(norm_text)
+    return norm_text
+
+def text_norm_en(text):
+    norm_text = en_tn_model.normalize(text)
+    return norm_text
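For orientation, a minimal usage sketch of this module, assuming sigilyph 0.5.2 is installed and the pynini grammar caches that ZhNormalizer and EnNormalizer load at import time are available (the two normalizers' outputs depend on those grammars; only the first call below is a pure string operation):

    from sigilyph.text_norm.norm_func import (
        replace_roman_1_to_10, replace_punc_part, text_norm_cn,
    )

    print(replace_roman_1_to_10("第Ⅲ章"))   # -> 第三章 (no models involved)
    print(replace_punc_part("你好,世界。"))  # -> 你好,世界. (punctuation mapped to ASCII)
    print(text_norm_cn("温度是25℃"))         # FST normalization, then punctuation mapping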
+++ sigilyph/text_norm/norm_func_new.py
@@ -0,0 +1,89 @@
+'''
+FilePath: /python-Sigilyph/sigilyph/text_norm/norm_func_new.py
+Description:
+Author: Yixiang Chen
+version:
+Date: 2025-03-31 17:50:26
+LastEditors: Yixiang Chen
+LastEditTime: 2026-01-19 10:09:56
+'''
+
+import os
+import json
+import re
+from importlib_resources import files
+from sigilyph.core.symbols import punctuation, punc_map_ch
+from sigilyph.fst_tool.infer_normalizer import ZhNormalizer, EnNormalizer
+
+basedir = files('sigilyph')
+zh_tn_model = ZhNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'), remove_erhua=False, full_to_half=False)
+en_tn_model = EnNormalizer(version_id='v2', cache_dir=os.path.join(basedir, 'text_norm', 'cache_dir'))
+
+#with open(os.path.join(basedir, 'core', 'special_dict.json'), 'r', encoding="utf-8") as infi:
+#    special_dict = json.load(infi)
+
+def pro_norm(text, use_lang='zh'):
+    """Normalize text based on the specified language."""
+    if use_lang == 'zh':
+        return zh_tn_model.normalize(text)
+    return en_tn_model.normalize(text)
+
+def replace_with_dict(text, replace_dict):
+    """Replace occurrences of keys in text with their corresponding values from replace_dict."""
+    for old, new in replace_dict.items():
+        text = text.replace(old, new)
+    return text
+
+def replace_with_dict_re(text, replace_dict):
+    """Replace occurrences of keys in text using regular expressions."""
+    pattern = re.compile("|".join(re.escape(key) for key in replace_dict.keys()))
+    return pattern.sub(lambda m: replace_dict[m.group(0)], text)
+
+def preprocess_first(text, before_replace_dict, special_word_dict, norm_use_lang='zh'):
+    """Preprocess text by replacing specified words and normalizing."""
+    text = replace_with_dict(text, before_replace_dict)
+    norm_text = pro_norm(text, norm_use_lang)
+    return replace_with_dict(norm_text, special_word_dict)
+
+def post_process(text, special_word_dict):
+    """Post-process text by replacing special words."""
+    return replace_with_dict(text, special_word_dict)
+
+def preprocess_first_for_norm(text, before_replace_dict):
+    """Preprocess text for normalization."""
+    return replace_with_dict(text, before_replace_dict)
+
+def normalizer(text):
+    """Placeholder for a normalizer function."""
+    return text
+
+def replace_punc(text):
+    """Replace Chinese punctuation with corresponding characters."""
+    pattern = re.compile("|".join(re.escape(p) for p in punc_map_ch.keys()))
+    replaced_text = pattern.sub(lambda x: punc_map_ch[x.group()], text)
+    return re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
+
+def replace_punc_part(text: str) -> str:
+    """Replace specific Chinese punctuation with English punctuation."""
+    mapping = {
+        '。': '.',
+        ',': ',',
+        ';': ';',
+        '“': '"',
+        '”': '"',
+        '「': '"',
+        '」': '"',
+        '『': '"',
+        '』': '"',
+    }
+    pattern = re.compile(r'[。,“”;「」『』]')
+    return pattern.sub(lambda m: mapping[m.group(0)], text)
+
+def text_norm_cn(text):
+    """Normalize Chinese text."""
+    norm_text = zh_tn_model.normalize(text)
+    return replace_punc_part(norm_text)
+
+def text_norm_en(text):
+    """Normalize English text."""
+    return en_tn_model.normalize(text)
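Both files keep a sequential and a single-pass replacement helper; the two differ when one rule's output matches another rule's key. A self-contained sketch of the difference (hypothetical replace_dict, standard library only):

    import re

    replace_dict = {"AI": "artificial intelligence", "intelligence": "INT"}
    text = "AI intelligence"

    # Sequential str.replace (replace_with_dict): later rules see earlier outputs.
    seq = text
    for old, new in replace_dict.items():
        seq = seq.replace(old, new)
    print(seq)  # artificial INT INT

    # Single-pass regex substitution (replace_with_dict_re): each input span is
    # rewritten at most once, so replacements cannot cascade.
    pattern = re.compile("|".join(re.escape(k) for k in replace_dict))
    print(pattern.sub(lambda m: replace_dict[m.group(0)], text))  # artificial intelligence INT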
+++ sigilyph/text_norm/sigilyph_norm.py
@@ -0,0 +1,179 @@
+'''
+FilePath: /python-Sigilyph/sigilyph/text_norm/sigilyph_norm.py
+Description:
+Author: Yixiang Chen
+version:
+Date: 2026-01-07 15:46:04
+LastEditors: Yixiang Chen
+LastEditTime: 2026-01-19 19:55:00
+'''
+
+import langid
+import re
+import jieba
+import os
+
+from sigilyph.text_norm.norm_func import preprocess_first_for_norm, text_norm_en, text_norm_cn
+from sigilyph.core.predict import special_phrase
+
+norm_func_dict = {
+    'en': text_norm_en,
+    'zh': text_norm_cn
+}
+
+import json
+from importlib_resources import files
+basedir = files('sigilyph')
+with open(os.path.join(basedir, 'text_norm', 'dict_special_word_polyphone.json'), 'r', encoding="utf-8") as infi:
+    dict_special_word_polyphone_json = json.load(infi)
+dict_special_word_polyphone = dict_special_word_polyphone_json['polyphone_config']
+with open(os.path.join(basedir, 'text_norm', 'dict_special_word_base.json'), 'r', encoding="utf-8") as infib:
+    dict_special_word_base_json = json.load(infib)
+dict_special_word_base = dict_special_word_base_json['base_config']
+
+def is_float_strip(s: str) -> bool:
+    s = s.strip()  # strip leading/trailing whitespace only
+    if not s:
+        return False
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+class SigilyphNormalizer:
+    def __init__(self, norm_use_dict) -> None:
+        self.sil1symbol = '-'
+        self.special_phrase = special_phrase
+
+        self.base_replace_dict = dict_special_word_base
+        self.base_replace_dict.update(dict_special_word_polyphone)
+
+        self.before_replace_dict = self.base_replace_dict
+        self.before_replace_dict.update(norm_use_dict)
+
+    def fix_replace_dict(self, new_before_replace_dict):
+        self.before_replace_dict = self.base_replace_dict
+        self.before_replace_dict.update(new_before_replace_dict)
+
+    def normalize(self, text, lang, norm_use_lang='zh'):
+        text = preprocess_first_for_norm(text, self.before_replace_dict, norm_use_lang=norm_use_lang)
+        multi_lang_text_list = self.text_split_lang(text, lang)
+        all_phone = []
+        outtext = ''
+        for text_split_dict in multi_lang_text_list:
+            use_lang = text_split_dict['lang']
+            use_text = text_split_dict['text_split']
+            if use_lang not in norm_func_dict.keys():
+                use_lang = 'zh'
+            use_text = self.text_norm(use_text, use_lang)
+            outtext += use_text
+        return outtext
+
+    ######## text norm #########
+    def text_norm(self, text, lang):
+        outtext = norm_func_dict[lang](text)
+        return outtext
+
+    def split_with_units(self, text, regex):
+        # Chinese numerals (common formal and colloquial variants)
+        CHINESE_NUM_CHARS = "零一二三四五六七八九十百千万亿两〇壹贰叁肆伍陆柒捌玖拾佰仟萬億"
+        # Unit pattern: extend as needed
+        unit_pattern = re.compile(r'^(\s*)(km/h|km|m/s|m|s|g|h|kg|mm|cm)\b')
+
+        pieces = re.findall(regex, text)
+        merged = []
+
+        for piece in pieces:
+            if merged:
+                m = unit_pattern.match(piece)
+                if m:
+                    # Last character of the previous piece
+                    last_char = merged[-1][-1]
+                    # Condition: the previous piece ends with a Han character,
+                    # an Arabic digit, or a Chinese numeral
+                    if (
+                        re.match(r'[\u4e00-\u9fff\u3400-\u4dbf0-9]', last_char)
+                        or last_char in CHINESE_NUM_CHARS
+                    ):
+                        # Merge the unit into the previous piece
+                        merged[-1] += m.group(1) + m.group(2)
+                        # Keep the remainder of the current piece (if any) separate
+                        rest = piece[m.end():]
+                        if rest:
+                            merged.append(rest)
+                        continue
+
+            merged.append(piece)
+
+        return merged
+
+    ############### split text in line with lang ##############
+    def text_split_lang(self, text, lang):
+        if lang == 'ZH' or lang == 'zh':
+            multi_lang_text_list = [{'lang': 'zh', 'text_split': text}]
+        elif lang == 'en':
+            multi_lang_text_list = [{'lang': 'en', 'text_split': text}]
+        else:
+            # Split out [phoneme] spans so they can be judged separately
+            # (note: re.split takes flags as a keyword argument; passing them
+            # positionally would set maxsplit instead)
+            pretext_split = re.split(r"(\[.*?\])", text, flags=re.I | re.M)
+            multi_lang_text_list = []
+            pretext_split = list(filter(None, pretext_split))
+            for utext in pretext_split:
+                if utext[0] != '[':
+                    #pattern = r'([a-zA-Z ,.\!\?]+|[\u4e00-\u9fa5 ,。,.\t \"\!\?\“\”\、]+)'
+                    #text_split = re.findall(pattern, utext)
+                    # Comments are kept outside the character classes: in
+                    # re.VERBOSE mode, whitespace and '#' inside [...] are literal.
+                    pattern = r'''(
+                        # ---------- Chinese chunk ----------
+                        # Han characters, digits, date/time symbols (- : ~ _),
+                        # Chinese and English commas/periods, common Chinese
+                        # punctuation, full-width space, half-width space,
+                        # ℃, / and %, and currency symbols (the letters of
+                        # CHF and R$ included).
+                        [\u4e00-\u9fff\u3400-\u4dbf0-9\-:~_,。!?:;、…“”‘’「」『』《》.【】()\u3000,\.\x20/%℃$££¥¥฿€₹₽CHFR$]+
+                        |
+                        # ---------- English chunk ----------
+                        # Letters, English punctuation, and the remaining
+                        # whitespace characters (tab/newline etc.; the plain
+                        # space belongs to the Chinese chunk).
+                        [a-zA-Z,\.!?;:'"\-\(\)\[\]/\\_@#\$%&\+\t\r\n\f\v]+
+                        |
+                        # ---------- Other chunk ----------
+                        # Anything not covered above (emoji, special symbols, ...).
+                        [^a-zA-Z0-9\u4e00-\u9fff\u3400-\u4dbf,。!?:;、…“”‘’「」『』《》.【】()\u3000\-:,\.\x20\t\r\n\f\v/%℃]+
+                    )'''
+                    regex = re.compile(pattern, re.VERBOSE)
+                    #text_split = re.findall(regex, utext)
+                    text_split = self.split_with_units(utext, regex)
+                    for idx in range(len(text_split)):
+                        tmpts = text_split[idx]
+                        #if tmpts.strip().isdigit():
+                        if is_float_strip(tmpts):
+                            tmp_lang = 'zh'
+                        else:
+                            tmp_lang = langid.classify(tmpts)[0]
+                        if tmp_lang in ['zh', 'jp', 'ja']:
+                            tmp_lang = 'zh'
+                            #tmpts = tmpts.replace(' ', self.sil1symbol)
+                        else:
+                            tmp_lang = 'en'
+                        multi_lang_text_list.append({'lang': tmp_lang, 'text_split': tmpts})
+                else:
+                    phones = utext[1:-1]
+                    multi_lang_text_list.append({'lang': 'phone', 'text_split': phones})
+        return multi_lang_text_list
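A standalone sketch of the unit-merging rule above, runnable without the package; the input list stands in for a hypothetical output of the language-chunk regex:

    import re

    # Same rule as split_with_units: a piece that starts with a physical unit is
    # glued onto the previous piece when that piece ends in a Han character,
    # an Arabic digit, or a Chinese numeral.
    CHINESE_NUM_CHARS = "零一二三四五六七八九十百千万亿两〇壹贰叁肆伍陆柒捌玖拾佰仟萬億"
    unit_pattern = re.compile(r'^(\s*)(km/h|km|m/s|m|s|g|h|kg|mm|cm)\b')

    def merge_units(pieces):
        merged = []
        for piece in pieces:
            m = unit_pattern.match(piece) if merged else None
            if m and (re.match(r'[\u4e00-\u9fff0-9]', merged[-1][-1])
                      or merged[-1][-1] in CHINESE_NUM_CHARS):
                merged[-1] += m.group(1) + m.group(2)  # glue the unit on
                rest = piece[m.end():]
                if rest:
                    merged.append(rest)
            else:
                merged.append(piece)
        return merged

    print(merge_units(["时速120", "km/h is fast"]))  # ['时速120km/h', ' is fast']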
+++ sigilyph-0.5.2.dist-info/METADATA
@@ -0,0 +1,24 @@
+Metadata-Version: 2.1
+Name: sigilyph
+Version: 0.5.2
+Summary: Text Front for TTS
+Home-page: https://github.com/yixiangchen1995/python-Sigilyph
+Author: Yixiang Chen
+Author-email: <yixiangchen1995@gmail.com>
+License: MIT
+Keywords: python,first package
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.10
+Description-Content-Type: text/markdown
+Requires-Dist: g2p-en
+Requires-Dist: jieba
+Requires-Dist: jieba-fast
+Requires-Dist: pypinyin
+Requires-Dist: pynini
+Requires-Dist: importlib-resources
+Requires-Dist: langid
+
+# python-Sigilyph
+A TTS text frontend for the author's own use
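A quick, hypothetical sanity check that the declared dependencies resolve after installation (pynini in particular is often easiest to obtain via conda):

    import importlib.util

    for mod in ("g2p_en", "jieba", "jieba_fast", "pypinyin", "pynini",
                "importlib_resources", "langid"):
        print(mod, "ok" if importlib.util.find_spec(mod) else "missing")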
+++ sigilyph-0.5.2.dist-info/RECORD
@@ -0,0 +1,24 @@
+sigilyph/__init__.py,sha256=sUTS6j9YR8pPW0US_QzS-jWYS0ztrt6AiAhkUJARgTo,403
+sigilyph/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sigilyph/core/bert_align.py,sha256=mIzqch3c-skHU9VmyNLjy-bRUt9eD8SqbAUXOZdaR-I,5741
+sigilyph/core/g2p_func.py,sha256=ccwnMA9VgK0xTWKPVR1Pwb5Gop-CNH5_ipHH8SupNUk,1235
+sigilyph/core/norm_func_bk.py,sha256=GRxsloM7mbGFVsLz9gtSA1-W6G9Ij3hzGvcuxe8q6As,3365
+sigilyph/core/predict.py,sha256=ZQZIjui7eHs8MteqBa5tJMW0nyFRfITPfH_xrmHwQZ0,1808
+sigilyph/core/preprocess.py,sha256=l55mqh6TK2cuFQhcl8ouys1-U9XfgNFwXLpFeKI17xk,316
+sigilyph/core/py2phone.dict,sha256=RdafObGTAX9OxvHkijDEzsvyvXnsJuY7aPpBKHgic9g,24894
+sigilyph/core/sigilyph_class.py,sha256=jmKtJf0QQK2N5_gHjTmV0T8y38EqnSp28ardm4xHl3k,9398
+sigilyph/core/special_dict.json,sha256=LtFVDr6OnW8aLCs3z3IzU6rs57aoW8QAiOcqc2bXwys,899
+sigilyph/core/symbols.py,sha256=4IHCCRTiKIaWdMmrUozELlDgkWHAo7l1tPT70Lc_5-w,53898
+sigilyph/core/text_process.py,sha256=ZmttWo89i43aaq30R6Ww2mqcl6O6EYiVP95Y9zOyr7A,11821
+sigilyph/fst_tool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sigilyph/fst_tool/infer_normalizer.py,sha256=g8KCFGTwg3AI4gJ7Fb7o3hTStA_0i3q6cqQ6-7-KkAc,2008
+sigilyph/fst_tool/processor.py,sha256=WtiTB7ZCqeUfTFlEhIK_xfBdfO2TsH5LFr5NIFYSR1Y,4735
+sigilyph/fst_tool/token_parser.py,sha256=vdI0_QAq4O1tRjsYQhxouehaqK1xt_VL84v2ikc3l5w,4494
+sigilyph/text_norm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sigilyph/text_norm/norm_func.py,sha256=hBCKBxaKNpKtPw4e3iGvFDweSxVu6M4LHW4c6cwM7Hw,4472
+sigilyph/text_norm/norm_func_new.py,sha256=K3Xzc9C9FeNc0vWafFbZYo1zYldV9IzlTWwcC9gcprM,3247
+sigilyph/text_norm/sigilyph_norm.py,sha256=gbRzJu_Pzm6f_IkA6_n0g4K-FaIn_4gY7NhXDAg9JUk,7572
+sigilyph-0.5.2.dist-info/METADATA,sha256=lNb9ZIPrHXimAru0j4PoROZNGYYUKAojdJ-eJ_vWjW0,703
+sigilyph-0.5.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sigilyph-0.5.2.dist-info/top_level.txt,sha256=caZwwDzakMbaNNk9MOK172HjSUj1HmJr3oK4iOGUyTo,9
+sigilyph-0.5.2.dist-info/RECORD,,
+++ sigilyph-0.5.2.dist-info/top_level.txt
@@ -0,0 +1 @@
+sigilyph