tugaphone 0.0.2a1__py3-none-any.whl → 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tugaphone/__init__.py +66 -60
- tugaphone/lexicon.py +169 -0
- tugaphone/number_utils.py +320 -0
- tugaphone/pos.py +154 -0
- tugaphone/regional_dict.csv +271 -271
- tugaphone/syl.py +1203 -0
- tugaphone/tokenizer.py +3689 -0
- tugaphone/version.py +2 -2
- tugaphone-0.1.0a1.dist-info/METADATA +12 -0
- tugaphone-0.1.0a1.dist-info/RECORD +12 -0
- {tugaphone-0.0.2a1.dist-info → tugaphone-0.1.0a1.dist-info}/WHEEL +1 -1
- tugaphone/espeak.py +0 -164
- tugaphone/util.py +0 -713
- tugaphone-0.0.2a1.dist-info/METADATA +0 -8
- tugaphone-0.0.2a1.dist-info/RECORD +0 -9
- {tugaphone-0.0.2a1.dist-info → tugaphone-0.1.0a1.dist-info}/top_level.txt +0 -0
tugaphone/version.py
CHANGED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tugaphone
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Home-page: https://github.com/TigreGotico/tugaphone
|
|
5
|
+
Author: JarbasAi
|
|
6
|
+
Author-email: jarbasai@mailfence.com
|
|
7
|
+
Requires-Dist: brill-postagger
|
|
8
|
+
Requires-Dist: unicode-rbnf
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: author-email
|
|
11
|
+
Dynamic: home-page
|
|
12
|
+
Dynamic: requires-dist
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
tugaphone/__init__.py,sha256=7VnC2a-ou5Y0mYxHtHOSBsidiYFklZVRZVLr78tViVY,5282
|
|
2
|
+
tugaphone/lexicon.py,sha256=rwZQtU1mP7PboMu2480e9BSMr5J9oAgfEVdQX93iev0,6596
|
|
3
|
+
tugaphone/number_utils.py,sha256=XhU4gUxaX5Vz3hFoP-Vkx3C75hmoKGvwZ1jbOEbZAPA,14727
|
|
4
|
+
tugaphone/pos.py,sha256=d3rBcSUySjvynJcCLyDXKNy1sRK3LAzloImrEin9izw,5666
|
|
5
|
+
tugaphone/regional_dict.csv,sha256=173QZgoDrCYVlDIXDIq70fn56zJ9f3dfLfA95_kQvhY,49216388
|
|
6
|
+
tugaphone/syl.py,sha256=VtVP-BqFF3wnikBGxhBK9PkVHcqj6gE1tkrn62CNFnA,51050
|
|
7
|
+
tugaphone/tokenizer.py,sha256=5tgXuIS_aDe6xJwPPQ3rprBC4KX7Kh3J_AGcpdobUnM,137800
|
|
8
|
+
tugaphone/version.py,sha256=mTZ0SIYsV1IVEOlxa2pu3LKW4dZCFy4l30LadddrvaQ,237
|
|
9
|
+
tugaphone-0.1.0a1.dist-info/METADATA,sha256=3tPens-TOyUxkk37WCUnrEoV1PEGKbuL4nQFDBaVvIY,300
|
|
10
|
+
tugaphone-0.1.0a1.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
|
11
|
+
tugaphone-0.1.0a1.dist-info/top_level.txt,sha256=HJb1X0j2g_NMNmGEHF6IfJEPLPWseI3ShEwCYfYK0dk,10
|
|
12
|
+
tugaphone-0.1.0a1.dist-info/RECORD,,
|
tugaphone/espeak.py
DELETED
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
"""multilingual phonemizers"""
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
from langcodes import tag_distance
|
|
6
|
-
import subprocess
|
|
7
|
-
from typing import List, Dict, Optional
|
|
8
|
-
|
|
9
|
-
import numpy as np
|
|
10
|
-
import onnxruntime
|
|
11
|
-
import requests
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class EspeakError(Exception):
    """Raised when an espeak-ng invocation cannot be run or reports a failure."""
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class EspeakPhonemizer:
    """
    Phonemizer that shells out to the ``espeak-ng`` command-line tool.

    The input text is piped to ``espeak-ng`` unchanged. The newline-separated
    IPA output printed by the tool is joined into one string with " . "
    between the lines.
    """
    # Voice identifiers accepted by ``get_lang``; every entry is lowercase.
    ESPEAK_LANGS = ['es-419', 'ca', 'qya', 'ga', 'et', 'ky', 'io', 'fa-latn', 'en-gb', 'fo', 'haw', 'kl',
                    'ta', 'ml', 'gd', 'sd', 'es', 'hy', 'ur', 'ro', 'hi', 'or', 'ti', 'ca-va', 'om', 'tr', 'pa',
                    'smj', 'mk', 'bg', 'cv', "fr", 'fi', 'en-gb-x-rp', 'ru', 'mt', 'an', 'mr', 'pap', 'vi', 'id',
                    'fr-be', 'ltg', 'my', 'nl', 'shn', 'ba', 'az', 'cmn', 'da', 'as', 'sw',
                    'piqd', 'en-us', 'hr', 'it', 'ug', 'th', 'mi', 'cy', 'ru-lv', 'ia', 'tt', 'hu', 'xex', 'te', 'ne',
                    'eu', 'ja', 'bpy', 'hak', 'cs', 'en-gb-scotland', 'hyw', 'uk', 'pt', 'bn', 'mto', 'yue',
                    'be', 'gu', 'sv', 'sl', 'cmn-latn-pinyin', 'lfn', 'lv', 'fa', 'sjn', 'nog', 'ms',
                    'vi-vn-x-central', 'lt', 'kn', 'he', 'qu', 'ca-ba', 'quc', 'nb', 'sk', 'tn', 'py', 'si', 'de',
                    'ar', 'en-gb-x-gbcwmd', 'bs', 'qdb', 'sq', 'sr', 'tk', 'en-029', 'ht', 'ru-cl', 'af', 'pt-br',
                    'fr-ch', 'ka', 'en-gb-x-gbclan', 'ko', 'is', 'ca-nw', 'gn', 'kok', 'la', 'lb', 'am', 'kk', 'ku',
                    'kaa', 'jbo', 'eo', 'uz', 'nci', 'vi-vn-x-south', 'el', 'pl', 'grc', ]

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Resolve a language code to an entry of ``ESPEAK_LANGS``.

        Resolution order: the special case "en-gb" -> "en-gb-x-rp", an exact
        (case-insensitive) match, the primary subtag alone, and finally the
        distance-based search in ``match_lang``.

        Args:
            target_lang (str): The language code to resolve.

        Returns:
            str: The resolved language code.

        Raises:
            ValueError: If no sufficiently close language is found.
        """
        # ESPEAK_LANGS is all lowercase, so compare case-insensitively;
        # otherwise "pt-BR" would lose its region and degrade to "pt".
        normalized = target_lang.lower()
        if normalized == "en-gb":
            return "en-gb-x-rp"
        if normalized in cls.ESPEAK_LANGS:
            return normalized
        primary = normalized.split("-")[0]
        if primary in cls.ESPEAK_LANGS:
            return primary
        return cls.match_lang(target_lang, cls.ESPEAK_LANGS)

    @staticmethod
    def _candidate_distance(candidate: str, target_lang: str) -> Optional[int]:
        """
        Return ``tag_distance`` between a candidate and the target, or None.

        The full candidate is tried first, then its first two subtags, then
        its primary subtag alone; the first call that succeeds wins.
        """
        subtags = candidate.split("-")
        # dict.fromkeys de-duplicates while keeping the most-specific-first order.
        attempts = dict.fromkeys([candidate, "-".join(subtags[:2]), subtags[0]])
        for tag in attempts:
            try:
                return tag_distance(tag, target_lang)
            except Exception:
                # Best effort: a tag the distance function rejects is skipped.
                continue
        return None

    @staticmethod
    def match_lang(target_lang: str, valid_langs: List[str]) -> str:
        """
        Pick the entry of ``valid_langs`` closest to ``target_lang``.

        Args:
            target_lang (str): The language code to match.
            valid_langs (List[str]): The supported language codes.

        Returns:
            str: ``target_lang`` itself when it is listed, otherwise the
                listed code with the smallest distance. The returned value is
                always an element of ``valid_langs``.

        Raises:
            ValueError: If no candidate has a distance of at most 10.
        """
        if target_lang in valid_langs:
            return target_lang
        best_lang = "und"
        best_distance = 10000000
        for candidate in valid_langs:
            distance = EspeakPhonemizer._candidate_distance(candidate, target_lang)
            if distance is None:
                continue
            if distance < best_distance:
                # Record the untouched list entry, never a truncated variant,
                # so the result is guaranteed to be a supported code.
                best_lang, best_distance = candidate, distance

        # A low distance means a good match.
        if best_distance <= 10:
            return best_lang
        # Otherwise the language is treated as unsupported.
        raise ValueError(f"unsupported language code: {target_lang}")

    @staticmethod
    def _run_espeak_command(args: List[str], input_text: Optional[str] = None, check: bool = True) -> str:
        """
        Run ``espeak-ng`` with the given arguments and return its output.

        Args:
            args (List[str]): Command-line arguments appended after 'espeak-ng'.
            input_text (str, optional): Text passed on stdin. Defaults to None.
            check (bool, optional): Forwarded to ``subprocess.run``; when True a
                non-zero exit code is reported as an error. Defaults to True.

        Returns:
            str: The stripped standard output of the command.

        Raises:
            EspeakError: If the executable is not found, the command fails
                (with ``check`` enabled), or any other error occurs.
        """
        command: List[str] = ['espeak-ng'] + args
        try:
            process: subprocess.CompletedProcess = subprocess.run(
                command,
                input=input_text,
                capture_output=True,
                text=True,
                check=check,
                encoding='utf-8',
                errors='replace'  # Undecodable/unencodable characters become placeholders
            )
            return process.stdout.strip()
        except FileNotFoundError as e:
            raise EspeakError(
                "espeak-ng command not found. Please ensure espeak-ng is installed "
                "and available in your system's PATH."
            ) from e
        except subprocess.CalledProcessError as e:
            raise EspeakError(
                f"espeak-ng command failed with error code {e.returncode}:\n"
                f"STDOUT: {e.stdout}\n"
                f"STDERR: {e.stderr}"
            ) from e
        except Exception as e:
            raise EspeakError(f"An unexpected error occurred while running espeak-ng: {e}") from e

    def phonemize(self, text: str, lang: str) -> str:
        """
        Convert ``text`` to IPA using the espeak-ng voice resolved from ``lang``.

        Args:
            text (str): The text to phonemize.
            lang (str): A language code, resolved through ``get_lang``.

        Returns:
            str: The IPA output with newlines replaced by " . ".

        Raises:
            ValueError: If ``lang`` cannot be resolved.
            EspeakError: If the espeak-ng invocation fails.
        """
        lang = self.get_lang(lang)
        return self._run_espeak_command(
            ['-q', '-x', '--ipa', '-v', lang],
            input_text=text
        ).replace("\n", " . ")
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
if __name__ == "__main__":
    # Manual smoke test: phonemize one English sentence and print the result.
    lang = "en-gb"
    text1 = "Hello, world. How are you?"

    espeak = EspeakPhonemizer()

    print("\n--- Getting phonemes for 'Hello, world. How are you?' ---")
    phonemes1 = espeak.phonemize(text1, lang)
    print(f" Espeak Phonemes: {phonemes1}")
|
|
163
|
-
|
|
164
|
-
|