stark-engine 4.2.1__tar.gz → 4.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stark_engine-4.2.1 → stark_engine-4.2.2}/PKG-INFO +1 -1
- {stark_engine-4.2.1 → stark_engine-4.2.2}/pyproject.toml +1 -1
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/dictionary.py +46 -9
- stark_engine-4.2.2/stark/tools/phonetic/transcription/__init__.py +26 -0
- stark_engine-4.2.2/stark/tools/phonetic/transcription/epitran.py +191 -0
- stark_engine-4.2.1/stark/tools/phonetic/espeak_ng.py → stark_engine-4.2.2/stark/tools/phonetic/transcription/espeak.py +17 -35
- stark_engine-4.2.2/stark/tools/phonetic/transcription/ipa2lat.py +151 -0
- stark_engine-4.2.2/stark/tools/phonetic/transcription/protocol.py +5 -0
- stark_engine-4.2.1/stark/tools/phonetic/ipa.py +0 -399
- {stark_engine-4.2.1 → stark_engine-4.2.2}/LICENSE.md +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/README.md +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/command.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/commands_context.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/commands_manager.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/patterns/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/patterns/parsing.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/patterns/pattern.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/patterns/rules.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/number.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/object.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/slots.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/string.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/time.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/time_interval.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/core/types/word.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/general/blockage_detector.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/general/classproperty.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/general/dependencies.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/general/json_encoder.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/interfaces/gcloud.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/interfaces/protocols.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/interfaces/silero.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/interfaces/vosk.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/common/span.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/!examples.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/models.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/nl_dictionary_name.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/storage/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_memory.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_sqlite.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/levenshtein/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyi +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyx +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/phonetic/simplephone.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/sliding_window_parser.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/tools/strtools.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/voice_assistant/__init__.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/voice_assistant/mode.py +0 -0
- {stark_engine-4.2.1 → stark_engine-4.2.2}/stark/voice_assistant/voice_assistant.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "stark-engine"
|
|
3
|
-
version = "4.2.1"
|
|
3
|
+
version = "4.2.2"
|
|
4
4
|
description = "S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants."
|
|
5
5
|
authors = ["MarkParker5 <mark@parker-programs.com>"]
|
|
6
6
|
license = "CC BY-NC-SA 4.0"
|
|
@@ -14,7 +14,11 @@ from stark.tools.levenshtein import (
|
|
|
14
14
|
levenshtein_similarity,
|
|
15
15
|
levenshtein_similarity_substring,
|
|
16
16
|
)
|
|
17
|
-
from stark.tools.phonetic.ipa import phonetic
|
|
17
|
+
from stark.tools.phonetic.transcription import (
|
|
18
|
+
transcription,
|
|
19
|
+
IpaProvider,
|
|
20
|
+
EspeakIpaProvider,
|
|
21
|
+
)
|
|
18
22
|
from stark.tools.phonetic.simplephone import simplephone
|
|
19
23
|
from stark.tools.strtools import find_substring_in_words_map, split_indices
|
|
20
24
|
|
|
@@ -52,8 +56,13 @@ class Dictionary:
|
|
|
52
56
|
Phonetic-aware dictionary with metadata storage.
|
|
53
57
|
"""
|
|
54
58
|
|
|
55
|
-
def __init__(
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
storage: DictionaryStorageProtocol,
|
|
62
|
+
ipa_provider: IpaProvider = EspeakIpaProvider(),
|
|
63
|
+
):
|
|
56
64
|
self.storage: DictionaryStorageProtocol = storage
|
|
65
|
+
self.ipa_provider: IpaProvider = ipa_provider
|
|
57
66
|
|
|
58
67
|
# ----------------------
|
|
59
68
|
# Write methods
|
|
@@ -65,7 +74,9 @@ class Dictionary:
|
|
|
65
74
|
Add a single entry to the dictionary.
|
|
66
75
|
Phonetic conversion happens internally (mandatory).
|
|
67
76
|
"""
|
|
68
|
-
phonetic_str =
|
|
77
|
+
phonetic_str = transcription(
|
|
78
|
+
name, language_code=language_code, ipa_provider=self.ipa_provider
|
|
79
|
+
)
|
|
69
80
|
simple_phonetic = simplephone(phonetic_str) or ""
|
|
70
81
|
item = DictionaryItem(
|
|
71
82
|
name=name,
|
|
@@ -125,7 +136,9 @@ class Dictionary:
|
|
|
125
136
|
else r.item.phonetic,
|
|
126
137
|
s2=sentence
|
|
127
138
|
if r.item.language_code == language_code
|
|
128
|
-
else
|
|
139
|
+
else transcription(
|
|
140
|
+
sentence, language_code, ipa_provider=self.ipa_provider
|
|
141
|
+
),
|
|
129
142
|
ignore_prefix=True,
|
|
130
143
|
)[0][1], # TODO: review
|
|
131
144
|
reverse=True,
|
|
@@ -141,7 +154,14 @@ class Dictionary:
|
|
|
141
154
|
"""
|
|
142
155
|
Lookup dictionary items by name_candidate and language_code using LookupMode and LookupField.
|
|
143
156
|
"""
|
|
144
|
-
simple_phonetic =
|
|
157
|
+
simple_phonetic = (
|
|
158
|
+
simplephone(
|
|
159
|
+
transcription(
|
|
160
|
+
name_candidate, language_code, ipa_provider=self.ipa_provider
|
|
161
|
+
)
|
|
162
|
+
)
|
|
163
|
+
or ""
|
|
164
|
+
)
|
|
145
165
|
logger.debug(
|
|
146
166
|
f"Looking up '{name_candidate}' with simple phonetic '{simple_phonetic}' under mode {mode}, field {field}"
|
|
147
167
|
)
|
|
@@ -170,7 +190,13 @@ class Dictionary:
|
|
|
170
190
|
yield from filter(
|
|
171
191
|
lambda item: levenshtein_match(
|
|
172
192
|
s1=item.simple_phonetic,
|
|
173
|
-
s2=simplephone(
|
|
193
|
+
s2=simplephone(
|
|
194
|
+
transcription(
|
|
195
|
+
name_candidate,
|
|
196
|
+
language_code,
|
|
197
|
+
ipa_provider=self.ipa_provider,
|
|
198
|
+
)
|
|
199
|
+
)
|
|
174
200
|
or "",
|
|
175
201
|
threshold=0.8,
|
|
176
202
|
proximity_graph=SIMPLEPHONE_PROXIMITY_GRAPH,
|
|
@@ -253,7 +279,12 @@ class Dictionary:
|
|
|
253
279
|
case LookupMode.FUZZY:
|
|
254
280
|
if field == LookupField.PHONETIC:
|
|
255
281
|
simple_phonetic = (
|
|
256
|
-
simplephone(
|
|
282
|
+
simplephone(
|
|
283
|
+
transcription(
|
|
284
|
+
sentence, language_code, ipa_provider=self.ipa_provider
|
|
285
|
+
)
|
|
286
|
+
)
|
|
287
|
+
or ""
|
|
257
288
|
)
|
|
258
289
|
for item in self.storage.iterate():
|
|
259
290
|
for span, _ in levenshtein_search_substring(
|
|
@@ -343,7 +374,11 @@ class Dictionary:
|
|
|
343
374
|
span=span,
|
|
344
375
|
text=sentence[span.slice],
|
|
345
376
|
simple_phonetic=simplephone(
|
|
346
|
-
|
|
377
|
+
transcription(
|
|
378
|
+
sentence[span.slice],
|
|
379
|
+
language_code,
|
|
380
|
+
ipa_provider=self.ipa_provider,
|
|
381
|
+
)
|
|
347
382
|
)
|
|
348
383
|
or "",
|
|
349
384
|
)
|
|
@@ -405,7 +440,9 @@ class Dictionary:
|
|
|
405
440
|
key=lambda item: levenshtein_similarity(
|
|
406
441
|
s1=name_candidate
|
|
407
442
|
if item.language_code == language_code
|
|
408
|
-
else
|
|
443
|
+
else transcription(
|
|
444
|
+
name_candidate, language_code, ipa_provider=self.ipa_provider
|
|
445
|
+
),
|
|
409
446
|
s2=item.name if item.language_code == language_code else item.phonetic,
|
|
410
447
|
),
|
|
411
448
|
reverse=True,
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .protocol import IpaProvider
|
|
2
|
+
from .espeak import EspeakIpaProvider
|
|
3
|
+
from .ipa2lat import ipa2lat
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@lru_cache
|
|
8
|
+
def transcription(
|
|
9
|
+
string: str,
|
|
10
|
+
language_code: str,
|
|
11
|
+
ipa_provider: IpaProvider = EspeakIpaProvider(),
|
|
12
|
+
) -> str:
|
|
13
|
+
"""
|
|
14
|
+
Converts a string to a simplified latin transcription via phonetic (IPA) transliteration.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
string: The input string to transcribe.
|
|
18
|
+
language_code: The language code for IPA conversion.
|
|
19
|
+
ipa_provider: The IPA provider to use for conversion (default: EspeakIpaProvider).
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
The simplified latin transcription of the input string.
|
|
23
|
+
"""
|
|
24
|
+
return " ".join(
|
|
25
|
+
ipa2lat(ipa_provider.to_ipa(word, language_code)) for word in string.split()
|
|
26
|
+
)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EpitranIpaProvider:
|
|
6
|
+
def __init__(self) -> None:
|
|
7
|
+
self._cache: dict[str, Any] = {}
|
|
8
|
+
|
|
9
|
+
def _epitran_obj(self, language_code: str) -> Any:
|
|
10
|
+
if language_code not in self._cache:
|
|
11
|
+
from epitran import Epitran
|
|
12
|
+
|
|
13
|
+
self._cache[language_code] = Epitran(language_code)
|
|
14
|
+
return self._cache[language_code]
|
|
15
|
+
|
|
16
|
+
def to_ipa(self, string: str, language_code: str) -> str:
|
|
17
|
+
# if language_code.startswith("en"):
|
|
18
|
+
# raise NotImplementedError(
|
|
19
|
+
# "IPA to Epitran conversion for English is not implemented yet."
|
|
20
|
+
# )
|
|
21
|
+
|
|
22
|
+
if language_code == "ru":
|
|
23
|
+
language_code = "rus-Cyrl"
|
|
24
|
+
|
|
25
|
+
# Code: Language (Script)
|
|
26
|
+
supported_languages = {
|
|
27
|
+
"aar-Latn": "Afar",
|
|
28
|
+
"afr-Latn": "Afrikanns",
|
|
29
|
+
"aii-Syrc": "Assyrian Neo-Aramaic",
|
|
30
|
+
"amh-Ethi": "Amharic",
|
|
31
|
+
"amh-Ethi-pp": "Amharic (more phonetic)",
|
|
32
|
+
"amh-Ethi-red": "Amharic (reduced)",
|
|
33
|
+
"ara-Arab": "Literary Arabic",
|
|
34
|
+
"ava-Cyrl": "Avaric",
|
|
35
|
+
"aze-Cyrl": "Azerbaijani (Cyrillic)",
|
|
36
|
+
"aze-Latn": "Azerbaijani (Latin)",
|
|
37
|
+
"ben-Beng": "Bengali",
|
|
38
|
+
"ben-Beng-red": "Bengali (reduced)",
|
|
39
|
+
"ben-Beng-east": "East Bengali",
|
|
40
|
+
"bho-Deva": "Bhojpuri",
|
|
41
|
+
"bxk-Latn": "Bukusu",
|
|
42
|
+
"cat-Latn": "Catalan",
|
|
43
|
+
"ceb-Latn": "Cebuano",
|
|
44
|
+
"ces-Latn": "Czech",
|
|
45
|
+
"cjy-Latn": "Jin (Wiktionary)",
|
|
46
|
+
"ckb-Arab": "Sorani",
|
|
47
|
+
"cmn-Hans": "Mandarin (Simplified)*",
|
|
48
|
+
"cmn-Hant": "Mandarin (Traditional)*",
|
|
49
|
+
"cmn-Latn": "Mandarin (Pinyin)*",
|
|
50
|
+
"csb-Latn": "Kashubian",
|
|
51
|
+
"deu-Latn": "German",
|
|
52
|
+
"deu-Latn-np": "German†",
|
|
53
|
+
"deu-Latn-nar": "German (more phonetic)",
|
|
54
|
+
"eng-Latn": "English‡",
|
|
55
|
+
"epo-Latn": "Esperanto",
|
|
56
|
+
"est-Latn": "Estonian",
|
|
57
|
+
"fas-Arab": "Farsi (Perso-Arabic)",
|
|
58
|
+
"fin-Latn": "Finnish",
|
|
59
|
+
"fra-Latn": "French",
|
|
60
|
+
"fra-Latn-np": "French†",
|
|
61
|
+
"fra-Latn-p": "French (more phonetic)",
|
|
62
|
+
"ful-Latn": "Fulah",
|
|
63
|
+
"gan-Latn": "Gan (Wiktionary)",
|
|
64
|
+
"glg-Latn": "Galician",
|
|
65
|
+
"got-Goth": "Gothic",
|
|
66
|
+
"got-Latn": "Gothic (Latin)",
|
|
67
|
+
"hak-Latn": "Hakka (pha̍k-fa-sṳ)",
|
|
68
|
+
"hat-Latn-bab": "Haitian (Latin-Babel)",
|
|
69
|
+
"hau-Latn": "Hausa",
|
|
70
|
+
"hin-Deva": "Hindi",
|
|
71
|
+
"hmn-Latn": "Hmong",
|
|
72
|
+
"hrv-Latn": "Croatian",
|
|
73
|
+
"hsn-Latn": "Xiang (Wiktionary)",
|
|
74
|
+
"hun-Latn": "Hungarian",
|
|
75
|
+
"ilo-Latn": "Ilocano",
|
|
76
|
+
"ind-Latn": "Indonesian",
|
|
77
|
+
"ita-Latn": "Italian",
|
|
78
|
+
"jam-Latn": "Jamaican",
|
|
79
|
+
"jav-Latn": "Javanese",
|
|
80
|
+
"jpn-Hira": "Japanese (Hiragana)",
|
|
81
|
+
"jpn-Hira-red": "red Japanese (Hiragana, reduced)",
|
|
82
|
+
"jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
|
|
83
|
+
"jpn-Kana": "Japanese (Katakana)",
|
|
84
|
+
"jpn-Kana-red": "red Japanese (Katakana, reduced)",
|
|
85
|
+
"kat-Geor": "Georgian",
|
|
86
|
+
"kaz-Cyrl": "Kazakh (Cyrillic)",
|
|
87
|
+
"kaz-Cyrl-bab": "bab Kazakh (Cyrillic—Babel)",
|
|
88
|
+
"kaz-Latn": "Kazakh (Latin)",
|
|
89
|
+
"kbd-Cyrl": "Kabardian",
|
|
90
|
+
"khm-Khmr": "Khmer",
|
|
91
|
+
"kin-Latn": "Kinyarwanda",
|
|
92
|
+
"kir-Arab": "Kyrgyz (Perso-Arabic)",
|
|
93
|
+
"kir-Cyrl": "Kyrgyz (Cyrillic)",
|
|
94
|
+
"kir-Latn": "Kyrgyz (Latin)",
|
|
95
|
+
"kmr-Latn": "Kurmanji",
|
|
96
|
+
"kmr-Latn-red": "Kurmanji (reduced)",
|
|
97
|
+
"kor-Hang": "Korean",
|
|
98
|
+
"lao-Laoo": "Lao",
|
|
99
|
+
"lao-Laoo-prereform": "Lao (Before spelling reform)",
|
|
100
|
+
"lav-Latn": "Latvian",
|
|
101
|
+
"lez-Cyrl": "Lezgian",
|
|
102
|
+
"lij-Latn": "Ligurian",
|
|
103
|
+
"lit-Latn": "Lithuanian",
|
|
104
|
+
"lsm-Latn": "Saamia",
|
|
105
|
+
"ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
|
|
106
|
+
"lug-Latn": "Ganda / Luganda",
|
|
107
|
+
"mal-Mlym": "Malayalam",
|
|
108
|
+
"mar-Deva": "Marathi",
|
|
109
|
+
"mlt-Latn": "Maltese",
|
|
110
|
+
"mon-Cyrl-bab": "Mongolian (Cyrillic)",
|
|
111
|
+
"mri-Latn": "Maori",
|
|
112
|
+
"msa-Latn": "Malay",
|
|
113
|
+
"mya-Mymr": "Burmese",
|
|
114
|
+
"nan-Latn": "Hokkien (pe̍h-oē-jī)",
|
|
115
|
+
"nan-Latn-tl": "Hokkien (Tâi-lô)",
|
|
116
|
+
"nld-Latn": "Dutch",
|
|
117
|
+
"nya-Latn": "Chichewa",
|
|
118
|
+
"ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
|
|
119
|
+
"ood-Latn-sax": "Tohono O'odham (Saxton)",
|
|
120
|
+
"ori-Orya": "Odia",
|
|
121
|
+
"orm-Latn": "Oromo",
|
|
122
|
+
"pan-Guru": "Punjabi (Eastern)",
|
|
123
|
+
"pol-Latn": "Polish",
|
|
124
|
+
"por-Latn": "Portuguese",
|
|
125
|
+
"quy-Latn": "Ayacucho Quechua / Quechua Chanka",
|
|
126
|
+
"ron-Latn": "Romanian",
|
|
127
|
+
"run-Latn": "Rundi",
|
|
128
|
+
"rus-Cyrl": "Russian",
|
|
129
|
+
"sag-Latn": "Sango",
|
|
130
|
+
"sin-Sinh": "Sinhala",
|
|
131
|
+
"slv-Latn": "Slovene / Slovenian",
|
|
132
|
+
"sna-Latn": "Shona",
|
|
133
|
+
"som-Latn": "Somali",
|
|
134
|
+
"spa-Latn": "Spanish",
|
|
135
|
+
"spa-Latn-eu": "Spanish (Iberian)",
|
|
136
|
+
"sqi-Latn": "Albanian",
|
|
137
|
+
"sro-Latn": "Sardinian (Campidanese)",
|
|
138
|
+
"srp-Latn": "Serbian (Latin)",
|
|
139
|
+
"srp-Cyrl": "Serbian (Cyrillic)",
|
|
140
|
+
"swa-Latn": "Swahili",
|
|
141
|
+
"swa-Latn-red": "Swahili (reduced)",
|
|
142
|
+
"swe-Latn": "Swedish",
|
|
143
|
+
"tam-Taml": "Tamil",
|
|
144
|
+
"tam-Taml-red": "Tamil (reduced)",
|
|
145
|
+
"tel-Telu": "Telugu",
|
|
146
|
+
"tgk-Cyrl": "Tajik",
|
|
147
|
+
"tgl-Latn": "Tagalog",
|
|
148
|
+
"tgl-Latn-red": "Tagalog (reduced)",
|
|
149
|
+
"tha-Thai": "Thai",
|
|
150
|
+
"tir-Ethi": "Tigrinya",
|
|
151
|
+
"tir-Ethi-pp": "Tigrinya (more phonemic)",
|
|
152
|
+
"tir-Ethi-red": "Tigrinya (reduced)",
|
|
153
|
+
"tok-Latn": "Toki Pona",
|
|
154
|
+
"tpi-Latn": "Tok Pisin",
|
|
155
|
+
"tuk-Cyrl": "Turkmen (Cyrillic)",
|
|
156
|
+
"tuk-Latn": "Turkmen (Latin)",
|
|
157
|
+
"tur-Latn": "Turkish (Latin)",
|
|
158
|
+
"tur-Latn-bab": "Turkish (Latin—Babel)",
|
|
159
|
+
"tur-Latn-red": "Turkish (reduced)",
|
|
160
|
+
"ukr-Cyrl": "Ukrainian",
|
|
161
|
+
"urd-Arab": "Urdu",
|
|
162
|
+
"uig-Arab": "Uyghur (Perso-Arabic)",
|
|
163
|
+
"uzb-Cyrl": "Uzbek (Cyrillic)",
|
|
164
|
+
"uzb-Latn": "Uzbek (Latin)",
|
|
165
|
+
"vie-Latn": "Vietnamese",
|
|
166
|
+
"wuu-Latn": "Shanghainese Wu (Wiktionary)",
|
|
167
|
+
"xho-Latn": "Xhosa",
|
|
168
|
+
"yor-Latn": "Yoruba",
|
|
169
|
+
"yue-Latn": "Cantonese (Jyutping)",
|
|
170
|
+
"yue-Hant": "Cantonese (Character)",
|
|
171
|
+
"zha-Latn": "Zhuang",
|
|
172
|
+
"zul-Latn": "Zulu",
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
if language_code not in supported_languages:
|
|
176
|
+
for key in supported_languages:
|
|
177
|
+
if key.startswith(language_code):
|
|
178
|
+
warnings.warn(
|
|
179
|
+
f"Unsupported language code: {language_code}; trying to use similar key {key}"
|
|
180
|
+
)
|
|
181
|
+
language_code = key
|
|
182
|
+
break
|
|
183
|
+
else:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
if not string.strip():
|
|
189
|
+
return ""
|
|
190
|
+
|
|
191
|
+
return self._epitran_obj(language_code).transliterate(string)
|
|
@@ -126,38 +126,20 @@ espeak: EspeakNG | None = None
|
|
|
126
126
|
_espeak_lock = threading.Lock()
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
("en", "Hello World"),
|
|
147
|
-
("uk", "Привіт світ"),
|
|
148
|
-
("fr", "Bonjour le monde"),
|
|
149
|
-
("ru", "Привет мир"),
|
|
150
|
-
("en", "Hello World"),
|
|
151
|
-
("en", "Hello World"),
|
|
152
|
-
("ru", "Привет мир"),
|
|
153
|
-
("ru", "Привет мир"),
|
|
154
|
-
("uk", "Привіт світ"),
|
|
155
|
-
("uk", "Привіт світ"),
|
|
156
|
-
("uk", "Привіт світ"),
|
|
157
|
-
("en", "Hello, World"),
|
|
158
|
-
("uk", "Привіт світ"),
|
|
159
|
-
("en", "Hello! World"),
|
|
160
|
-
("uk", "Привіт, світ"),
|
|
161
|
-
]
|
|
162
|
-
for lang, text in data:
|
|
163
|
-
print(f"{lang.upper()}: '{text}' -> '{text_to_ipa(text, lang)}'")
|
|
129
|
+
class EspeakIpaProvider:
|
|
130
|
+
def __init__(self, check_chars: bool = True):
|
|
131
|
+
self.check_chars = check_chars
|
|
132
|
+
|
|
133
|
+
def to_ipa(self, string: str, language_code: str) -> str:
|
|
134
|
+
with _espeak_lock:
|
|
135
|
+
global espeak
|
|
136
|
+
if espeak is None:
|
|
137
|
+
espeak = EspeakNG(language_code)
|
|
138
|
+
espeak.set_lang(language_code)
|
|
139
|
+
ipa = espeak.text_to_ipa(string, remove_stress=True)
|
|
140
|
+
if self.check_chars:
|
|
141
|
+
for char in {"(", ")", "[", "]"}:
|
|
142
|
+
assert char not in ipa, (
|
|
143
|
+
f"Unexpected character '{char}' in IPA '{ipa}' with lang '{language_code}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
|
|
144
|
+
)
|
|
145
|
+
return ipa
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
_mapping = {
|
|
4
|
+
# Vowels
|
|
5
|
+
"i": "i",
|
|
6
|
+
"y": "i",
|
|
7
|
+
"ɨ": "i",
|
|
8
|
+
"ʉ": "u",
|
|
9
|
+
"ɯ": "u",
|
|
10
|
+
"u": "u",
|
|
11
|
+
"ɪ": "i",
|
|
12
|
+
"ʏ": "i",
|
|
13
|
+
"ʊ": "u",
|
|
14
|
+
"e": "e",
|
|
15
|
+
"ø": "e",
|
|
16
|
+
"ɘ": "e",
|
|
17
|
+
"ɵ": "o",
|
|
18
|
+
"ɤ": "o",
|
|
19
|
+
"o": "o",
|
|
20
|
+
"ə": "e",
|
|
21
|
+
"ɛ": "e",
|
|
22
|
+
"œ": "e",
|
|
23
|
+
"ɜ": "e",
|
|
24
|
+
"ɞ": "e",
|
|
25
|
+
"ʌ": "a",
|
|
26
|
+
"ɔ": "o",
|
|
27
|
+
"æ": "a",
|
|
28
|
+
"ɐ": "a",
|
|
29
|
+
"a": "a",
|
|
30
|
+
"ɶ": "a",
|
|
31
|
+
"ä": "a",
|
|
32
|
+
"ɑ": "a",
|
|
33
|
+
"ɒ": "o",
|
|
34
|
+
# Pulmonic Consonants
|
|
35
|
+
"p": "p",
|
|
36
|
+
"b": "b",
|
|
37
|
+
"t": "t",
|
|
38
|
+
"d": "d",
|
|
39
|
+
"ʈ": "t",
|
|
40
|
+
"ɖ": "d",
|
|
41
|
+
"c": "k",
|
|
42
|
+
"ɟ": "j",
|
|
43
|
+
"k": "k",
|
|
44
|
+
"g": "g",
|
|
45
|
+
"q": "k",
|
|
46
|
+
"ɢ": "g",
|
|
47
|
+
"ɡ": "g",
|
|
48
|
+
"m": "m",
|
|
49
|
+
"ɱ": "m",
|
|
50
|
+
"n": "n",
|
|
51
|
+
"ɳ": "n",
|
|
52
|
+
"ɲ": "nj",
|
|
53
|
+
"ŋ": "ng",
|
|
54
|
+
"ʋ": "v",
|
|
55
|
+
"ɹ": "r",
|
|
56
|
+
"ɻ": "r",
|
|
57
|
+
"j": "j",
|
|
58
|
+
"ɰ": "w",
|
|
59
|
+
"ʙ": "b",
|
|
60
|
+
"r": "r",
|
|
61
|
+
"ʀ": "r",
|
|
62
|
+
"ɾ": "r",
|
|
63
|
+
"ɸ": "f",
|
|
64
|
+
"β": "v",
|
|
65
|
+
"f": "f",
|
|
66
|
+
"v": "v",
|
|
67
|
+
"θ": "th",
|
|
68
|
+
"ð": "dh",
|
|
69
|
+
"s": "s",
|
|
70
|
+
"z": "z",
|
|
71
|
+
"ʃ": "sh",
|
|
72
|
+
"ʒ": "zh",
|
|
73
|
+
"ʂ": "sh",
|
|
74
|
+
"ʐ": "zh",
|
|
75
|
+
"ç": "h",
|
|
76
|
+
"ʝ": "j",
|
|
77
|
+
"x": "h",
|
|
78
|
+
"ʑ": "z",
|
|
79
|
+
"ɣ": "gh",
|
|
80
|
+
"χ": "h",
|
|
81
|
+
"ʁ": "gh",
|
|
82
|
+
"ħ": "h",
|
|
83
|
+
"ʕ": "a",
|
|
84
|
+
"h": "h",
|
|
85
|
+
# Clicks
|
|
86
|
+
"ʘ": "o",
|
|
87
|
+
"ǀ": "l",
|
|
88
|
+
"ǃ": "!",
|
|
89
|
+
"ǂ": "!",
|
|
90
|
+
"ǁ": "l",
|
|
91
|
+
# Implosives and Ejectives
|
|
92
|
+
"ɓ": "b",
|
|
93
|
+
"ɗ": "d",
|
|
94
|
+
"ʄ": "j",
|
|
95
|
+
"ɠ": "g",
|
|
96
|
+
"ʛ": "g",
|
|
97
|
+
# Suprasegmentals
|
|
98
|
+
"ˈ": "",
|
|
99
|
+
"ˌ": "",
|
|
100
|
+
"ː": "",
|
|
101
|
+
"ˑ": "",
|
|
102
|
+
"|": "",
|
|
103
|
+
"‖": "",
|
|
104
|
+
".": "",
|
|
105
|
+
"ʼ": "",
|
|
106
|
+
# Tones and word accents
|
|
107
|
+
"̋": "",
|
|
108
|
+
"́": "",
|
|
109
|
+
"̄": "",
|
|
110
|
+
"̀": "",
|
|
111
|
+
"̏": "",
|
|
112
|
+
"̌": "",
|
|
113
|
+
"̂": "",
|
|
114
|
+
"᷄": "",
|
|
115
|
+
"᷅": "",
|
|
116
|
+
"᷈": "",
|
|
117
|
+
"᷉": "",
|
|
118
|
+
# Other symbols and diacritics
|
|
119
|
+
"ʲ": "",
|
|
120
|
+
"ʷ": "w",
|
|
121
|
+
"ʱ": "h",
|
|
122
|
+
"ʰ": "h",
|
|
123
|
+
"ʴ": "r",
|
|
124
|
+
"ʳ": "r",
|
|
125
|
+
"ˠ": "g",
|
|
126
|
+
"ʡ": "a",
|
|
127
|
+
"ʢ": "a",
|
|
128
|
+
"ɭ": "l",
|
|
129
|
+
"_": "",
|
|
130
|
+
'"': "",
|
|
131
|
+
" ": "",
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def ipa2lat(ipa_string: str) -> str:
|
|
136
|
+
"""Convert IPA string to a simplified Latin string"""
|
|
137
|
+
|
|
138
|
+
if not ipa_string:
|
|
139
|
+
return ""
|
|
140
|
+
|
|
141
|
+
string = ipa_string[:]
|
|
142
|
+
for ipa, simple in _mapping.items():
|
|
143
|
+
string = string.replace(ipa, simple)
|
|
144
|
+
|
|
145
|
+
for symbol in string:
|
|
146
|
+
if symbol not in "abcdefghijklmnopqrstuvwxyz":
|
|
147
|
+
warnings.warn(
|
|
148
|
+
f'ipa2lat: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
return string
|
|
@@ -1,399 +0,0 @@
|
|
|
1
|
-
from functools import lru_cache
|
|
2
|
-
import warnings
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
from stark.tools.phonetic import espeak_ng
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@lru_cache
|
|
9
|
-
def phonetic(string: str, language_code: str):
|
|
10
|
-
"""
|
|
11
|
-
Converts a string to simplified latin transcription via phonetic (ipa) transliteration.
|
|
12
|
-
"""
|
|
13
|
-
return " ".join(
|
|
14
|
-
_ipa2lat(_to_ipa(word, language_code)) for word in string.split()
|
|
15
|
-
) # TODO: try calling _to_ipa for the entire sentence
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def _to_ipa(string: str, language_code: str) -> str:
|
|
19
|
-
return _to_ipa__espeak_bin(string, language_code)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _ipa2lat(ipa_string: str) -> str:
|
|
23
|
-
"""Converts IPA to a simplified latin transcription."""
|
|
24
|
-
return _ipa2lat__dict(ipa_string)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# ----- Implementations: -----
|
|
28
|
-
|
|
29
|
-
_mapping = {
|
|
30
|
-
# Vowels
|
|
31
|
-
"i": "i",
|
|
32
|
-
"y": "i",
|
|
33
|
-
"ɨ": "i",
|
|
34
|
-
"ʉ": "u",
|
|
35
|
-
"ɯ": "u",
|
|
36
|
-
"u": "u",
|
|
37
|
-
"ɪ": "i",
|
|
38
|
-
"ʏ": "i",
|
|
39
|
-
"ʊ": "u",
|
|
40
|
-
"e": "e",
|
|
41
|
-
"ø": "e",
|
|
42
|
-
"ɘ": "e",
|
|
43
|
-
"ɵ": "o",
|
|
44
|
-
"ɤ": "o",
|
|
45
|
-
"o": "o",
|
|
46
|
-
"ə": "e",
|
|
47
|
-
"ɛ": "e",
|
|
48
|
-
"œ": "e",
|
|
49
|
-
"ɜ": "e",
|
|
50
|
-
"ɞ": "e",
|
|
51
|
-
"ʌ": "a",
|
|
52
|
-
"ɔ": "o",
|
|
53
|
-
"æ": "a",
|
|
54
|
-
"ɐ": "a",
|
|
55
|
-
"a": "a",
|
|
56
|
-
"ɶ": "a",
|
|
57
|
-
"ä": "a",
|
|
58
|
-
"ɑ": "a",
|
|
59
|
-
"ɒ": "o",
|
|
60
|
-
# Pulmonic Consonants
|
|
61
|
-
"p": "p",
|
|
62
|
-
"b": "b",
|
|
63
|
-
"t": "t",
|
|
64
|
-
"d": "d",
|
|
65
|
-
"ʈ": "t",
|
|
66
|
-
"ɖ": "d",
|
|
67
|
-
"c": "k",
|
|
68
|
-
"ɟ": "j",
|
|
69
|
-
"k": "k",
|
|
70
|
-
"g": "g",
|
|
71
|
-
"q": "k",
|
|
72
|
-
"ɢ": "g",
|
|
73
|
-
"ɡ": "g",
|
|
74
|
-
"m": "m",
|
|
75
|
-
"ɱ": "m",
|
|
76
|
-
"n": "n",
|
|
77
|
-
"ɳ": "n",
|
|
78
|
-
"ɲ": "nj",
|
|
79
|
-
"ŋ": "ng",
|
|
80
|
-
"ʋ": "v",
|
|
81
|
-
"ɹ": "r",
|
|
82
|
-
"ɻ": "r",
|
|
83
|
-
"j": "j",
|
|
84
|
-
"ɰ": "w",
|
|
85
|
-
"ʙ": "b",
|
|
86
|
-
"r": "r",
|
|
87
|
-
"ʀ": "r",
|
|
88
|
-
"ɾ": "r",
|
|
89
|
-
"ɸ": "f",
|
|
90
|
-
"β": "v",
|
|
91
|
-
"f": "f",
|
|
92
|
-
"v": "v",
|
|
93
|
-
"θ": "th",
|
|
94
|
-
"ð": "dh",
|
|
95
|
-
"s": "s",
|
|
96
|
-
"z": "z",
|
|
97
|
-
"ʃ": "sh",
|
|
98
|
-
"ʒ": "zh",
|
|
99
|
-
"ʂ": "sh",
|
|
100
|
-
"ʐ": "zh",
|
|
101
|
-
"ç": "h",
|
|
102
|
-
"ʝ": "j",
|
|
103
|
-
"x": "h",
|
|
104
|
-
"ʑ": "z",
|
|
105
|
-
"ɣ": "gh",
|
|
106
|
-
"χ": "h",
|
|
107
|
-
"ʁ": "gh",
|
|
108
|
-
"ħ": "h",
|
|
109
|
-
"ʕ": "a",
|
|
110
|
-
"h": "h",
|
|
111
|
-
# Clicks
|
|
112
|
-
"ʘ": "o",
|
|
113
|
-
"ǀ": "l",
|
|
114
|
-
"ǃ": "!",
|
|
115
|
-
"ǂ": "!",
|
|
116
|
-
"ǁ": "l",
|
|
117
|
-
# Implosives and Ejectives
|
|
118
|
-
"ɓ": "b",
|
|
119
|
-
"ɗ": "d",
|
|
120
|
-
"ʄ": "j",
|
|
121
|
-
"ɠ": "g",
|
|
122
|
-
"ʛ": "g",
|
|
123
|
-
# Suprasegmentals
|
|
124
|
-
"ˈ": "",
|
|
125
|
-
"ˌ": "",
|
|
126
|
-
"ː": "",
|
|
127
|
-
"ˑ": "",
|
|
128
|
-
"|": "",
|
|
129
|
-
"‖": "",
|
|
130
|
-
".": "",
|
|
131
|
-
"ʼ": "",
|
|
132
|
-
# Tones and word accents
|
|
133
|
-
"̋": "",
|
|
134
|
-
"́": "",
|
|
135
|
-
"̄": "",
|
|
136
|
-
"̀": "",
|
|
137
|
-
"̏": "",
|
|
138
|
-
"̌": "",
|
|
139
|
-
"̂": "",
|
|
140
|
-
"᷄": "",
|
|
141
|
-
"᷅": "",
|
|
142
|
-
"᷈": "",
|
|
143
|
-
"᷉": "",
|
|
144
|
-
# Other symbols and diacritics
|
|
145
|
-
"ʲ": "",
|
|
146
|
-
"ʷ": "w",
|
|
147
|
-
"ʱ": "h",
|
|
148
|
-
"ʰ": "h",
|
|
149
|
-
"ʴ": "r",
|
|
150
|
-
"ʳ": "r",
|
|
151
|
-
"ˠ": "g",
|
|
152
|
-
"ʡ": "a",
|
|
153
|
-
"ʢ": "a",
|
|
154
|
-
"ɭ": "l",
|
|
155
|
-
"_": "",
|
|
156
|
-
'"': "",
|
|
157
|
-
" ": "",
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def _ipa2lat__dict(ipa_string: str) -> str:
|
|
162
|
-
if not ipa_string:
|
|
163
|
-
return ""
|
|
164
|
-
|
|
165
|
-
string = ipa_string[:]
|
|
166
|
-
for ipa, simple in _mapping.items():
|
|
167
|
-
string = string.replace(ipa, simple)
|
|
168
|
-
|
|
169
|
-
for symbol in string:
|
|
170
|
-
if symbol not in "abcdefghijklmnopqrstuvwxyz":
|
|
171
|
-
warnings.warn(
|
|
172
|
-
f'SuggestionsManager._ipa_to_latin: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
return string
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
def _to_ipa__espeak_bin(string: str, language_code: str) -> str:
    """Convert ``string`` to IPA by delegating to ``espeak_ng.text_to_ipa``.

    Args:
        string: The text to transcribe.
        language_code: Language identifier passed through unchanged to
            ``espeak_ng.text_to_ipa``.

    Returns:
        Whatever ``espeak_ng.text_to_ipa(string, language_code)`` returns.
    """
    ipa = espeak_ng.text_to_ipa(string, language_code)
    return ipa
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
# def to_ipa__espeak_cli(string: str, language_code: str) -> str:
|
|
183
|
-
# import re
|
|
184
|
-
# import subprocess
|
|
185
|
-
|
|
186
|
-
# result = subprocess.run(
|
|
187
|
-
# ["espeak-ng", "--ipa", f"-v{language_code}", "-q", string],
|
|
188
|
-
# capture_output=True,
|
|
189
|
-
# text=True,
|
|
190
|
-
# )
|
|
191
|
-
# return re.compile(r"\(.*?\)").sub("", result.stdout.strip()).strip()
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
# @lru_cache
|
|
195
|
-
# def _epitran_obj(language_code: str) -> Epitran:
|
|
196
|
-
# from epitran import Epitran
|
|
197
|
-
|
|
198
|
-
# return Epitran(language_code) # this one is long, about an entire second
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
# def to_ipa__epitran(string: str, language_code: str) -> str:
|
|
202
|
-
# # if language_code.startswith("en"):
|
|
203
|
-
# # raise NotImplementedError(
|
|
204
|
-
# # "IPA to Epitran conversion for English is not implemented yet."
|
|
205
|
-
# # )
|
|
206
|
-
|
|
207
|
-
# if language_code == "ru":
|
|
208
|
-
# language_code = "rus-Cyrl"
|
|
209
|
-
|
|
210
|
-
# # Code: Language (Script)
|
|
211
|
-
# supported_languages = {
|
|
212
|
-
# "aar-Latn": "Afar",
|
|
213
|
-
#     "afr-Latn": "Afrikaans",
|
|
214
|
-
# "aii-Syrc": "Assyrian Neo-Aramaic",
|
|
215
|
-
# "amh-Ethi": "Amharic",
|
|
216
|
-
# "amh-Ethi-pp": "Amharic (more phonetic)",
|
|
217
|
-
# "amh-Ethi-red": "Amharic (reduced)",
|
|
218
|
-
# "ara-Arab": "Literary Arabic",
|
|
219
|
-
# "ava-Cyrl": "Avaric",
|
|
220
|
-
# "aze-Cyrl": "Azerbaijani (Cyrillic)",
|
|
221
|
-
# "aze-Latn": "Azerbaijani (Latin)",
|
|
222
|
-
# "ben-Beng": "Bengali",
|
|
223
|
-
# "ben-Beng-red": "Bengali (reduced)",
|
|
224
|
-
# "ben-Beng-east": "East Bengali",
|
|
225
|
-
# "bho-Deva": "Bhojpuri",
|
|
226
|
-
# "bxk-Latn": "Bukusu",
|
|
227
|
-
# "cat-Latn": "Catalan",
|
|
228
|
-
# "ceb-Latn": "Cebuano",
|
|
229
|
-
# "ces-Latn": "Czech",
|
|
230
|
-
# "cjy-Latn": "Jin (Wiktionary)",
|
|
231
|
-
# "ckb-Arab": "Sorani",
|
|
232
|
-
# "cmn-Hans": "Mandarin (Simplified)*",
|
|
233
|
-
# "cmn-Hant": "Mandarin (Traditional)*",
|
|
234
|
-
# "cmn-Latn": "Mandarin (Pinyin)*",
|
|
235
|
-
# "csb-Latn": "Kashubian",
|
|
236
|
-
# "deu-Latn": "German",
|
|
237
|
-
# "deu-Latn-np": "German†",
|
|
238
|
-
# "deu-Latn-nar": "German (more phonetic)",
|
|
239
|
-
# "eng-Latn": "English‡",
|
|
240
|
-
# "epo-Latn": "Esperanto",
|
|
241
|
-
# "est-Latn": "Estonian",
|
|
242
|
-
# "fas-Arab": "Farsi (Perso-Arabic)",
|
|
243
|
-
# "fin-Latn": "Finnish",
|
|
244
|
-
# "fra-Latn": "French",
|
|
245
|
-
# "fra-Latn-np": "French†",
|
|
246
|
-
# "fra-Latn-p": "French (more phonetic)",
|
|
247
|
-
# "ful-Latn": "Fulah",
|
|
248
|
-
# "gan-Latn": "Gan (Wiktionary)",
|
|
249
|
-
# "glg-Latn": "Galician",
|
|
250
|
-
# "got-Goth": "Gothic",
|
|
251
|
-
# "got-Latn": "Gothic (Latin)",
|
|
252
|
-
# "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
|
|
253
|
-
# "hat-Latn-bab": "Haitian (Latin-Babel)",
|
|
254
|
-
# "hau-Latn": "Hausa",
|
|
255
|
-
# "hin-Deva": "Hindi",
|
|
256
|
-
# "hmn-Latn": "Hmong",
|
|
257
|
-
# "hrv-Latn": "Croatian",
|
|
258
|
-
# "hsn-Latn": "Xiang (Wiktionary)",
|
|
259
|
-
# "hun-Latn": "Hungarian",
|
|
260
|
-
# "ilo-Latn": "Ilocano",
|
|
261
|
-
# "ind-Latn": "Indonesian",
|
|
262
|
-
# "ita-Latn": "Italian",
|
|
263
|
-
# "jam-Latn": "Jamaican",
|
|
264
|
-
# "jav-Latn": "Javanese",
|
|
265
|
-
# "jpn-Hira": "Japanese (Hiragana)",
|
|
266
|
-
#     "jpn-Hira-red": "Japanese (Hiragana, reduced)",
|
|
267
|
-
# "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
|
|
268
|
-
# "jpn-Kana": "Japanese (Katakana)",
|
|
269
|
-
#     "jpn-Kana-red": "Japanese (Katakana, reduced)",
|
|
270
|
-
# "kat-Geor": "Georgian",
|
|
271
|
-
# "kaz-Cyrl": "Kazakh (Cyrillic)",
|
|
272
|
-
#     "kaz-Cyrl-bab": "Kazakh (Cyrillic—Babel)",
|
|
273
|
-
# "kaz-Latn": "Kazakh (Latin)",
|
|
274
|
-
# "kbd-Cyrl": "Kabardian",
|
|
275
|
-
# "khm-Khmr": "Khmer",
|
|
276
|
-
# "kin-Latn": "Kinyarwanda",
|
|
277
|
-
# "kir-Arab": "Kyrgyz (Perso-Arabic)",
|
|
278
|
-
# "kir-Cyrl": "Kyrgyz (Cyrillic)",
|
|
279
|
-
# "kir-Latn": "Kyrgyz (Latin)",
|
|
280
|
-
# "kmr-Latn": "Kurmanji",
|
|
281
|
-
# "kmr-Latn-red": "Kurmanji (reduced)",
|
|
282
|
-
# "kor-Hang": "Korean",
|
|
283
|
-
# "lao-Laoo": "Lao",
|
|
284
|
-
# "lao-Laoo-prereform": "Lao (Before spelling reform)",
|
|
285
|
-
# "lav-Latn": "Latvian",
|
|
286
|
-
# "lez-Cyrl": "Lezgian",
|
|
287
|
-
# "lij-Latn": "Ligurian",
|
|
288
|
-
# "lit-Latn": "Lithuanian",
|
|
289
|
-
# "lsm-Latn": "Saamia",
|
|
290
|
-
# "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
|
|
291
|
-
# "lug-Latn": "Ganda / Luganda",
|
|
292
|
-
# "mal-Mlym": "Malayalam",
|
|
293
|
-
# "mar-Deva": "Marathi",
|
|
294
|
-
# "mlt-Latn": "Maltese",
|
|
295
|
-
# "mon-Cyrl-bab": "Mongolian (Cyrillic)",
|
|
296
|
-
# "mri-Latn": "Maori",
|
|
297
|
-
# "msa-Latn": "Malay",
|
|
298
|
-
# "mya-Mymr": "Burmese",
|
|
299
|
-
# "nan-Latn": "Hokkien (pe̍h-oē-jī)",
|
|
300
|
-
# "nan-Latn-tl": "Hokkien (Tâi-lô)",
|
|
301
|
-
# "nld-Latn": "Dutch",
|
|
302
|
-
# "nya-Latn": "Chichewa",
|
|
303
|
-
# "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
|
|
304
|
-
# "ood-Latn-sax": "Tohono O'odham (Saxton)",
|
|
305
|
-
# "ori-Orya": "Odia",
|
|
306
|
-
# "orm-Latn": "Oromo",
|
|
307
|
-
# "pan-Guru": "Punjabi (Eastern)",
|
|
308
|
-
# "pol-Latn": "Polish",
|
|
309
|
-
# "por-Latn": "Portuguese",
|
|
310
|
-
# "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
|
|
311
|
-
# "ron-Latn": "Romanian",
|
|
312
|
-
# "run-Latn": "Rundi",
|
|
313
|
-
# "rus-Cyrl": "Russian",
|
|
314
|
-
# "sag-Latn": "Sango",
|
|
315
|
-
# "sin-Sinh": "Sinhala",
|
|
316
|
-
# "slv-Latn": "Slovene / Slovenian",
|
|
317
|
-
# "sna-Latn": "Shona",
|
|
318
|
-
# "som-Latn": "Somali",
|
|
319
|
-
# "spa-Latn": "Spanish",
|
|
320
|
-
# "spa-Latn-eu": "Spanish (Iberian)",
|
|
321
|
-
# "sqi-Latn": "Albanian",
|
|
322
|
-
# "sro-Latn": "Sardinian (Campidanese)",
|
|
323
|
-
# "srp-Latn": "Serbian (Latin)",
|
|
324
|
-
# "srp-Cyrl": "Serbian (Cyrillic)",
|
|
325
|
-
# "swa-Latn": "Swahili",
|
|
326
|
-
# "swa-Latn-red": "Swahili (reduced)",
|
|
327
|
-
# "swe-Latn": "Swedish",
|
|
328
|
-
# "tam-Taml": "Tamil",
|
|
329
|
-
# "tam-Taml-red": "Tamil (reduced)",
|
|
330
|
-
# "tel-Telu": "Telugu",
|
|
331
|
-
# "tgk-Cyrl": "Tajik",
|
|
332
|
-
# "tgl-Latn": "Tagalog",
|
|
333
|
-
# "tgl-Latn-red": "Tagalog (reduced)",
|
|
334
|
-
# "tha-Thai": "Thai",
|
|
335
|
-
# "tir-Ethi": "Tigrinya",
|
|
336
|
-
# "tir-Ethi-pp": "Tigrinya (more phonemic)",
|
|
337
|
-
# "tir-Ethi-red": "Tigrinya (reduced)",
|
|
338
|
-
# "tok-Latn": "Toki Pona",
|
|
339
|
-
# "tpi-Latn": "Tok Pisin",
|
|
340
|
-
# "tuk-Cyrl": "Turkmen (Cyrillic)",
|
|
341
|
-
# "tuk-Latn": "Turkmen (Latin)",
|
|
342
|
-
# "tur-Latn": "Turkish (Latin)",
|
|
343
|
-
# "tur-Latn-bab": "Turkish (Latin—Babel)",
|
|
344
|
-
# "tur-Latn-red": "Turkish (reduced)",
|
|
345
|
-
# "ukr-Cyrl": "Ukrainian",
|
|
346
|
-
# "urd-Arab": "Urdu",
|
|
347
|
-
# "uig-Arab": "Uyghur (Perso-Arabic)",
|
|
348
|
-
# "uzb-Cyrl": "Uzbek (Cyrillic)",
|
|
349
|
-
# "uzb-Latn": "Uzbek (Latin)",
|
|
350
|
-
# "vie-Latn": "Vietnamese",
|
|
351
|
-
# "wuu-Latn": "Shanghainese Wu (Wiktionary)",
|
|
352
|
-
# "xho-Latn": "Xhosa",
|
|
353
|
-
# "yor-Latn": "Yoruba",
|
|
354
|
-
# "yue-Latn": "Cantonese (Jyutping)",
|
|
355
|
-
# "yue-Hant": "Cantonese (Character)",
|
|
356
|
-
# "zha-Latn": "Zhuang",
|
|
357
|
-
# "zul-Latn": "Zulu",
|
|
358
|
-
# }
|
|
359
|
-
|
|
360
|
-
# if language_code not in supported_languages:
|
|
361
|
-
# for key in supported_languages:
|
|
362
|
-
# if key.startswith(language_code):
|
|
363
|
-
# warnings.warn(
|
|
364
|
-
# f"Unsupported language code: {language_code}; trying to use similar key {key}"
|
|
365
|
-
# )
|
|
366
|
-
# language_code = key
|
|
367
|
-
# break
|
|
368
|
-
# else:
|
|
369
|
-
# raise ValueError(
|
|
370
|
-
# f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
|
|
371
|
-
# )
|
|
372
|
-
|
|
373
|
-
# if not string.strip():
|
|
374
|
-
# return ""
|
|
375
|
-
|
|
376
|
-
# return _epitran_obj(language_code).transliterate(string)
|
|
377
|
-
|
|
378
|
-
if __name__ == "__main__":
    # Running this module as a script is intentionally a no-op; the manual
    # smoke checks that follow are all commented out.
    pass
|
|
380
|
-
# print("Starting...")
|
|
381
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
382
|
-
# print("Two more...")
|
|
383
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
384
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
385
|
-
# print("Київ", to_ipa("Київ", "ua"))
|
|
386
|
-
# print("Київ", to_ipa("Київ", "uk"))
|
|
387
|
-
# test_cases = [
|
|
388
|
-
# 'Привет Иван как у тебя делая',
|
|
389
|
-
# 'любимые занятия надо делать часто',
|
|
390
|
-
# 'Съешь ещё этих мягких французских булок да выпей чаю',
|
|
391
|
-
# 'Хай',
|
|
392
|
-
# 'хай',
|
|
393
|
-
# 'Хэллоу',
|
|
394
|
-
# 'хэллоу',
|
|
395
|
-
# 'друг с другом',
|
|
396
|
-
# 'с пути фай',
|
|
397
|
-
# ]
|
|
398
|
-
# for test_case in test_cases:
|
|
399
|
-
# print((ipa := to_ipa__epitran(test_case, 'rus-Cyrl')), to_ipa__espeak(test_case, 'ru'), ipa2lat__ipapy(ipa), ipa2lat__dict(ipa), sep=' || ')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|