stark-engine 4.2.0__tar.gz → 4.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stark_engine-4.2.0 → stark_engine-4.2.2}/PKG-INFO +1 -1
- {stark_engine-4.2.0 → stark_engine-4.2.2}/pyproject.toml +1 -1
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/dictionary.py +46 -9
- stark_engine-4.2.2/stark/tools/phonetic/transcription/__init__.py +26 -0
- stark_engine-4.2.2/stark/tools/phonetic/transcription/epitran.py +191 -0
- stark_engine-4.2.0/stark/tools/phonetic/espeak_ng.py → stark_engine-4.2.2/stark/tools/phonetic/transcription/espeak.py +17 -35
- stark_engine-4.2.2/stark/tools/phonetic/transcription/ipa2lat.py +151 -0
- stark_engine-4.2.2/stark/tools/phonetic/transcription/protocol.py +5 -0
- stark_engine-4.2.2/stark/tools/sliding_window_parser.py +139 -0
- stark_engine-4.2.0/stark/tools/phonetic/ipa.py +0 -399
- {stark_engine-4.2.0 → stark_engine-4.2.2}/LICENSE.md +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/README.md +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/command.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_context.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_manager.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/parsing.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/pattern.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/rules.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/number.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/object.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/slots.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/string.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time_interval.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/word.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/blockage_detector.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/classproperty.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/dependencies.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/json_encoder.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/gcloud.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/protocols.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/silero.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/vosk.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/common/span.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/!examples.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/models.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/nl_dictionary_name.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_memory.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_sqlite.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyi +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyx +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/phonetic/simplephone.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/strtools.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/__init__.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/mode.py +0 -0
- {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/voice_assistant.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "stark-engine"
|
|
3
|
-
version = "4.2.0"
|
|
3
|
+
version = "4.2.2"
|
|
4
4
|
description = "S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants."
|
|
5
5
|
authors = ["MarkParker5 <mark@parker-programs.com>"]
|
|
6
6
|
license = "CC BY-NC-SA 4.0"
|
|
@@ -14,7 +14,11 @@ from stark.tools.levenshtein import (
|
|
|
14
14
|
levenshtein_similarity,
|
|
15
15
|
levenshtein_similarity_substring,
|
|
16
16
|
)
|
|
17
|
-
from stark.tools.phonetic.ipa import phonetic
|
|
17
|
+
from stark.tools.phonetic.transcription import (
|
|
18
|
+
transcription,
|
|
19
|
+
IpaProvider,
|
|
20
|
+
EspeakIpaProvider,
|
|
21
|
+
)
|
|
18
22
|
from stark.tools.phonetic.simplephone import simplephone
|
|
19
23
|
from stark.tools.strtools import find_substring_in_words_map, split_indices
|
|
20
24
|
|
|
@@ -52,8 +56,13 @@ class Dictionary:
|
|
|
52
56
|
Phonetic-aware dictionary with metadata storage.
|
|
53
57
|
"""
|
|
54
58
|
|
|
55
|
-
def __init__(
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
storage: DictionaryStorageProtocol,
|
|
62
|
+
ipa_provider: IpaProvider = EspeakIpaProvider(),
|
|
63
|
+
):
|
|
56
64
|
self.storage: DictionaryStorageProtocol = storage
|
|
65
|
+
self.ipa_provider: IpaProvider = ipa_provider
|
|
57
66
|
|
|
58
67
|
# ----------------------
|
|
59
68
|
# Write methods
|
|
@@ -65,7 +74,9 @@ class Dictionary:
|
|
|
65
74
|
Add a single entry to the dictionary.
|
|
66
75
|
Phonetic conversion happens internally (mandatory).
|
|
67
76
|
"""
|
|
68
|
-
phonetic_str = phonetic(name, language_code=language_code)
|
|
77
|
+
phonetic_str = transcription(
|
|
78
|
+
name, language_code=language_code, ipa_provider=self.ipa_provider
|
|
79
|
+
)
|
|
69
80
|
simple_phonetic = simplephone(phonetic_str) or ""
|
|
70
81
|
item = DictionaryItem(
|
|
71
82
|
name=name,
|
|
@@ -125,7 +136,9 @@ class Dictionary:
|
|
|
125
136
|
else r.item.phonetic,
|
|
126
137
|
s2=sentence
|
|
127
138
|
if r.item.language_code == language_code
|
|
128
|
-
else phonetic(sentence, language_code),
|
|
139
|
+
else transcription(
|
|
140
|
+
sentence, language_code, ipa_provider=self.ipa_provider
|
|
141
|
+
),
|
|
129
142
|
ignore_prefix=True,
|
|
130
143
|
)[0][1], # TODO: review
|
|
131
144
|
reverse=True,
|
|
@@ -141,7 +154,14 @@ class Dictionary:
|
|
|
141
154
|
"""
|
|
142
155
|
Lookup dictionary items by name_candidate and language_code using LookupMode and LookupField.
|
|
143
156
|
"""
|
|
144
|
-
simple_phonetic = simplephone(phonetic(name_candidate, language_code)) or ""
|
|
157
|
+
simple_phonetic = (
|
|
158
|
+
simplephone(
|
|
159
|
+
transcription(
|
|
160
|
+
name_candidate, language_code, ipa_provider=self.ipa_provider
|
|
161
|
+
)
|
|
162
|
+
)
|
|
163
|
+
or ""
|
|
164
|
+
)
|
|
145
165
|
logger.debug(
|
|
146
166
|
f"Looking up '{name_candidate}' with simple phonetic '{simple_phonetic}' under mode {mode}, field {field}"
|
|
147
167
|
)
|
|
@@ -170,7 +190,13 @@ class Dictionary:
|
|
|
170
190
|
yield from filter(
|
|
171
191
|
lambda item: levenshtein_match(
|
|
172
192
|
s1=item.simple_phonetic,
|
|
173
|
-
s2=simplephone(phonetic(name_candidate, language_code))
|
|
193
|
+
s2=simplephone(
|
|
194
|
+
transcription(
|
|
195
|
+
name_candidate,
|
|
196
|
+
language_code,
|
|
197
|
+
ipa_provider=self.ipa_provider,
|
|
198
|
+
)
|
|
199
|
+
)
|
|
174
200
|
or "",
|
|
175
201
|
threshold=0.8,
|
|
176
202
|
proximity_graph=SIMPLEPHONE_PROXIMITY_GRAPH,
|
|
@@ -253,7 +279,12 @@ class Dictionary:
|
|
|
253
279
|
case LookupMode.FUZZY:
|
|
254
280
|
if field == LookupField.PHONETIC:
|
|
255
281
|
simple_phonetic = (
|
|
256
|
-
simplephone(phonetic(sentence, language_code)) or ""
|
|
282
|
+
simplephone(
|
|
283
|
+
transcription(
|
|
284
|
+
sentence, language_code, ipa_provider=self.ipa_provider
|
|
285
|
+
)
|
|
286
|
+
)
|
|
287
|
+
or ""
|
|
257
288
|
)
|
|
258
289
|
for item in self.storage.iterate():
|
|
259
290
|
for span, _ in levenshtein_search_substring(
|
|
@@ -343,7 +374,11 @@ class Dictionary:
|
|
|
343
374
|
span=span,
|
|
344
375
|
text=sentence[span.slice],
|
|
345
376
|
simple_phonetic=simplephone(
|
|
346
|
-
|
|
377
|
+
transcription(
|
|
378
|
+
sentence[span.slice],
|
|
379
|
+
language_code,
|
|
380
|
+
ipa_provider=self.ipa_provider,
|
|
381
|
+
)
|
|
347
382
|
)
|
|
348
383
|
or "",
|
|
349
384
|
)
|
|
@@ -405,7 +440,9 @@ class Dictionary:
|
|
|
405
440
|
key=lambda item: levenshtein_similarity(
|
|
406
441
|
s1=name_candidate
|
|
407
442
|
if item.language_code == language_code
|
|
408
|
-
else phonetic(name_candidate, language_code),
|
|
443
|
+
else transcription(
|
|
444
|
+
name_candidate, language_code, ipa_provider=self.ipa_provider
|
|
445
|
+
),
|
|
409
446
|
s2=item.name if item.language_code == language_code else item.phonetic,
|
|
410
447
|
),
|
|
411
448
|
reverse=True,
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .protocol import IpaProvider
|
|
2
|
+
from .espeak import EspeakIpaProvider
|
|
3
|
+
from .ipa2lat import ipa2lat
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@lru_cache
|
|
8
|
+
def transcription(
|
|
9
|
+
string: str,
|
|
10
|
+
language_code: str,
|
|
11
|
+
ipa_provider: IpaProvider = EspeakIpaProvider(),
|
|
12
|
+
) -> str:
|
|
13
|
+
"""
|
|
14
|
+
Converts a string to a simplified latin transcription via phonetic (IPA) transliteration.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
string: The input string to transcribe.
|
|
18
|
+
language_code: The language code for IPA conversion.
|
|
19
|
+
ipa_provider: The IPA provider to use for conversion (default: EspeakIpaProvider).
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
The simplified latin transcription of the input string.
|
|
23
|
+
"""
|
|
24
|
+
return " ".join(
|
|
25
|
+
ipa2lat(ipa_provider.to_ipa(word, language_code)) for word in string.split()
|
|
26
|
+
)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EpitranIpaProvider:
|
|
6
|
+
def __init__(self) -> None:
|
|
7
|
+
self._cache: dict[str, Any] = {}
|
|
8
|
+
|
|
9
|
+
def _epitran_obj(self, language_code: str) -> Any:
|
|
10
|
+
if language_code not in self._cache:
|
|
11
|
+
from epitran import Epitran
|
|
12
|
+
|
|
13
|
+
self._cache[language_code] = Epitran(language_code)
|
|
14
|
+
return self._cache[language_code]
|
|
15
|
+
|
|
16
|
+
def to_ipa(self, string: str, language_code: str) -> str:
|
|
17
|
+
# if language_code.startswith("en"):
|
|
18
|
+
# raise NotImplementedError(
|
|
19
|
+
# "IPA to Epitran conversion for English is not implemented yet."
|
|
20
|
+
# )
|
|
21
|
+
|
|
22
|
+
if language_code == "ru":
|
|
23
|
+
language_code = "rus-Cyrl"
|
|
24
|
+
|
|
25
|
+
# Code: Language (Script)
|
|
26
|
+
supported_languages = {
|
|
27
|
+
"aar-Latn": "Afar",
|
|
28
|
+
"afr-Latn": "Afrikanns",
|
|
29
|
+
"aii-Syrc": "Assyrian Neo-Aramaic",
|
|
30
|
+
"amh-Ethi": "Amharic",
|
|
31
|
+
"amh-Ethi-pp": "Amharic (more phonetic)",
|
|
32
|
+
"amh-Ethi-red": "Amharic (reduced)",
|
|
33
|
+
"ara-Arab": "Literary Arabic",
|
|
34
|
+
"ava-Cyrl": "Avaric",
|
|
35
|
+
"aze-Cyrl": "Azerbaijani (Cyrillic)",
|
|
36
|
+
"aze-Latn": "Azerbaijani (Latin)",
|
|
37
|
+
"ben-Beng": "Bengali",
|
|
38
|
+
"ben-Beng-red": "Bengali (reduced)",
|
|
39
|
+
"ben-Beng-east": "East Bengali",
|
|
40
|
+
"bho-Deva": "Bhojpuri",
|
|
41
|
+
"bxk-Latn": "Bukusu",
|
|
42
|
+
"cat-Latn": "Catalan",
|
|
43
|
+
"ceb-Latn": "Cebuano",
|
|
44
|
+
"ces-Latn": "Czech",
|
|
45
|
+
"cjy-Latn": "Jin (Wiktionary)",
|
|
46
|
+
"ckb-Arab": "Sorani",
|
|
47
|
+
"cmn-Hans": "Mandarin (Simplified)*",
|
|
48
|
+
"cmn-Hant": "Mandarin (Traditional)*",
|
|
49
|
+
"cmn-Latn": "Mandarin (Pinyin)*",
|
|
50
|
+
"csb-Latn": "Kashubian",
|
|
51
|
+
"deu-Latn": "German",
|
|
52
|
+
"deu-Latn-np": "German†",
|
|
53
|
+
"deu-Latn-nar": "German (more phonetic)",
|
|
54
|
+
"eng-Latn": "English‡",
|
|
55
|
+
"epo-Latn": "Esperanto",
|
|
56
|
+
"est-Latn": "Estonian",
|
|
57
|
+
"fas-Arab": "Farsi (Perso-Arabic)",
|
|
58
|
+
"fin-Latn": "Finnish",
|
|
59
|
+
"fra-Latn": "French",
|
|
60
|
+
"fra-Latn-np": "French†",
|
|
61
|
+
"fra-Latn-p": "French (more phonetic)",
|
|
62
|
+
"ful-Latn": "Fulah",
|
|
63
|
+
"gan-Latn": "Gan (Wiktionary)",
|
|
64
|
+
"glg-Latn": "Galician",
|
|
65
|
+
"got-Goth": "Gothic",
|
|
66
|
+
"got-Latn": "Gothic (Latin)",
|
|
67
|
+
"hak-Latn": "Hakka (pha̍k-fa-sṳ)",
|
|
68
|
+
"hat-Latn-bab": "Haitian (Latin-Babel)",
|
|
69
|
+
"hau-Latn": "Hausa",
|
|
70
|
+
"hin-Deva": "Hindi",
|
|
71
|
+
"hmn-Latn": "Hmong",
|
|
72
|
+
"hrv-Latn": "Croatian",
|
|
73
|
+
"hsn-Latn": "Xiang (Wiktionary)",
|
|
74
|
+
"hun-Latn": "Hungarian",
|
|
75
|
+
"ilo-Latn": "Ilocano",
|
|
76
|
+
"ind-Latn": "Indonesian",
|
|
77
|
+
"ita-Latn": "Italian",
|
|
78
|
+
"jam-Latn": "Jamaican",
|
|
79
|
+
"jav-Latn": "Javanese",
|
|
80
|
+
"jpn-Hira": "Japanese (Hiragana)",
|
|
81
|
+
"jpn-Hira-red": "red Japanese (Hiragana, reduced)",
|
|
82
|
+
"jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
|
|
83
|
+
"jpn-Kana": "Japanese (Katakana)",
|
|
84
|
+
"jpn-Kana-red": "red Japanese (Katakana, reduced)",
|
|
85
|
+
"kat-Geor": "Georgian",
|
|
86
|
+
"kaz-Cyrl": "Kazakh (Cyrillic)",
|
|
87
|
+
"kaz-Cyrl-bab": "bab Kazakh (Cyrillic—Babel)",
|
|
88
|
+
"kaz-Latn": "Kazakh (Latin)",
|
|
89
|
+
"kbd-Cyrl": "Kabardian",
|
|
90
|
+
"khm-Khmr": "Khmer",
|
|
91
|
+
"kin-Latn": "Kinyarwanda",
|
|
92
|
+
"kir-Arab": "Kyrgyz (Perso-Arabic)",
|
|
93
|
+
"kir-Cyrl": "Kyrgyz (Cyrillic)",
|
|
94
|
+
"kir-Latn": "Kyrgyz (Latin)",
|
|
95
|
+
"kmr-Latn": "Kurmanji",
|
|
96
|
+
"kmr-Latn-red": "Kurmanji (reduced)",
|
|
97
|
+
"kor-Hang": "Korean",
|
|
98
|
+
"lao-Laoo": "Lao",
|
|
99
|
+
"lao-Laoo-prereform": "Lao (Before spelling reform)",
|
|
100
|
+
"lav-Latn": "Latvian",
|
|
101
|
+
"lez-Cyrl": "Lezgian",
|
|
102
|
+
"lij-Latn": "Ligurian",
|
|
103
|
+
"lit-Latn": "Lithuanian",
|
|
104
|
+
"lsm-Latn": "Saamia",
|
|
105
|
+
"ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
|
|
106
|
+
"lug-Latn": "Ganda / Luganda",
|
|
107
|
+
"mal-Mlym": "Malayalam",
|
|
108
|
+
"mar-Deva": "Marathi",
|
|
109
|
+
"mlt-Latn": "Maltese",
|
|
110
|
+
"mon-Cyrl-bab": "Mongolian (Cyrillic)",
|
|
111
|
+
"mri-Latn": "Maori",
|
|
112
|
+
"msa-Latn": "Malay",
|
|
113
|
+
"mya-Mymr": "Burmese",
|
|
114
|
+
"nan-Latn": "Hokkien (pe̍h-oē-jī)",
|
|
115
|
+
"nan-Latn-tl": "Hokkien (Tâi-lô)",
|
|
116
|
+
"nld-Latn": "Dutch",
|
|
117
|
+
"nya-Latn": "Chichewa",
|
|
118
|
+
"ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
|
|
119
|
+
"ood-Latn-sax": "Tohono O'odham (Saxton)",
|
|
120
|
+
"ori-Orya": "Odia",
|
|
121
|
+
"orm-Latn": "Oromo",
|
|
122
|
+
"pan-Guru": "Punjabi (Eastern)",
|
|
123
|
+
"pol-Latn": "Polish",
|
|
124
|
+
"por-Latn": "Portuguese",
|
|
125
|
+
"quy-Latn": "Ayacucho Quechua / Quechua Chanka",
|
|
126
|
+
"ron-Latn": "Romanian",
|
|
127
|
+
"run-Latn": "Rundi",
|
|
128
|
+
"rus-Cyrl": "Russian",
|
|
129
|
+
"sag-Latn": "Sango",
|
|
130
|
+
"sin-Sinh": "Sinhala",
|
|
131
|
+
"slv-Latn": "Slovene / Slovenian",
|
|
132
|
+
"sna-Latn": "Shona",
|
|
133
|
+
"som-Latn": "Somali",
|
|
134
|
+
"spa-Latn": "Spanish",
|
|
135
|
+
"spa-Latn-eu": "Spanish (Iberian)",
|
|
136
|
+
"sqi-Latn": "Albanian",
|
|
137
|
+
"sro-Latn": "Sardinian (Campidanese)",
|
|
138
|
+
"srp-Latn": "Serbian (Latin)",
|
|
139
|
+
"srp-Cyrl": "Serbian (Cyrillic)",
|
|
140
|
+
"swa-Latn": "Swahili",
|
|
141
|
+
"swa-Latn-red": "Swahili (reduced)",
|
|
142
|
+
"swe-Latn": "Swedish",
|
|
143
|
+
"tam-Taml": "Tamil",
|
|
144
|
+
"tam-Taml-red": "Tamil (reduced)",
|
|
145
|
+
"tel-Telu": "Telugu",
|
|
146
|
+
"tgk-Cyrl": "Tajik",
|
|
147
|
+
"tgl-Latn": "Tagalog",
|
|
148
|
+
"tgl-Latn-red": "Tagalog (reduced)",
|
|
149
|
+
"tha-Thai": "Thai",
|
|
150
|
+
"tir-Ethi": "Tigrinya",
|
|
151
|
+
"tir-Ethi-pp": "Tigrinya (more phonemic)",
|
|
152
|
+
"tir-Ethi-red": "Tigrinya (reduced)",
|
|
153
|
+
"tok-Latn": "Toki Pona",
|
|
154
|
+
"tpi-Latn": "Tok Pisin",
|
|
155
|
+
"tuk-Cyrl": "Turkmen (Cyrillic)",
|
|
156
|
+
"tuk-Latn": "Turkmen (Latin)",
|
|
157
|
+
"tur-Latn": "Turkish (Latin)",
|
|
158
|
+
"tur-Latn-bab": "Turkish (Latin—Babel)",
|
|
159
|
+
"tur-Latn-red": "Turkish (reduced)",
|
|
160
|
+
"ukr-Cyrl": "Ukrainian",
|
|
161
|
+
"urd-Arab": "Urdu",
|
|
162
|
+
"uig-Arab": "Uyghur (Perso-Arabic)",
|
|
163
|
+
"uzb-Cyrl": "Uzbek (Cyrillic)",
|
|
164
|
+
"uzb-Latn": "Uzbek (Latin)",
|
|
165
|
+
"vie-Latn": "Vietnamese",
|
|
166
|
+
"wuu-Latn": "Shanghainese Wu (Wiktionary)",
|
|
167
|
+
"xho-Latn": "Xhosa",
|
|
168
|
+
"yor-Latn": "Yoruba",
|
|
169
|
+
"yue-Latn": "Cantonese (Jyutping)",
|
|
170
|
+
"yue-Hant": "Cantonese (Character)",
|
|
171
|
+
"zha-Latn": "Zhuang",
|
|
172
|
+
"zul-Latn": "Zulu",
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
if language_code not in supported_languages:
|
|
176
|
+
for key in supported_languages:
|
|
177
|
+
if key.startswith(language_code):
|
|
178
|
+
warnings.warn(
|
|
179
|
+
f"Unsupported language code: {language_code}; trying to use similar key {key}"
|
|
180
|
+
)
|
|
181
|
+
language_code = key
|
|
182
|
+
break
|
|
183
|
+
else:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
if not string.strip():
|
|
189
|
+
return ""
|
|
190
|
+
|
|
191
|
+
return self._epitran_obj(language_code).transliterate(string)
|
|
@@ -126,38 +126,20 @@ espeak: EspeakNG | None = None
|
|
|
126
126
|
_espeak_lock = threading.Lock()
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
("en", "Hello World"),
|
|
147
|
-
("uk", "Привіт світ"),
|
|
148
|
-
("fr", "Bonjour le monde"),
|
|
149
|
-
("ru", "Привет мир"),
|
|
150
|
-
("en", "Hello World"),
|
|
151
|
-
("en", "Hello World"),
|
|
152
|
-
("ru", "Привет мир"),
|
|
153
|
-
("ru", "Привет мир"),
|
|
154
|
-
("uk", "Привіт світ"),
|
|
155
|
-
("uk", "Привіт світ"),
|
|
156
|
-
("uk", "Привіт світ"),
|
|
157
|
-
("en", "Hello, World"),
|
|
158
|
-
("uk", "Привіт світ"),
|
|
159
|
-
("en", "Hello! World"),
|
|
160
|
-
("uk", "Привіт, світ"),
|
|
161
|
-
]
|
|
162
|
-
for lang, text in data:
|
|
163
|
-
print(f"{lang.upper()}: '{text}' -> '{text_to_ipa(text, lang)}'")
|
|
129
|
+
class EspeakIpaProvider:
|
|
130
|
+
def __init__(self, check_chars: bool = True):
|
|
131
|
+
self.check_chars = check_chars
|
|
132
|
+
|
|
133
|
+
def to_ipa(self, string: str, language_code: str) -> str:
|
|
134
|
+
with _espeak_lock:
|
|
135
|
+
global espeak
|
|
136
|
+
if espeak is None:
|
|
137
|
+
espeak = EspeakNG(language_code)
|
|
138
|
+
espeak.set_lang(language_code)
|
|
139
|
+
ipa = espeak.text_to_ipa(string, remove_stress=True)
|
|
140
|
+
if self.check_chars:
|
|
141
|
+
for char in {"(", ")", "[", "]"}:
|
|
142
|
+
assert char not in ipa, (
|
|
143
|
+
f"Unexpected character '{char}' in IPA '{ipa}' with lang '{language_code}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
|
|
144
|
+
)
|
|
145
|
+
return ipa
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
_mapping = {
|
|
4
|
+
# Vowels
|
|
5
|
+
"i": "i",
|
|
6
|
+
"y": "i",
|
|
7
|
+
"ɨ": "i",
|
|
8
|
+
"ʉ": "u",
|
|
9
|
+
"ɯ": "u",
|
|
10
|
+
"u": "u",
|
|
11
|
+
"ɪ": "i",
|
|
12
|
+
"ʏ": "i",
|
|
13
|
+
"ʊ": "u",
|
|
14
|
+
"e": "e",
|
|
15
|
+
"ø": "e",
|
|
16
|
+
"ɘ": "e",
|
|
17
|
+
"ɵ": "o",
|
|
18
|
+
"ɤ": "o",
|
|
19
|
+
"o": "o",
|
|
20
|
+
"ə": "e",
|
|
21
|
+
"ɛ": "e",
|
|
22
|
+
"œ": "e",
|
|
23
|
+
"ɜ": "e",
|
|
24
|
+
"ɞ": "e",
|
|
25
|
+
"ʌ": "a",
|
|
26
|
+
"ɔ": "o",
|
|
27
|
+
"æ": "a",
|
|
28
|
+
"ɐ": "a",
|
|
29
|
+
"a": "a",
|
|
30
|
+
"ɶ": "a",
|
|
31
|
+
"ä": "a",
|
|
32
|
+
"ɑ": "a",
|
|
33
|
+
"ɒ": "o",
|
|
34
|
+
# Pulmonic Consonants
|
|
35
|
+
"p": "p",
|
|
36
|
+
"b": "b",
|
|
37
|
+
"t": "t",
|
|
38
|
+
"d": "d",
|
|
39
|
+
"ʈ": "t",
|
|
40
|
+
"ɖ": "d",
|
|
41
|
+
"c": "k",
|
|
42
|
+
"ɟ": "j",
|
|
43
|
+
"k": "k",
|
|
44
|
+
"g": "g",
|
|
45
|
+
"q": "k",
|
|
46
|
+
"ɢ": "g",
|
|
47
|
+
"ɡ": "g",
|
|
48
|
+
"m": "m",
|
|
49
|
+
"ɱ": "m",
|
|
50
|
+
"n": "n",
|
|
51
|
+
"ɳ": "n",
|
|
52
|
+
"ɲ": "nj",
|
|
53
|
+
"ŋ": "ng",
|
|
54
|
+
"ʋ": "v",
|
|
55
|
+
"ɹ": "r",
|
|
56
|
+
"ɻ": "r",
|
|
57
|
+
"j": "j",
|
|
58
|
+
"ɰ": "w",
|
|
59
|
+
"ʙ": "b",
|
|
60
|
+
"r": "r",
|
|
61
|
+
"ʀ": "r",
|
|
62
|
+
"ɾ": "r",
|
|
63
|
+
"ɸ": "f",
|
|
64
|
+
"β": "v",
|
|
65
|
+
"f": "f",
|
|
66
|
+
"v": "v",
|
|
67
|
+
"θ": "th",
|
|
68
|
+
"ð": "dh",
|
|
69
|
+
"s": "s",
|
|
70
|
+
"z": "z",
|
|
71
|
+
"ʃ": "sh",
|
|
72
|
+
"ʒ": "zh",
|
|
73
|
+
"ʂ": "sh",
|
|
74
|
+
"ʐ": "zh",
|
|
75
|
+
"ç": "h",
|
|
76
|
+
"ʝ": "j",
|
|
77
|
+
"x": "h",
|
|
78
|
+
"ʑ": "z",
|
|
79
|
+
"ɣ": "gh",
|
|
80
|
+
"χ": "h",
|
|
81
|
+
"ʁ": "gh",
|
|
82
|
+
"ħ": "h",
|
|
83
|
+
"ʕ": "a",
|
|
84
|
+
"h": "h",
|
|
85
|
+
# Clicks
|
|
86
|
+
"ʘ": "o",
|
|
87
|
+
"ǀ": "l",
|
|
88
|
+
"ǃ": "!",
|
|
89
|
+
"ǂ": "!",
|
|
90
|
+
"ǁ": "l",
|
|
91
|
+
# Implosives and Ejectives
|
|
92
|
+
"ɓ": "b",
|
|
93
|
+
"ɗ": "d",
|
|
94
|
+
"ʄ": "j",
|
|
95
|
+
"ɠ": "g",
|
|
96
|
+
"ʛ": "g",
|
|
97
|
+
# Suprasegmentals
|
|
98
|
+
"ˈ": "",
|
|
99
|
+
"ˌ": "",
|
|
100
|
+
"ː": "",
|
|
101
|
+
"ˑ": "",
|
|
102
|
+
"|": "",
|
|
103
|
+
"‖": "",
|
|
104
|
+
".": "",
|
|
105
|
+
"ʼ": "",
|
|
106
|
+
# Tones and word accents
|
|
107
|
+
"̋": "",
|
|
108
|
+
"́": "",
|
|
109
|
+
"̄": "",
|
|
110
|
+
"̀": "",
|
|
111
|
+
"̏": "",
|
|
112
|
+
"̌": "",
|
|
113
|
+
"̂": "",
|
|
114
|
+
"᷄": "",
|
|
115
|
+
"᷅": "",
|
|
116
|
+
"᷈": "",
|
|
117
|
+
"᷉": "",
|
|
118
|
+
# Other symbols and diacritics
|
|
119
|
+
"ʲ": "",
|
|
120
|
+
"ʷ": "w",
|
|
121
|
+
"ʱ": "h",
|
|
122
|
+
"ʰ": "h",
|
|
123
|
+
"ʴ": "r",
|
|
124
|
+
"ʳ": "r",
|
|
125
|
+
"ˠ": "g",
|
|
126
|
+
"ʡ": "a",
|
|
127
|
+
"ʢ": "a",
|
|
128
|
+
"ɭ": "l",
|
|
129
|
+
"_": "",
|
|
130
|
+
'"': "",
|
|
131
|
+
" ": "",
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def ipa2lat(ipa_string: str) -> str:
|
|
136
|
+
"""Convert IPA string to a simplified Latin string"""
|
|
137
|
+
|
|
138
|
+
if not ipa_string:
|
|
139
|
+
return ""
|
|
140
|
+
|
|
141
|
+
string = ipa_string[:]
|
|
142
|
+
for ipa, simple in _mapping.items():
|
|
143
|
+
string = string.replace(ipa, simple)
|
|
144
|
+
|
|
145
|
+
for symbol in string:
|
|
146
|
+
if symbol not in "abcdefghijklmnopqrstuvwxyz":
|
|
147
|
+
warnings.warn(
|
|
148
|
+
f'ipa2lat: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
return string
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import Awaitable, Callable
|
|
3
|
+
from stark.core.patterns.parsing import ParseError
|
|
4
|
+
from stark.tools.common.span import Span
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _token_span_to_char_span(tokens: list[str], span: Span, phrase: str) -> Span:
|
|
8
|
+
"""Convert a token span (by index) to a character span in the original phrase."""
|
|
9
|
+
if not tokens or not (0 <= span.start <= span.end <= len(tokens)):
|
|
10
|
+
return Span(0, 0)
|
|
11
|
+
# Find the start and end char positions of the tokens in the original phrase
|
|
12
|
+
positions = []
|
|
13
|
+
idx = 0
|
|
14
|
+
for token in tokens:
|
|
15
|
+
# skip leading spaces
|
|
16
|
+
while idx < len(phrase) and phrase[idx].isspace():
|
|
17
|
+
idx += 1
|
|
18
|
+
start = idx
|
|
19
|
+
idx += len(token)
|
|
20
|
+
end = idx
|
|
21
|
+
positions.append((start, end))
|
|
22
|
+
if not positions or span.start >= len(positions) or span.end > len(positions):
|
|
23
|
+
return Span(0, 0)
|
|
24
|
+
char_start = positions[span.start][0]
|
|
25
|
+
char_end = (
|
|
26
|
+
positions[span.end - 1][1]
|
|
27
|
+
if span.end > span.start
|
|
28
|
+
else positions[span.start][0]
|
|
29
|
+
)
|
|
30
|
+
return Span(char_start, char_end)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def _binary_cookie_trim[T](
|
|
34
|
+
tokens: list[str],
|
|
35
|
+
start: int,
|
|
36
|
+
end: int,
|
|
37
|
+
parser: Callable[[str], Awaitable[T]],
|
|
38
|
+
baseline_value: T,
|
|
39
|
+
phrase: str,
|
|
40
|
+
) -> tuple[Span, str, T]:
|
|
41
|
+
"""
|
|
42
|
+
Return minimal (char Span, substring, value) such that
|
|
43
|
+
parser(' '.join(tokens[span.start:span.end])) == baseline_value.
|
|
44
|
+
"""
|
|
45
|
+
# Binary search for the leftmost index such that tokens[left:end] still parses to baseline_value.
|
|
46
|
+
left = start
|
|
47
|
+
l_low, l_high = start, end - 1
|
|
48
|
+
while l_low <= l_high:
|
|
49
|
+
mid = (l_low + l_high) // 2
|
|
50
|
+
try:
|
|
51
|
+
r = await parser(" ".join(tokens[mid:end]))
|
|
52
|
+
except ParseError:
|
|
53
|
+
r = None
|
|
54
|
+
if r == baseline_value:
|
|
55
|
+
left = mid
|
|
56
|
+
l_low = mid + 1
|
|
57
|
+
else:
|
|
58
|
+
l_high = mid - 1
|
|
59
|
+
|
|
60
|
+
# Binary search for the rightmost index such that tokens[left:right] still parses to baseline_value.
|
|
61
|
+
right = end
|
|
62
|
+
r_low, r_high = left + 1, end
|
|
63
|
+
while r_low <= r_high:
|
|
64
|
+
mid = (r_low + r_high) // 2
|
|
65
|
+
try:
|
|
66
|
+
res = await parser(" ".join(tokens[left:mid]))
|
|
67
|
+
except ParseError:
|
|
68
|
+
res = None
|
|
69
|
+
if res == baseline_value:
|
|
70
|
+
right = mid
|
|
71
|
+
r_high = mid - 1
|
|
72
|
+
else:
|
|
73
|
+
r_low = mid + 1
|
|
74
|
+
token_span = Span(left, right)
|
|
75
|
+
char_span = _token_span_to_char_span(tokens, token_span, phrase)
|
|
76
|
+
substr = phrase[char_span.start : char_span.end]
|
|
77
|
+
return char_span, substr, baseline_value
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def sliding_window_parse[T](
|
|
81
|
+
phrase: str,
|
|
82
|
+
parser: Callable[[str], Awaitable[T]],
|
|
83
|
+
min_window: int = 1,
|
|
84
|
+
max_window: int | None = None,
|
|
85
|
+
concurrency: int | None = None,
|
|
86
|
+
find_one: bool = True,
|
|
87
|
+
) -> list[tuple[Span, str, T]]:
|
|
88
|
+
tokens: list[str] = phrase.split()
|
|
89
|
+
n: int = len(tokens)
|
|
90
|
+
if n == 0 or parser is None:
|
|
91
|
+
return None
|
|
92
|
+
if max_window is None:
|
|
93
|
+
max_window = n
|
|
94
|
+
|
|
95
|
+
if concurrency is not None and concurrency > 0:
|
|
96
|
+
# Use a semaphore to limit concurrency of parser calls.
|
|
97
|
+
sem = asyncio.Semaphore(concurrency)
|
|
98
|
+
|
|
99
|
+
async def try_window(i: int, j: int) -> T:
|
|
100
|
+
async with sem:
|
|
101
|
+
try:
|
|
102
|
+
return await parser(" ".join(tokens[i:j]))
|
|
103
|
+
except ParseError:
|
|
104
|
+
return None
|
|
105
|
+
else:
|
|
106
|
+
|
|
107
|
+
async def try_window(i: int, j: int) -> T:
|
|
108
|
+
try:
|
|
109
|
+
return await parser(" ".join(tokens[i:j]))
|
|
110
|
+
except ParseError:
|
|
111
|
+
return None
|
|
112
|
+
|
|
113
|
+
# Slide a window of decreasing size over the tokens, left to right.
|
|
114
|
+
# Try parsing for each window. Once successful, trim to minimal window.
|
|
115
|
+
results: list[tuple[Span, str, T]] = []
|
|
116
|
+
for window_size in range(min(max_window, n), min_window - 1, -1):
|
|
117
|
+
for start in range(0, n - window_size + 1):
|
|
118
|
+
end = start + window_size
|
|
119
|
+
try:
|
|
120
|
+
res = await try_window(start, end)
|
|
121
|
+
except ParseError:
|
|
122
|
+
res = None
|
|
123
|
+
if res is None:
|
|
124
|
+
continue
|
|
125
|
+
char_span, substr, value = await _binary_cookie_trim(
|
|
126
|
+
tokens, start, end, parser, res, phrase
|
|
127
|
+
)
|
|
128
|
+
result = (char_span, substr, value)
|
|
129
|
+
if find_one:
|
|
130
|
+
return [result]
|
|
131
|
+
else:
|
|
132
|
+
results.append(result)
|
|
133
|
+
# TODO: limit next windows left edge to char_span.end
|
|
134
|
+
|
|
135
|
+
if results:
|
|
136
|
+
return results
|
|
137
|
+
|
|
138
|
+
# If no valid window is found, raise an error.
|
|
139
|
+
raise ParseError(f"No valid window found using parser={parser} in phrase={phrase}")
|
|
@@ -1,399 +0,0 @@
|
|
|
1
|
-
from functools import lru_cache
|
|
2
|
-
import warnings
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
from stark.tools.phonetic import espeak_ng
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@lru_cache
|
|
9
|
-
def phonetic(string: str, language_code: str):
|
|
10
|
-
"""
|
|
11
|
-
Converts a string to simplified latin transcription via phonetic (ipa) transliteration.
|
|
12
|
-
"""
|
|
13
|
-
return " ".join(
|
|
14
|
-
_ipa2lat(_to_ipa(word, language_code)) for word in string.split()
|
|
15
|
-
) # TODO: try calling _to_ipa for the entire sentence
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def _to_ipa(string: str, language_code: str) -> str:
|
|
19
|
-
return _to_ipa__espeak_bin(string, language_code)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _ipa2lat(ipa_string: str) -> str:
|
|
23
|
-
"""Converts IPA to a simplified latin transcription."""
|
|
24
|
-
return _ipa2lat__dict(ipa_string)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# ----- Implementations: -----
|
|
28
|
-
|
|
29
|
-
_mapping = {
|
|
30
|
-
# Vowels
|
|
31
|
-
"i": "i",
|
|
32
|
-
"y": "i",
|
|
33
|
-
"ɨ": "i",
|
|
34
|
-
"ʉ": "u",
|
|
35
|
-
"ɯ": "u",
|
|
36
|
-
"u": "u",
|
|
37
|
-
"ɪ": "i",
|
|
38
|
-
"ʏ": "i",
|
|
39
|
-
"ʊ": "u",
|
|
40
|
-
"e": "e",
|
|
41
|
-
"ø": "e",
|
|
42
|
-
"ɘ": "e",
|
|
43
|
-
"ɵ": "o",
|
|
44
|
-
"ɤ": "o",
|
|
45
|
-
"o": "o",
|
|
46
|
-
"ə": "e",
|
|
47
|
-
"ɛ": "e",
|
|
48
|
-
"œ": "e",
|
|
49
|
-
"ɜ": "e",
|
|
50
|
-
"ɞ": "e",
|
|
51
|
-
"ʌ": "a",
|
|
52
|
-
"ɔ": "o",
|
|
53
|
-
"æ": "a",
|
|
54
|
-
"ɐ": "a",
|
|
55
|
-
"a": "a",
|
|
56
|
-
"ɶ": "a",
|
|
57
|
-
"ä": "a",
|
|
58
|
-
"ɑ": "a",
|
|
59
|
-
"ɒ": "o",
|
|
60
|
-
# Pulmonic Consonants
|
|
61
|
-
"p": "p",
|
|
62
|
-
"b": "b",
|
|
63
|
-
"t": "t",
|
|
64
|
-
"d": "d",
|
|
65
|
-
"ʈ": "t",
|
|
66
|
-
"ɖ": "d",
|
|
67
|
-
"c": "k",
|
|
68
|
-
"ɟ": "j",
|
|
69
|
-
"k": "k",
|
|
70
|
-
"g": "g",
|
|
71
|
-
"q": "k",
|
|
72
|
-
"ɢ": "g",
|
|
73
|
-
"ɡ": "g",
|
|
74
|
-
"m": "m",
|
|
75
|
-
"ɱ": "m",
|
|
76
|
-
"n": "n",
|
|
77
|
-
"ɳ": "n",
|
|
78
|
-
"ɲ": "nj",
|
|
79
|
-
"ŋ": "ng",
|
|
80
|
-
"ʋ": "v",
|
|
81
|
-
"ɹ": "r",
|
|
82
|
-
"ɻ": "r",
|
|
83
|
-
"j": "j",
|
|
84
|
-
"ɰ": "w",
|
|
85
|
-
"ʙ": "b",
|
|
86
|
-
"r": "r",
|
|
87
|
-
"ʀ": "r",
|
|
88
|
-
"ɾ": "r",
|
|
89
|
-
"ɸ": "f",
|
|
90
|
-
"β": "v",
|
|
91
|
-
"f": "f",
|
|
92
|
-
"v": "v",
|
|
93
|
-
"θ": "th",
|
|
94
|
-
"ð": "dh",
|
|
95
|
-
"s": "s",
|
|
96
|
-
"z": "z",
|
|
97
|
-
"ʃ": "sh",
|
|
98
|
-
"ʒ": "zh",
|
|
99
|
-
"ʂ": "sh",
|
|
100
|
-
"ʐ": "zh",
|
|
101
|
-
"ç": "h",
|
|
102
|
-
"ʝ": "j",
|
|
103
|
-
"x": "h",
|
|
104
|
-
"ʑ": "z",
|
|
105
|
-
"ɣ": "gh",
|
|
106
|
-
"χ": "h",
|
|
107
|
-
"ʁ": "gh",
|
|
108
|
-
"ħ": "h",
|
|
109
|
-
"ʕ": "a",
|
|
110
|
-
"h": "h",
|
|
111
|
-
# Clicks
|
|
112
|
-
"ʘ": "o",
|
|
113
|
-
"ǀ": "l",
|
|
114
|
-
"ǃ": "!",
|
|
115
|
-
"ǂ": "!",
|
|
116
|
-
"ǁ": "l",
|
|
117
|
-
# Implosives and Ejectives
|
|
118
|
-
"ɓ": "b",
|
|
119
|
-
"ɗ": "d",
|
|
120
|
-
"ʄ": "j",
|
|
121
|
-
"ɠ": "g",
|
|
122
|
-
"ʛ": "g",
|
|
123
|
-
# Suprasegmentals
|
|
124
|
-
"ˈ": "",
|
|
125
|
-
"ˌ": "",
|
|
126
|
-
"ː": "",
|
|
127
|
-
"ˑ": "",
|
|
128
|
-
"|": "",
|
|
129
|
-
"‖": "",
|
|
130
|
-
".": "",
|
|
131
|
-
"ʼ": "",
|
|
132
|
-
# Tones and word accents
|
|
133
|
-
"̋": "",
|
|
134
|
-
"́": "",
|
|
135
|
-
"̄": "",
|
|
136
|
-
"̀": "",
|
|
137
|
-
"̏": "",
|
|
138
|
-
"̌": "",
|
|
139
|
-
"̂": "",
|
|
140
|
-
"᷄": "",
|
|
141
|
-
"᷅": "",
|
|
142
|
-
"᷈": "",
|
|
143
|
-
"᷉": "",
|
|
144
|
-
# Other symbols and diacritics
|
|
145
|
-
"ʲ": "",
|
|
146
|
-
"ʷ": "w",
|
|
147
|
-
"ʱ": "h",
|
|
148
|
-
"ʰ": "h",
|
|
149
|
-
"ʴ": "r",
|
|
150
|
-
"ʳ": "r",
|
|
151
|
-
"ˠ": "g",
|
|
152
|
-
"ʡ": "a",
|
|
153
|
-
"ʢ": "a",
|
|
154
|
-
"ɭ": "l",
|
|
155
|
-
"_": "",
|
|
156
|
-
'"': "",
|
|
157
|
-
" ": "",
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def _ipa2lat__dict(ipa_string: str) -> str:
|
|
162
|
-
if not ipa_string:
|
|
163
|
-
return ""
|
|
164
|
-
|
|
165
|
-
string = ipa_string[:]
|
|
166
|
-
for ipa, simple in _mapping.items():
|
|
167
|
-
string = string.replace(ipa, simple)
|
|
168
|
-
|
|
169
|
-
for symbol in string:
|
|
170
|
-
if symbol not in "abcdefghijklmnopqrstuvwxyz":
|
|
171
|
-
warnings.warn(
|
|
172
|
-
f'SuggestionsManager._ipa_to_latin: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
return string
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
def _to_ipa__espeak_bin(string: str, language_code: str) -> str:
    """Convert text to IPA by delegating to ``espeak_ng.text_to_ipa``.

    Args:
        string: Text to transcribe.
        language_code: Language identifier, forwarded unchanged to espeak_ng.

    Returns:
        Whatever ``espeak_ng.text_to_ipa`` returns for the given arguments.
    """
    ipa = espeak_ng.text_to_ipa(string, language_code)
    return ipa
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
# def to_ipa__espeak_cli(string: str, language_code: str) -> str:
|
|
183
|
-
# import re
|
|
184
|
-
# import subprocess
|
|
185
|
-
|
|
186
|
-
# result = subprocess.run(
|
|
187
|
-
# ["espeak-ng", "--ipa", f"-v{language_code}", "-q", string],
|
|
188
|
-
# capture_output=True,
|
|
189
|
-
# text=True,
|
|
190
|
-
# )
|
|
191
|
-
# return re.compile(r"\(.*?\)").sub("", result.stdout.strip()).strip()
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
# @lru_cache
|
|
195
|
-
# def _epitran_obj(language_code: str) -> Epitran:
|
|
196
|
-
# from epitran import Epitran
|
|
197
|
-
|
|
198
|
-
# return Epitran(language_code) # this one is long, about an entire second
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
# def to_ipa__epitran(string: str, language_code: str) -> str:
|
|
202
|
-
# # if language_code.startswith("en"):
|
|
203
|
-
# # raise NotImplementedError(
|
|
204
|
-
# # "IPA to Epitran conversion for English is not implemented yet."
|
|
205
|
-
# # )
|
|
206
|
-
|
|
207
|
-
# if language_code == "ru":
|
|
208
|
-
# language_code = "rus-Cyrl"
|
|
209
|
-
|
|
210
|
-
# # Code: Language (Script)
|
|
211
|
-
# supported_languages = {
|
|
212
|
-
# "aar-Latn": "Afar",
|
|
213
|
-
# "afr-Latn": "Afrikaans",
|
|
214
|
-
# "aii-Syrc": "Assyrian Neo-Aramaic",
|
|
215
|
-
# "amh-Ethi": "Amharic",
|
|
216
|
-
# "amh-Ethi-pp": "Amharic (more phonetic)",
|
|
217
|
-
# "amh-Ethi-red": "Amharic (reduced)",
|
|
218
|
-
# "ara-Arab": "Literary Arabic",
|
|
219
|
-
# "ava-Cyrl": "Avaric",
|
|
220
|
-
# "aze-Cyrl": "Azerbaijani (Cyrillic)",
|
|
221
|
-
# "aze-Latn": "Azerbaijani (Latin)",
|
|
222
|
-
# "ben-Beng": "Bengali",
|
|
223
|
-
# "ben-Beng-red": "Bengali (reduced)",
|
|
224
|
-
# "ben-Beng-east": "East Bengali",
|
|
225
|
-
# "bho-Deva": "Bhojpuri",
|
|
226
|
-
# "bxk-Latn": "Bukusu",
|
|
227
|
-
# "cat-Latn": "Catalan",
|
|
228
|
-
# "ceb-Latn": "Cebuano",
|
|
229
|
-
# "ces-Latn": "Czech",
|
|
230
|
-
# "cjy-Latn": "Jin (Wiktionary)",
|
|
231
|
-
# "ckb-Arab": "Sorani",
|
|
232
|
-
# "cmn-Hans": "Mandarin (Simplified)*",
|
|
233
|
-
# "cmn-Hant": "Mandarin (Traditional)*",
|
|
234
|
-
# "cmn-Latn": "Mandarin (Pinyin)*",
|
|
235
|
-
# "csb-Latn": "Kashubian",
|
|
236
|
-
# "deu-Latn": "German",
|
|
237
|
-
# "deu-Latn-np": "German†",
|
|
238
|
-
# "deu-Latn-nar": "German (more phonetic)",
|
|
239
|
-
# "eng-Latn": "English‡",
|
|
240
|
-
# "epo-Latn": "Esperanto",
|
|
241
|
-
# "est-Latn": "Estonian",
|
|
242
|
-
# "fas-Arab": "Farsi (Perso-Arabic)",
|
|
243
|
-
# "fin-Latn": "Finnish",
|
|
244
|
-
# "fra-Latn": "French",
|
|
245
|
-
# "fra-Latn-np": "French†",
|
|
246
|
-
# "fra-Latn-p": "French (more phonetic)",
|
|
247
|
-
# "ful-Latn": "Fulah",
|
|
248
|
-
# "gan-Latn": "Gan (Wiktionary)",
|
|
249
|
-
# "glg-Latn": "Galician",
|
|
250
|
-
# "got-Goth": "Gothic",
|
|
251
|
-
# "got-Latn": "Gothic (Latin)",
|
|
252
|
-
# "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
|
|
253
|
-
# "hat-Latn-bab": "Haitian (Latin-Babel)",
|
|
254
|
-
# "hau-Latn": "Hausa",
|
|
255
|
-
# "hin-Deva": "Hindi",
|
|
256
|
-
# "hmn-Latn": "Hmong",
|
|
257
|
-
# "hrv-Latn": "Croatian",
|
|
258
|
-
# "hsn-Latn": "Xiang (Wiktionary)",
|
|
259
|
-
# "hun-Latn": "Hungarian",
|
|
260
|
-
# "ilo-Latn": "Ilocano",
|
|
261
|
-
# "ind-Latn": "Indonesian",
|
|
262
|
-
# "ita-Latn": "Italian",
|
|
263
|
-
# "jam-Latn": "Jamaican",
|
|
264
|
-
# "jav-Latn": "Javanese",
|
|
265
|
-
# "jpn-Hira": "Japanese (Hiragana)",
|
|
266
|
-
# "jpn-Hira-red": "Japanese (Hiragana, reduced)",
|
|
267
|
-
# "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
|
|
268
|
-
# "jpn-Kana": "Japanese (Katakana)",
|
|
269
|
-
# "jpn-Kana-red": "Japanese (Katakana, reduced)",
|
|
270
|
-
# "kat-Geor": "Georgian",
|
|
271
|
-
# "kaz-Cyrl": "Kazakh (Cyrillic)",
|
|
272
|
-
# "kaz-Cyrl-bab": "Kazakh (Cyrillic—Babel)",
|
|
273
|
-
# "kaz-Latn": "Kazakh (Latin)",
|
|
274
|
-
# "kbd-Cyrl": "Kabardian",
|
|
275
|
-
# "khm-Khmr": "Khmer",
|
|
276
|
-
# "kin-Latn": "Kinyarwanda",
|
|
277
|
-
# "kir-Arab": "Kyrgyz (Perso-Arabic)",
|
|
278
|
-
# "kir-Cyrl": "Kyrgyz (Cyrillic)",
|
|
279
|
-
# "kir-Latn": "Kyrgyz (Latin)",
|
|
280
|
-
# "kmr-Latn": "Kurmanji",
|
|
281
|
-
# "kmr-Latn-red": "Kurmanji (reduced)",
|
|
282
|
-
# "kor-Hang": "Korean",
|
|
283
|
-
# "lao-Laoo": "Lao",
|
|
284
|
-
# "lao-Laoo-prereform": "Lao (Before spelling reform)",
|
|
285
|
-
# "lav-Latn": "Latvian",
|
|
286
|
-
# "lez-Cyrl": "Lezgian",
|
|
287
|
-
# "lij-Latn": "Ligurian",
|
|
288
|
-
# "lit-Latn": "Lithuanian",
|
|
289
|
-
# "lsm-Latn": "Saamia",
|
|
290
|
-
# "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
|
|
291
|
-
# "lug-Latn": "Ganda / Luganda",
|
|
292
|
-
# "mal-Mlym": "Malayalam",
|
|
293
|
-
# "mar-Deva": "Marathi",
|
|
294
|
-
# "mlt-Latn": "Maltese",
|
|
295
|
-
# "mon-Cyrl-bab": "Mongolian (Cyrillic)",
|
|
296
|
-
# "mri-Latn": "Maori",
|
|
297
|
-
# "msa-Latn": "Malay",
|
|
298
|
-
# "mya-Mymr": "Burmese",
|
|
299
|
-
# "nan-Latn": "Hokkien (pe̍h-oē-jī)",
|
|
300
|
-
# "nan-Latn-tl": "Hokkien (Tâi-lô)",
|
|
301
|
-
# "nld-Latn": "Dutch",
|
|
302
|
-
# "nya-Latn": "Chichewa",
|
|
303
|
-
# "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
|
|
304
|
-
# "ood-Latn-sax": "Tohono O'odham (Saxton)",
|
|
305
|
-
# "ori-Orya": "Odia",
|
|
306
|
-
# "orm-Latn": "Oromo",
|
|
307
|
-
# "pan-Guru": "Punjabi (Eastern)",
|
|
308
|
-
# "pol-Latn": "Polish",
|
|
309
|
-
# "por-Latn": "Portuguese",
|
|
310
|
-
# "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
|
|
311
|
-
# "ron-Latn": "Romanian",
|
|
312
|
-
# "run-Latn": "Rundi",
|
|
313
|
-
# "rus-Cyrl": "Russian",
|
|
314
|
-
# "sag-Latn": "Sango",
|
|
315
|
-
# "sin-Sinh": "Sinhala",
|
|
316
|
-
# "slv-Latn": "Slovene / Slovenian",
|
|
317
|
-
# "sna-Latn": "Shona",
|
|
318
|
-
# "som-Latn": "Somali",
|
|
319
|
-
# "spa-Latn": "Spanish",
|
|
320
|
-
# "spa-Latn-eu": "Spanish (Iberian)",
|
|
321
|
-
# "sqi-Latn": "Albanian",
|
|
322
|
-
# "sro-Latn": "Sardinian (Campidanese)",
|
|
323
|
-
# "srp-Latn": "Serbian (Latin)",
|
|
324
|
-
# "srp-Cyrl": "Serbian (Cyrillic)",
|
|
325
|
-
# "swa-Latn": "Swahili",
|
|
326
|
-
# "swa-Latn-red": "Swahili (reduced)",
|
|
327
|
-
# "swe-Latn": "Swedish",
|
|
328
|
-
# "tam-Taml": "Tamil",
|
|
329
|
-
# "tam-Taml-red": "Tamil (reduced)",
|
|
330
|
-
# "tel-Telu": "Telugu",
|
|
331
|
-
# "tgk-Cyrl": "Tajik",
|
|
332
|
-
# "tgl-Latn": "Tagalog",
|
|
333
|
-
# "tgl-Latn-red": "Tagalog (reduced)",
|
|
334
|
-
# "tha-Thai": "Thai",
|
|
335
|
-
# "tir-Ethi": "Tigrinya",
|
|
336
|
-
# "tir-Ethi-pp": "Tigrinya (more phonemic)",
|
|
337
|
-
# "tir-Ethi-red": "Tigrinya (reduced)",
|
|
338
|
-
# "tok-Latn": "Toki Pona",
|
|
339
|
-
# "tpi-Latn": "Tok Pisin",
|
|
340
|
-
# "tuk-Cyrl": "Turkmen (Cyrillic)",
|
|
341
|
-
# "tuk-Latn": "Turkmen (Latin)",
|
|
342
|
-
# "tur-Latn": "Turkish (Latin)",
|
|
343
|
-
# "tur-Latn-bab": "Turkish (Latin—Babel)",
|
|
344
|
-
# "tur-Latn-red": "Turkish (reduced)",
|
|
345
|
-
# "ukr-Cyrl": "Ukrainian",
|
|
346
|
-
# "urd-Arab": "Urdu",
|
|
347
|
-
# "uig-Arab": "Uyghur (Perso-Arabic)",
|
|
348
|
-
# "uzb-Cyrl": "Uzbek (Cyrillic)",
|
|
349
|
-
# "uzb-Latn": "Uzbek (Latin)",
|
|
350
|
-
# "vie-Latn": "Vietnamese",
|
|
351
|
-
# "wuu-Latn": "Shanghainese Wu (Wiktionary)",
|
|
352
|
-
# "xho-Latn": "Xhosa",
|
|
353
|
-
# "yor-Latn": "Yoruba",
|
|
354
|
-
# "yue-Latn": "Cantonese (Jyutping)",
|
|
355
|
-
# "yue-Hant": "Cantonese (Character)",
|
|
356
|
-
# "zha-Latn": "Zhuang",
|
|
357
|
-
# "zul-Latn": "Zulu",
|
|
358
|
-
# }
|
|
359
|
-
|
|
360
|
-
# if language_code not in supported_languages:
|
|
361
|
-
# for key in supported_languages:
|
|
362
|
-
# if key.startswith(language_code):
|
|
363
|
-
# warnings.warn(
|
|
364
|
-
# f"Unsupported language code: {language_code}; trying to use similar key {key}"
|
|
365
|
-
# )
|
|
366
|
-
# language_code = key
|
|
367
|
-
# break
|
|
368
|
-
# else:
|
|
369
|
-
# raise ValueError(
|
|
370
|
-
# f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
|
|
371
|
-
# )
|
|
372
|
-
|
|
373
|
-
# if not string.strip():
|
|
374
|
-
# return ""
|
|
375
|
-
|
|
376
|
-
# return _epitran_obj(language_code).transliterate(string)
|
|
377
|
-
|
|
378
|
-
if __name__ == "__main__":
    # Intentionally a no-op: the manual smoke checks that follow are all
    # commented out, so running this module as a script does nothing.
    pass
|
|
380
|
-
# print("Starting...")
|
|
381
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
382
|
-
# print("Two more...")
|
|
383
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
384
|
-
# print(to_ipa__epitran("Hello", "eng-Latn"))
|
|
385
|
-
# print("Київ", to_ipa("Київ", "ua"))
|
|
386
|
-
# print("Київ", to_ipa("Київ", "uk"))
|
|
387
|
-
# test_cases = [
|
|
388
|
-
# 'Привет Иван как у тебя делая',
|
|
389
|
-
# 'любимые занятия надо делать часто',
|
|
390
|
-
# 'Съешь ещё этих мягких французских булок да выпей чаю',
|
|
391
|
-
# 'Хай',
|
|
392
|
-
# 'хай',
|
|
393
|
-
# 'Хэллоу',
|
|
394
|
-
# 'хэллоу',
|
|
395
|
-
# 'друг с другом',
|
|
396
|
-
# 'с пути фай',
|
|
397
|
-
# ]
|
|
398
|
-
# for test_case in test_cases:
|
|
399
|
-
# print((ipa := to_ipa__epitran(test_case, 'rus-Cyrl')), to_ipa__espeak(test_case, 'ru'), ipa2lat__ipapy(ipa), ipa2lat__dict(ipa), sep=' || ')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|