stark-engine 4.2.0__tar.gz → 4.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {stark_engine-4.2.0 → stark_engine-4.2.2}/PKG-INFO +1 -1
  2. {stark_engine-4.2.0 → stark_engine-4.2.2}/pyproject.toml +1 -1
  3. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/dictionary.py +46 -9
  4. stark_engine-4.2.2/stark/tools/phonetic/transcription/__init__.py +26 -0
  5. stark_engine-4.2.2/stark/tools/phonetic/transcription/epitran.py +191 -0
  6. stark_engine-4.2.0/stark/tools/phonetic/espeak_ng.py → stark_engine-4.2.2/stark/tools/phonetic/transcription/espeak.py +17 -35
  7. stark_engine-4.2.2/stark/tools/phonetic/transcription/ipa2lat.py +151 -0
  8. stark_engine-4.2.2/stark/tools/phonetic/transcription/protocol.py +5 -0
  9. stark_engine-4.2.2/stark/tools/sliding_window_parser.py +139 -0
  10. stark_engine-4.2.0/stark/tools/phonetic/ipa.py +0 -399
  11. {stark_engine-4.2.0 → stark_engine-4.2.2}/LICENSE.md +0 -0
  12. {stark_engine-4.2.0 → stark_engine-4.2.2}/README.md +0 -0
  13. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/__init__.py +0 -0
  14. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/__init__.py +0 -0
  15. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/command.py +0 -0
  16. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_context.py +0 -0
  17. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_manager.py +0 -0
  18. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/__init__.py +0 -0
  19. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/parsing.py +0 -0
  20. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/pattern.py +0 -0
  21. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/rules.py +0 -0
  22. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/__init__.py +0 -0
  23. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/number.py +0 -0
  24. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/object.py +0 -0
  25. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/slots.py +0 -0
  26. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/string.py +0 -0
  27. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time.py +0 -0
  28. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time_interval.py +0 -0
  29. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/word.py +0 -0
  30. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/blockage_detector.py +0 -0
  31. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/classproperty.py +0 -0
  32. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/dependencies.py +0 -0
  33. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/json_encoder.py +0 -0
  34. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/gcloud.py +0 -0
  35. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/protocols.py +0 -0
  36. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/silero.py +0 -0
  37. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/vosk.py +0 -0
  38. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/common/span.py +0 -0
  39. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/!examples.py +0 -0
  40. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/__init__.py +0 -0
  41. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/models.py +0 -0
  42. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/nl_dictionary_name.py +0 -0
  43. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/__init__.py +0 -0
  44. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_memory.py +0 -0
  45. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_sqlite.py +0 -0
  46. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/__init__.py +0 -0
  47. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyi +0 -0
  48. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyx +0 -0
  49. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/phonetic/simplephone.py +0 -0
  50. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/strtools.py +0 -0
  51. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/__init__.py +0 -0
  52. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/mode.py +0 -0
  53. {stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/voice_assistant.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stark-engine
3
- Version: 4.2.0
3
+ Version: 4.2.2
4
4
  Summary: S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants.
5
5
  License: CC BY-NC-SA 4.0
6
6
  License-File: LICENSE.md
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "stark-engine"
3
- version = "4.2.0"
3
+ version = "4.2.2"
4
4
  description = "S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants."
5
5
  authors = ["MarkParker5 <mark@parker-programs.com>"]
6
6
  license = "CC BY-NC-SA 4.0"
@@ -14,7 +14,11 @@ from stark.tools.levenshtein import (
14
14
  levenshtein_similarity,
15
15
  levenshtein_similarity_substring,
16
16
  )
17
- from stark.tools.phonetic.ipa import phonetic
17
+ from stark.tools.phonetic.transcription import (
18
+ transcription,
19
+ IpaProvider,
20
+ EspeakIpaProvider,
21
+ )
18
22
  from stark.tools.phonetic.simplephone import simplephone
19
23
  from stark.tools.strtools import find_substring_in_words_map, split_indices
20
24
 
@@ -52,8 +56,13 @@ class Dictionary:
52
56
  Phonetic-aware dictionary with metadata storage.
53
57
  """
54
58
 
55
- def __init__(self, storage: DictionaryStorageProtocol):
59
+ def __init__(
60
+ self,
61
+ storage: DictionaryStorageProtocol,
62
+ ipa_provider: IpaProvider = EspeakIpaProvider(),
63
+ ):
56
64
  self.storage: DictionaryStorageProtocol = storage
65
+ self.ipa_provider: IpaProvider = ipa_provider
57
66
 
58
67
  # ----------------------
59
68
  # Write methods
@@ -65,7 +74,9 @@ class Dictionary:
65
74
  Add a single entry to the dictionary.
66
75
  Phonetic conversion happens internally (mandatory).
67
76
  """
68
- phonetic_str = phonetic(name, language_code=language_code)
77
+ phonetic_str = transcription(
78
+ name, language_code=language_code, ipa_provider=self.ipa_provider
79
+ )
69
80
  simple_phonetic = simplephone(phonetic_str) or ""
70
81
  item = DictionaryItem(
71
82
  name=name,
@@ -125,7 +136,9 @@ class Dictionary:
125
136
  else r.item.phonetic,
126
137
  s2=sentence
127
138
  if r.item.language_code == language_code
128
- else phonetic(sentence, language_code),
139
+ else transcription(
140
+ sentence, language_code, ipa_provider=self.ipa_provider
141
+ ),
129
142
  ignore_prefix=True,
130
143
  )[0][1], # TODO: review
131
144
  reverse=True,
@@ -141,7 +154,14 @@ class Dictionary:
141
154
  """
142
155
  Lookup dictionary items by name_candidate and language_code using LookupMode and LookupField.
143
156
  """
144
- simple_phonetic = simplephone(phonetic(name_candidate, language_code)) or ""
157
+ simple_phonetic = (
158
+ simplephone(
159
+ transcription(
160
+ name_candidate, language_code, ipa_provider=self.ipa_provider
161
+ )
162
+ )
163
+ or ""
164
+ )
145
165
  logger.debug(
146
166
  f"Looking up '{name_candidate}' with simple phonetic '{simple_phonetic}' under mode {mode}, field {field}"
147
167
  )
@@ -170,7 +190,13 @@ class Dictionary:
170
190
  yield from filter(
171
191
  lambda item: levenshtein_match(
172
192
  s1=item.simple_phonetic,
173
- s2=simplephone(phonetic(name_candidate, language_code))
193
+ s2=simplephone(
194
+ transcription(
195
+ name_candidate,
196
+ language_code,
197
+ ipa_provider=self.ipa_provider,
198
+ )
199
+ )
174
200
  or "",
175
201
  threshold=0.8,
176
202
  proximity_graph=SIMPLEPHONE_PROXIMITY_GRAPH,
@@ -253,7 +279,12 @@ class Dictionary:
253
279
  case LookupMode.FUZZY:
254
280
  if field == LookupField.PHONETIC:
255
281
  simple_phonetic = (
256
- simplephone(phonetic(sentence, language_code)) or ""
282
+ simplephone(
283
+ transcription(
284
+ sentence, language_code, ipa_provider=self.ipa_provider
285
+ )
286
+ )
287
+ or ""
257
288
  )
258
289
  for item in self.storage.iterate():
259
290
  for span, _ in levenshtein_search_substring(
@@ -343,7 +374,11 @@ class Dictionary:
343
374
  span=span,
344
375
  text=sentence[span.slice],
345
376
  simple_phonetic=simplephone(
346
- phonetic(sentence[span.slice], language_code)
377
+ transcription(
378
+ sentence[span.slice],
379
+ language_code,
380
+ ipa_provider=self.ipa_provider,
381
+ )
347
382
  )
348
383
  or "",
349
384
  )
@@ -405,7 +440,9 @@ class Dictionary:
405
440
  key=lambda item: levenshtein_similarity(
406
441
  s1=name_candidate
407
442
  if item.language_code == language_code
408
- else phonetic(name_candidate, language_code),
443
+ else transcription(
444
+ name_candidate, language_code, ipa_provider=self.ipa_provider
445
+ ),
409
446
  s2=item.name if item.language_code == language_code else item.phonetic,
410
447
  ),
411
448
  reverse=True,
@@ -0,0 +1,26 @@
1
+ from .protocol import IpaProvider
2
+ from .espeak import EspeakIpaProvider
3
+ from .ipa2lat import ipa2lat
4
+ from functools import lru_cache
5
+
6
+
7
@lru_cache
def transcription(
    string: str,
    language_code: str,
    ipa_provider: IpaProvider = EspeakIpaProvider(),
) -> str:
    """
    Transcribe a string into simplified latin letters, word by word.

    The input is split on whitespace; each word is converted to IPA by
    ``ipa_provider`` and then reduced to latin letters with ``ipa2lat``.
    The per-word results are joined back with single spaces.

    Results are memoized with ``functools.lru_cache``, so every argument
    (including ``ipa_provider``) must be hashable. Note that ``lru_cache``
    may cache positional and keyword spellings of the same call separately.

    Args:
        string: Text to transcribe.
        language_code: Language code handed to the IPA provider.
        ipa_provider: Object exposing ``to_ipa(word, language_code)``
            (default: a shared ``EspeakIpaProvider`` created at import time).

    Returns:
        The space-separated simplified latin transcription.
    """
    latin_words = [
        ipa2lat(ipa_provider.to_ipa(token, language_code))
        for token in string.split()
    ]
    return " ".join(latin_words)
@@ -0,0 +1,191 @@
1
+ from typing import Any
2
+ import warnings
3
+
4
+
5
class EpitranIpaProvider:
    """
    IPA provider that transliterates text with Epitran.

    ``Epitran`` objects are created lazily, one per resolved language code, and
    kept in a per-instance cache. The ``epitran`` package is imported only when
    the first object is actually needed.
    """

    # Short codes resolved explicitly before the prefix fallback runs.
    # Without this entry "ru" would prefix-match "run-Latn" (Rundi), which
    # precedes "rus-Cyrl" in the table below.
    _ALIASES: dict[str, str] = {"ru": "rus-Cyrl"}

    # Code: Language (Script)
    # Built once at class-creation time instead of on every ``to_ipa`` call.
    _SUPPORTED_LANGUAGES: dict[str, str] = {
        "aar-Latn": "Afar",
        "afr-Latn": "Afrikanns",
        "aii-Syrc": "Assyrian Neo-Aramaic",
        "amh-Ethi": "Amharic",
        "amh-Ethi-pp": "Amharic (more phonetic)",
        "amh-Ethi-red": "Amharic (reduced)",
        "ara-Arab": "Literary Arabic",
        "ava-Cyrl": "Avaric",
        "aze-Cyrl": "Azerbaijani (Cyrillic)",
        "aze-Latn": "Azerbaijani (Latin)",
        "ben-Beng": "Bengali",
        "ben-Beng-red": "Bengali (reduced)",
        "ben-Beng-east": "East Bengali",
        "bho-Deva": "Bhojpuri",
        "bxk-Latn": "Bukusu",
        "cat-Latn": "Catalan",
        "ceb-Latn": "Cebuano",
        "ces-Latn": "Czech",
        "cjy-Latn": "Jin (Wiktionary)",
        "ckb-Arab": "Sorani",
        "cmn-Hans": "Mandarin (Simplified)*",
        "cmn-Hant": "Mandarin (Traditional)*",
        "cmn-Latn": "Mandarin (Pinyin)*",
        "csb-Latn": "Kashubian",
        "deu-Latn": "German",
        "deu-Latn-np": "German†",
        "deu-Latn-nar": "German (more phonetic)",
        "eng-Latn": "English‡",
        "epo-Latn": "Esperanto",
        "est-Latn": "Estonian",
        "fas-Arab": "Farsi (Perso-Arabic)",
        "fin-Latn": "Finnish",
        "fra-Latn": "French",
        "fra-Latn-np": "French†",
        "fra-Latn-p": "French (more phonetic)",
        "ful-Latn": "Fulah",
        "gan-Latn": "Gan (Wiktionary)",
        "glg-Latn": "Galician",
        "got-Goth": "Gothic",
        "got-Latn": "Gothic (Latin)",
        "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
        "hat-Latn-bab": "Haitian (Latin-Babel)",
        "hau-Latn": "Hausa",
        "hin-Deva": "Hindi",
        "hmn-Latn": "Hmong",
        "hrv-Latn": "Croatian",
        "hsn-Latn": "Xiang (Wiktionary)",
        "hun-Latn": "Hungarian",
        "ilo-Latn": "Ilocano",
        "ind-Latn": "Indonesian",
        "ita-Latn": "Italian",
        "jam-Latn": "Jamaican",
        "jav-Latn": "Javanese",
        "jpn-Hira": "Japanese (Hiragana)",
        "jpn-Hira-red": "red Japanese (Hiragana, reduced)",
        "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
        "jpn-Kana": "Japanese (Katakana)",
        "jpn-Kana-red": "red Japanese (Katakana, reduced)",
        "kat-Geor": "Georgian",
        "kaz-Cyrl": "Kazakh (Cyrillic)",
        "kaz-Cyrl-bab": "bab Kazakh (Cyrillic—Babel)",
        "kaz-Latn": "Kazakh (Latin)",
        "kbd-Cyrl": "Kabardian",
        "khm-Khmr": "Khmer",
        "kin-Latn": "Kinyarwanda",
        "kir-Arab": "Kyrgyz (Perso-Arabic)",
        "kir-Cyrl": "Kyrgyz (Cyrillic)",
        "kir-Latn": "Kyrgyz (Latin)",
        "kmr-Latn": "Kurmanji",
        "kmr-Latn-red": "Kurmanji (reduced)",
        "kor-Hang": "Korean",
        "lao-Laoo": "Lao",
        "lao-Laoo-prereform": "Lao (Before spelling reform)",
        "lav-Latn": "Latvian",
        "lez-Cyrl": "Lezgian",
        "lij-Latn": "Ligurian",
        "lit-Latn": "Lithuanian",
        "lsm-Latn": "Saamia",
        "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
        "lug-Latn": "Ganda / Luganda",
        "mal-Mlym": "Malayalam",
        "mar-Deva": "Marathi",
        "mlt-Latn": "Maltese",
        "mon-Cyrl-bab": "Mongolian (Cyrillic)",
        "mri-Latn": "Maori",
        "msa-Latn": "Malay",
        "mya-Mymr": "Burmese",
        "nan-Latn": "Hokkien (pe̍h-oē-jī)",
        "nan-Latn-tl": "Hokkien (Tâi-lô)",
        "nld-Latn": "Dutch",
        "nya-Latn": "Chichewa",
        "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
        "ood-Latn-sax": "Tohono O'odham (Saxton)",
        "ori-Orya": "Odia",
        "orm-Latn": "Oromo",
        "pan-Guru": "Punjabi (Eastern)",
        "pol-Latn": "Polish",
        "por-Latn": "Portuguese",
        "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
        "ron-Latn": "Romanian",
        "run-Latn": "Rundi",
        "rus-Cyrl": "Russian",
        "sag-Latn": "Sango",
        "sin-Sinh": "Sinhala",
        "slv-Latn": "Slovene / Slovenian",
        "sna-Latn": "Shona",
        "som-Latn": "Somali",
        "spa-Latn": "Spanish",
        "spa-Latn-eu": "Spanish (Iberian)",
        "sqi-Latn": "Albanian",
        "sro-Latn": "Sardinian (Campidanese)",
        "srp-Latn": "Serbian (Latin)",
        "srp-Cyrl": "Serbian (Cyrillic)",
        "swa-Latn": "Swahili",
        "swa-Latn-red": "Swahili (reduced)",
        "swe-Latn": "Swedish",
        "tam-Taml": "Tamil",
        "tam-Taml-red": "Tamil (reduced)",
        "tel-Telu": "Telugu",
        "tgk-Cyrl": "Tajik",
        "tgl-Latn": "Tagalog",
        "tgl-Latn-red": "Tagalog (reduced)",
        "tha-Thai": "Thai",
        "tir-Ethi": "Tigrinya",
        "tir-Ethi-pp": "Tigrinya (more phonemic)",
        "tir-Ethi-red": "Tigrinya (reduced)",
        "tok-Latn": "Toki Pona",
        "tpi-Latn": "Tok Pisin",
        "tuk-Cyrl": "Turkmen (Cyrillic)",
        "tuk-Latn": "Turkmen (Latin)",
        "tur-Latn": "Turkish (Latin)",
        "tur-Latn-bab": "Turkish (Latin—Babel)",
        "tur-Latn-red": "Turkish (reduced)",
        "ukr-Cyrl": "Ukrainian",
        "urd-Arab": "Urdu",
        "uig-Arab": "Uyghur (Perso-Arabic)",
        "uzb-Cyrl": "Uzbek (Cyrillic)",
        "uzb-Latn": "Uzbek (Latin)",
        "vie-Latn": "Vietnamese",
        "wuu-Latn": "Shanghainese Wu (Wiktionary)",
        "xho-Latn": "Xhosa",
        "yor-Latn": "Yoruba",
        "yue-Latn": "Cantonese (Jyutping)",
        "yue-Hant": "Cantonese (Character)",
        "zha-Latn": "Zhuang",
        "zul-Latn": "Zulu",
    }

    def __init__(self) -> None:
        # Resolved language code -> Epitran object.
        self._cache: dict[str, Any] = {}

    def _epitran_obj(self, language_code: str) -> Any:
        """Return the cached ``Epitran`` object for ``language_code``, creating it on first use."""
        if language_code not in self._cache:
            # Imported lazily so the module can be loaded without epitran installed.
            from epitran import Epitran

            self._cache[language_code] = Epitran(language_code)
        return self._cache[language_code]

    def to_ipa(self, string: str, language_code: str) -> str:
        """
        Transliterate ``string`` to IPA.

        The language code is resolved in three steps: explicit alias, exact
        match in the supported table, then the first table key that starts
        with the given code (a warning is emitted in that case).

        NOTE(review): the prefix fallback takes the first matching key in table
        order, so short codes can resolve to an unintended language
        (e.g. "es" matches "est-Latn"); pass full Epitran codes to be safe.

        Args:
            string: Text to transliterate.
            language_code: Epitran code (e.g. "rus-Cyrl"), an alias, or a prefix of a code.

        Returns:
            The transliteration, or "" when ``string`` is empty or whitespace-only.

        Raises:
            ValueError: If the code is neither supported nor a prefix of a supported code.
        """
        language_code = self._ALIASES.get(language_code, language_code)
        supported_languages = self._SUPPORTED_LANGUAGES

        if language_code not in supported_languages:
            for key in supported_languages:
                if key.startswith(language_code):
                    warnings.warn(
                        f"Unsupported language code: {language_code}; trying to use similar key {key}",
                        stacklevel=2,  # attribute the warning to the caller
                    )
                    language_code = key
                    break
            else:
                raise ValueError(
                    f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
                )

        # Checked after validation so a bad code is reported even for empty input.
        if not string.strip():
            return ""

        return self._epitran_obj(language_code).transliterate(string)
@@ -126,38 +126,20 @@ espeak: EspeakNG | None = None
126
126
  _espeak_lock = threading.Lock()
127
127
 
128
128
 
129
- def text_to_ipa(text: str, lang: str, check_chars: bool = True) -> str:
130
- with _espeak_lock:
131
- global espeak
132
- if espeak is None:
133
- espeak = EspeakNG(lang)
134
- espeak.set_lang(lang)
135
- ipa = espeak.text_to_ipa(text, remove_stress=True)
136
- if check_chars:
137
- for char in {"(", ")", "[", "]"}:
138
- assert char not in ipa, (
139
- f"Unexpected character '{char}' in IPA '{ipa}' with lang '{lang}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
140
- )
141
- return ipa
142
-
143
-
144
- if __name__ == "__main__":
145
- data = [
146
- ("en", "Hello World"),
147
- ("uk", "Привіт світ"),
148
- ("fr", "Bonjour le monde"),
149
- ("ru", "Привет мир"),
150
- ("en", "Hello World"),
151
- ("en", "Hello World"),
152
- ("ru", "Привет мир"),
153
- ("ru", "Привет мир"),
154
- ("uk", "Привіт світ"),
155
- ("uk", "Привіт світ"),
156
- ("uk", "Привіт світ"),
157
- ("en", "Hello, World"),
158
- ("uk", "Привіт світ"),
159
- ("en", "Hello! World"),
160
- ("uk", "Привіт, світ"),
161
- ]
162
- for lang, text in data:
163
- print(f"{lang.upper()}: '{text}' -> '{text_to_ipa(text, lang)}'")
129
class EspeakIpaProvider:
    """
    IPA provider backed by the module-level shared ``espeak`` instance.

    All access to the shared instance happens while holding ``_espeak_lock``.
    The instance is created on first use and switched to the requested
    language before every conversion.
    """

    # Characters that must not appear in the IPA output when check_chars is on.
    # A tuple keeps the order (and therefore the reported character) deterministic.
    _UNEXPECTED_CHARS = ("(", ")", "[", "]")

    def __init__(self, check_chars: bool = True):
        # When True, to_ipa rejects output containing brackets/parentheses.
        self.check_chars = check_chars

    def to_ipa(self, string: str, language_code: str) -> str:
        """
        Convert ``string`` to IPA (stress marks removed) for ``language_code``.

        Args:
            string: Text to convert.
            language_code: Language passed to the eSpeak NG wrapper.

        Returns:
            The IPA string produced by ``espeak.text_to_ipa``.

        Raises:
            AssertionError: If ``check_chars`` is enabled and the output contains
                one of ``( ) [ ]``.
        """
        global espeak
        with _espeak_lock:
            if espeak is None:
                espeak = EspeakNG(language_code)
            espeak.set_lang(language_code)
            ipa = espeak.text_to_ipa(string, remove_stress=True)
            if self.check_chars:
                for char in self._UNEXPECTED_CHARS:
                    # Raised explicitly instead of via `assert`, which is stripped
                    # under `python -O`; the exception type is unchanged.
                    if char in ipa:
                        raise AssertionError(
                            f"Unexpected character '{char}' in IPA '{ipa}' with lang '{language_code}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
                        )
            return ipa
@@ -0,0 +1,151 @@
1
+ import warnings
2
+
3
+ _mapping = {
4
+ # Vowels
5
+ "i": "i",
6
+ "y": "i",
7
+ "ɨ": "i",
8
+ "ʉ": "u",
9
+ "ɯ": "u",
10
+ "u": "u",
11
+ "ɪ": "i",
12
+ "ʏ": "i",
13
+ "ʊ": "u",
14
+ "e": "e",
15
+ "ø": "e",
16
+ "ɘ": "e",
17
+ "ɵ": "o",
18
+ "ɤ": "o",
19
+ "o": "o",
20
+ "ə": "e",
21
+ "ɛ": "e",
22
+ "œ": "e",
23
+ "ɜ": "e",
24
+ "ɞ": "e",
25
+ "ʌ": "a",
26
+ "ɔ": "o",
27
+ "æ": "a",
28
+ "ɐ": "a",
29
+ "a": "a",
30
+ "ɶ": "a",
31
+ "ä": "a",
32
+ "ɑ": "a",
33
+ "ɒ": "o",
34
+ # Pulmonic Consonants
35
+ "p": "p",
36
+ "b": "b",
37
+ "t": "t",
38
+ "d": "d",
39
+ "ʈ": "t",
40
+ "ɖ": "d",
41
+ "c": "k",
42
+ "ɟ": "j",
43
+ "k": "k",
44
+ "g": "g",
45
+ "q": "k",
46
+ "ɢ": "g",
47
+ "ɡ": "g",
48
+ "m": "m",
49
+ "ɱ": "m",
50
+ "n": "n",
51
+ "ɳ": "n",
52
+ "ɲ": "nj",
53
+ "ŋ": "ng",
54
+ "ʋ": "v",
55
+ "ɹ": "r",
56
+ "ɻ": "r",
57
+ "j": "j",
58
+ "ɰ": "w",
59
+ "ʙ": "b",
60
+ "r": "r",
61
+ "ʀ": "r",
62
+ "ɾ": "r",
63
+ "ɸ": "f",
64
+ "β": "v",
65
+ "f": "f",
66
+ "v": "v",
67
+ "θ": "th",
68
+ "ð": "dh",
69
+ "s": "s",
70
+ "z": "z",
71
+ "ʃ": "sh",
72
+ "ʒ": "zh",
73
+ "ʂ": "sh",
74
+ "ʐ": "zh",
75
+ "ç": "h",
76
+ "ʝ": "j",
77
+ "x": "h",
78
+ "ʑ": "z",
79
+ "ɣ": "gh",
80
+ "χ": "h",
81
+ "ʁ": "gh",
82
+ "ħ": "h",
83
+ "ʕ": "a",
84
+ "h": "h",
85
+ # Clicks
86
+ "ʘ": "o",
87
+ "ǀ": "l",
88
+ "ǃ": "!",
89
+ "ǂ": "!",
90
+ "ǁ": "l",
91
+ # Implosives and Ejectives
92
+ "ɓ": "b",
93
+ "ɗ": "d",
94
+ "ʄ": "j",
95
+ "ɠ": "g",
96
+ "ʛ": "g",
97
+ # Suprasegmentals
98
+ "ˈ": "",
99
+ "ˌ": "",
100
+ "ː": "",
101
+ "ˑ": "",
102
+ "|": "",
103
+ "‖": "",
104
+ ".": "",
105
+ "ʼ": "",
106
+ # Tones and word accents
107
+ "̋": "",
108
+ "́": "",
109
+ "̄": "",
110
+ "̀": "",
111
+ "̏": "",
112
+ "̌": "",
113
+ "̂": "",
114
+ "᷄": "",
115
+ "᷅": "",
116
+ "᷈": "",
117
+ "᷉": "",
118
+ # Other symbols and diacritics
119
+ "ʲ": "",
120
+ "ʷ": "w",
121
+ "ʱ": "h",
122
+ "ʰ": "h",
123
+ "ʴ": "r",
124
+ "ʳ": "r",
125
+ "ˠ": "g",
126
+ "ʡ": "a",
127
+ "ʢ": "a",
128
+ "ɭ": "l",
129
+ "_": "",
130
+ '"': "",
131
+ " ": "",
132
+ }
133
+
134
+
135
def ipa2lat(ipa_string: str) -> str:
    """
    Reduce an IPA string to a simplified Latin spelling.

    Every key of ``_mapping`` found in the input is replaced by its mapped
    value, in the mapping's iteration order. A warning is emitted for each
    character of the result that is not a lowercase ASCII letter.

    Args:
        ipa_string: IPA text to simplify.

    Returns:
        The simplified string, or "" for empty input.
    """
    if not ipa_string:
        return ""

    latin = ipa_string
    for ipa_symbol, replacement in _mapping.items():
        latin = latin.replace(ipa_symbol, replacement)

    lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
    for char in latin:
        if char in lowercase_letters:
            continue
        # Unmapped leftovers are reported but kept in the output.
        warnings.warn(
            f'ipa2lat: Unknown symbol: "{char}" in {latin} ({ipa_string})'
        )

    return latin
@@ -0,0 +1,5 @@
1
+ from typing import Protocol
2
+
3
+
4
class IpaProvider(Protocol):
    """Structural interface for objects that can convert text to IPA."""

    def to_ipa(self, string: str, language_code: str) -> str:
        """Return the IPA transcription of ``string`` for ``language_code``."""
        ...
@@ -0,0 +1,139 @@
1
+ import asyncio
2
+ from typing import Awaitable, Callable
3
+ from stark.core.patterns.parsing import ParseError
4
+ from stark.tools.common.span import Span
5
+
6
+
7
def _token_span_to_char_span(tokens: list[str], span: Span, phrase: str) -> Span:
    """
    Translate a span of token indices into a span of character offsets in ``phrase``.

    Token offsets are recovered by walking ``phrase`` once: whitespace before
    each token is skipped and the token's length is then consumed.
    # assumes tokens are the whitespace-separated pieces of phrase, in order — TODO confirm

    Returns ``Span(0, 0)`` when ``tokens`` is empty or ``span`` does not fit
    inside the token list. An empty token span yields an empty char span
    located at the start of token ``span.start``.
    """
    token_count = len(tokens)
    if not tokens or not (0 <= span.start <= span.end <= token_count):
        return Span(0, 0)

    # (start, end) character offsets of every token, in order.
    bounds: list[tuple[int, int]] = []
    cursor = 0
    phrase_len = len(phrase)
    for token in tokens:
        # Skip the whitespace separating this token from the previous one.
        while cursor < phrase_len and phrase[cursor].isspace():
            cursor += 1
        token_start = cursor
        cursor += len(token)
        bounds.append((token_start, cursor))

    # span.start == len(tokens) passes the first guard but has no token to anchor on.
    if not bounds or span.start >= len(bounds) or span.end > len(bounds):
        return Span(0, 0)

    first_char = bounds[span.start][0]
    if span.end > span.start:
        last_char = bounds[span.end - 1][1]
    else:
        last_char = bounds[span.start][0]
    return Span(first_char, last_char)
31
+
32
+
33
+ async def _binary_cookie_trim[T](
34
+ tokens: list[str],
35
+ start: int,
36
+ end: int,
37
+ parser: Callable[[str], Awaitable[T]],
38
+ baseline_value: T,
39
+ phrase: str,
40
+ ) -> tuple[Span, str, T]:
41
+ """
42
+ Return minimal (char Span, substring, value) such that
43
+ parser(' '.join(tokens[span.start:span.end])) == baseline_value.
44
+ """
45
+ # Binary search for the leftmost index such that tokens[left:end] still parses to baseline_value.
46
+ left = start
47
+ l_low, l_high = start, end - 1
48
+ while l_low <= l_high:
49
+ mid = (l_low + l_high) // 2
50
+ try:
51
+ r = await parser(" ".join(tokens[mid:end]))
52
+ except ParseError:
53
+ r = None
54
+ if r == baseline_value:
55
+ left = mid
56
+ l_low = mid + 1
57
+ else:
58
+ l_high = mid - 1
59
+
60
+ # Binary search for the rightmost index such that tokens[left:right] still parses to baseline_value.
61
+ right = end
62
+ r_low, r_high = left + 1, end
63
+ while r_low <= r_high:
64
+ mid = (r_low + r_high) // 2
65
+ try:
66
+ res = await parser(" ".join(tokens[left:mid]))
67
+ except ParseError:
68
+ res = None
69
+ if res == baseline_value:
70
+ right = mid
71
+ r_high = mid - 1
72
+ else:
73
+ r_low = mid + 1
74
+ token_span = Span(left, right)
75
+ char_span = _token_span_to_char_span(tokens, token_span, phrase)
76
+ substr = phrase[char_span.start : char_span.end]
77
+ return char_span, substr, baseline_value
78
+
79
+
80
+ async def sliding_window_parse[T](
81
+ phrase: str,
82
+ parser: Callable[[str], Awaitable[T]],
83
+ min_window: int = 1,
84
+ max_window: int | None = None,
85
+ concurrency: int | None = None,
86
+ find_one: bool = True,
87
+ ) -> list[tuple[Span, str, T]]:
88
+ tokens: list[str] = phrase.split()
89
+ n: int = len(tokens)
90
+ if n == 0 or parser is None:
91
+ return None
92
+ if max_window is None:
93
+ max_window = n
94
+
95
+ if concurrency is not None and concurrency > 0:
96
+ # Use a semaphore to limit concurrency of parser calls.
97
+ sem = asyncio.Semaphore(concurrency)
98
+
99
+ async def try_window(i: int, j: int) -> T:
100
+ async with sem:
101
+ try:
102
+ return await parser(" ".join(tokens[i:j]))
103
+ except ParseError:
104
+ return None
105
+ else:
106
+
107
+ async def try_window(i: int, j: int) -> T:
108
+ try:
109
+ return await parser(" ".join(tokens[i:j]))
110
+ except ParseError:
111
+ return None
112
+
113
+ # Slide a window of decreasing size over the tokens, left to right.
114
+ # Try parsing for each window. Once successful, trim to minimal window.
115
+ results: list[tuple[Span, str, T]] = []
116
+ for window_size in range(min(max_window, n), min_window - 1, -1):
117
+ for start in range(0, n - window_size + 1):
118
+ end = start + window_size
119
+ try:
120
+ res = await try_window(start, end)
121
+ except ParseError:
122
+ res = None
123
+ if res is None:
124
+ continue
125
+ char_span, substr, value = await _binary_cookie_trim(
126
+ tokens, start, end, parser, res, phrase
127
+ )
128
+ result = (char_span, substr, value)
129
+ if find_one:
130
+ return [result]
131
+ else:
132
+ results.append(result)
133
+ # TODO: limit next windows left edge to char_span.end
134
+
135
+ if results:
136
+ return results
137
+
138
+ # If no valid window is found, raise an error.
139
+ raise ParseError(f"No valid window found using parser={parser} in phrase={phrase}")
@@ -1,399 +0,0 @@
1
- from functools import lru_cache
2
- import warnings
3
-
4
-
5
- from stark.tools.phonetic import espeak_ng
6
-
7
-
8
- @lru_cache
9
- def phonetic(string: str, language_code: str):
10
- """
11
- Converts a string to simplified latin transcription via phonetic (ipa) transliteration.
12
- """
13
- return " ".join(
14
- _ipa2lat(_to_ipa(word, language_code)) for word in string.split()
15
- ) # TODO: try calling _to_ipa for the entire sentence
16
-
17
-
18
- def _to_ipa(string: str, language_code: str) -> str:
19
- return _to_ipa__espeak_bin(string, language_code)
20
-
21
-
22
- def _ipa2lat(ipa_string: str) -> str:
23
- """Converts IPA to a simplified latin transcription."""
24
- return _ipa2lat__dict(ipa_string)
25
-
26
-
27
- # ----- Implementations: -----
28
-
29
- _mapping = {
30
- # Vowels
31
- "i": "i",
32
- "y": "i",
33
- "ɨ": "i",
34
- "ʉ": "u",
35
- "ɯ": "u",
36
- "u": "u",
37
- "ɪ": "i",
38
- "ʏ": "i",
39
- "ʊ": "u",
40
- "e": "e",
41
- "ø": "e",
42
- "ɘ": "e",
43
- "ɵ": "o",
44
- "ɤ": "o",
45
- "o": "o",
46
- "ə": "e",
47
- "ɛ": "e",
48
- "œ": "e",
49
- "ɜ": "e",
50
- "ɞ": "e",
51
- "ʌ": "a",
52
- "ɔ": "o",
53
- "æ": "a",
54
- "ɐ": "a",
55
- "a": "a",
56
- "ɶ": "a",
57
- "ä": "a",
58
- "ɑ": "a",
59
- "ɒ": "o",
60
- # Pulmonic Consonants
61
- "p": "p",
62
- "b": "b",
63
- "t": "t",
64
- "d": "d",
65
- "ʈ": "t",
66
- "ɖ": "d",
67
- "c": "k",
68
- "ɟ": "j",
69
- "k": "k",
70
- "g": "g",
71
- "q": "k",
72
- "ɢ": "g",
73
- "ɡ": "g",
74
- "m": "m",
75
- "ɱ": "m",
76
- "n": "n",
77
- "ɳ": "n",
78
- "ɲ": "nj",
79
- "ŋ": "ng",
80
- "ʋ": "v",
81
- "ɹ": "r",
82
- "ɻ": "r",
83
- "j": "j",
84
- "ɰ": "w",
85
- "ʙ": "b",
86
- "r": "r",
87
- "ʀ": "r",
88
- "ɾ": "r",
89
- "ɸ": "f",
90
- "β": "v",
91
- "f": "f",
92
- "v": "v",
93
- "θ": "th",
94
- "ð": "dh",
95
- "s": "s",
96
- "z": "z",
97
- "ʃ": "sh",
98
- "ʒ": "zh",
99
- "ʂ": "sh",
100
- "ʐ": "zh",
101
- "ç": "h",
102
- "ʝ": "j",
103
- "x": "h",
104
- "ʑ": "z",
105
- "ɣ": "gh",
106
- "χ": "h",
107
- "ʁ": "gh",
108
- "ħ": "h",
109
- "ʕ": "a",
110
- "h": "h",
111
- # Clicks
112
- "ʘ": "o",
113
- "ǀ": "l",
114
- "ǃ": "!",
115
- "ǂ": "!",
116
- "ǁ": "l",
117
- # Implosives and Ejectives
118
- "ɓ": "b",
119
- "ɗ": "d",
120
- "ʄ": "j",
121
- "ɠ": "g",
122
- "ʛ": "g",
123
- # Suprasegmentals
124
- "ˈ": "",
125
- "ˌ": "",
126
- "ː": "",
127
- "ˑ": "",
128
- "|": "",
129
- "‖": "",
130
- ".": "",
131
- "ʼ": "",
132
- # Tones and word accents
133
- "̋": "",
134
- "́": "",
135
- "̄": "",
136
- "̀": "",
137
- "̏": "",
138
- "̌": "",
139
- "̂": "",
140
- "᷄": "",
141
- "᷅": "",
142
- "᷈": "",
143
- "᷉": "",
144
- # Other symbols and diacritics
145
- "ʲ": "",
146
- "ʷ": "w",
147
- "ʱ": "h",
148
- "ʰ": "h",
149
- "ʴ": "r",
150
- "ʳ": "r",
151
- "ˠ": "g",
152
- "ʡ": "a",
153
- "ʢ": "a",
154
- "ɭ": "l",
155
- "_": "",
156
- '"': "",
157
- " ": "",
158
- }
159
-
160
-
161
- def _ipa2lat__dict(ipa_string: str) -> str:
162
- if not ipa_string:
163
- return ""
164
-
165
- string = ipa_string[:]
166
- for ipa, simple in _mapping.items():
167
- string = string.replace(ipa, simple)
168
-
169
- for symbol in string:
170
- if symbol not in "abcdefghijklmnopqrstuvwxyz":
171
- warnings.warn(
172
- f'SuggestionsManager._ipa_to_latin: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
173
- )
174
-
175
- return string
176
-
177
-
178
- def _to_ipa__espeak_bin(string: str, language_code: str) -> str:
179
- return espeak_ng.text_to_ipa(string, language_code)
180
-
181
-
182
- # def to_ipa__espeak_cli(string: str, language_code: str) -> str:
183
- # import re
184
- # import subprocess
185
-
186
- # result = subprocess.run(
187
- # ["espeak-ng", "--ipa", f"-v{language_code}", "-q", string],
188
- # capture_output=True,
189
- # text=True,
190
- # )
191
- # return re.compile(r"\(.*?\)").sub("", result.stdout.strip()).strip()
192
-
193
-
194
- # @lru_cache
195
- # def _epitran_obj(language_code: str) -> Epitran:
196
- # from epitran import Epitran
197
-
198
- # return Epitran(language_code) # this one is long, about an entire second
199
-
200
-
201
- # def to_ipa__epitran(string: str, language_code: str) -> str:
202
- # # if language_code.startswith("en"):
203
- # # raise NotImplementedError(
204
- # # "IPA to Epitran conversion for English is not implemented yet."
205
- # # )
206
-
207
- # if language_code == "ru":
208
- # language_code = "rus-Cyrl"
209
-
210
- # # Code: Language (Script)
211
- # supported_languages = {
212
- # "aar-Latn": "Afar",
213
- # "afr-Latn": "Afrikanns",
214
- # "aii-Syrc": "Assyrian Neo-Aramaic",
215
- # "amh-Ethi": "Amharic",
216
- # "amh-Ethi-pp": "Amharic (more phonetic)",
217
- # "amh-Ethi-red": "Amharic (reduced)",
218
- # "ara-Arab": "Literary Arabic",
219
- # "ava-Cyrl": "Avaric",
220
- # "aze-Cyrl": "Azerbaijani (Cyrillic)",
221
- # "aze-Latn": "Azerbaijani (Latin)",
222
- # "ben-Beng": "Bengali",
223
- # "ben-Beng-red": "Bengali (reduced)",
224
- # "ben-Beng-east": "East Bengali",
225
- # "bho-Deva": "Bhojpuri",
226
- # "bxk-Latn": "Bukusu",
227
- # "cat-Latn": "Catalan",
228
- # "ceb-Latn": "Cebuano",
229
- # "ces-Latn": "Czech",
230
- # "cjy-Latn": "Jin (Wiktionary)",
231
- # "ckb-Arab": "Sorani",
232
- # "cmn-Hans": "Mandarin (Simplified)*",
233
- # "cmn-Hant": "Mandarin (Traditional)*",
234
- # "cmn-Latn": "Mandarin (Pinyin)*",
235
- # "csb-Latn": "Kashubian",
236
- # "deu-Latn": "German",
237
- # "deu-Latn-np": "German†",
238
- # "deu-Latn-nar": "German (more phonetic)",
239
- # "eng-Latn": "English‡",
240
- # "epo-Latn": "Esperanto",
241
- # "est-Latn": "Estonian",
242
- # "fas-Arab": "Farsi (Perso-Arabic)",
243
- # "fin-Latn": "Finnish",
244
- # "fra-Latn": "French",
245
- # "fra-Latn-np": "French†",
246
- # "fra-Latn-p": "French (more phonetic)",
247
- # "ful-Latn": "Fulah",
248
- # "gan-Latn": "Gan (Wiktionary)",
249
- # "glg-Latn": "Galician",
250
- # "got-Goth": "Gothic",
251
- # "got-Latn": "Gothic (Latin)",
252
- # "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
253
- # "hat-Latn-bab": "Haitian (Latin-Babel)",
254
- # "hau-Latn": "Hausa",
255
- # "hin-Deva": "Hindi",
256
- # "hmn-Latn": "Hmong",
257
- # "hrv-Latn": "Croatian",
258
- # "hsn-Latn": "Xiang (Wiktionary)",
259
- # "hun-Latn": "Hungarian",
260
- # "ilo-Latn": "Ilocano",
261
- # "ind-Latn": "Indonesian",
262
- # "ita-Latn": "Italian",
263
- # "jam-Latn": "Jamaican",
264
- # "jav-Latn": "Javanese",
265
- # "jpn-Hira": "Japanese (Hiragana)",
266
- # "jpn-Hira-red": "red Japanese (Hiragana, reduced)",
267
- # "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
268
- # "jpn-Kana": "Japanese (Katakana)",
269
- # "jpn-Kana-red": "red Japanese (Katakana, reduced)",
270
- # "kat-Geor": "Georgian",
271
- # "kaz-Cyrl": "Kazakh (Cyrillic)",
272
- # "kaz-Cyrl-bab": "bab Kazakh (Cyrillic—Babel)",
273
- # "kaz-Latn": "Kazakh (Latin)",
274
- # "kbd-Cyrl": "Kabardian",
275
- # "khm-Khmr": "Khmer",
276
- # "kin-Latn": "Kinyarwanda",
277
- # "kir-Arab": "Kyrgyz (Perso-Arabic)",
278
- # "kir-Cyrl": "Kyrgyz (Cyrillic)",
279
- # "kir-Latn": "Kyrgyz (Latin)",
280
- # "kmr-Latn": "Kurmanji",
281
- # "kmr-Latn-red": "Kurmanji (reduced)",
282
- # "kor-Hang": "Korean",
283
- # "lao-Laoo": "Lao",
284
- # "lao-Laoo-prereform": "Lao (Before spelling reform)",
285
- # "lav-Latn": "Latvian",
286
- # "lez-Cyrl": "Lezgian",
287
- # "lij-Latn": "Ligurian",
288
- # "lit-Latn": "Lithuanian",
289
- # "lsm-Latn": "Saamia",
290
- # "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
291
- # "lug-Latn": "Ganda / Luganda",
292
- # "mal-Mlym": "Malayalam",
293
- # "mar-Deva": "Marathi",
294
- # "mlt-Latn": "Maltese",
295
- # "mon-Cyrl-bab": "Mongolian (Cyrillic)",
296
- # "mri-Latn": "Maori",
297
- # "msa-Latn": "Malay",
298
- # "mya-Mymr": "Burmese",
299
- # "nan-Latn": "Hokkien (pe̍h-oē-jī)",
300
- # "nan-Latn-tl": "Hokkien (Tâi-lô)",
301
- # "nld-Latn": "Dutch",
302
- # "nya-Latn": "Chichewa",
303
- # "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
304
- # "ood-Latn-sax": "Tohono O'odham (Saxton)",
305
- # "ori-Orya": "Odia",
306
- # "orm-Latn": "Oromo",
307
- # "pan-Guru": "Punjabi (Eastern)",
308
- # "pol-Latn": "Polish",
309
- # "por-Latn": "Portuguese",
310
- # "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
311
- # "ron-Latn": "Romanian",
312
- # "run-Latn": "Rundi",
313
- # "rus-Cyrl": "Russian",
314
- # "sag-Latn": "Sango",
315
- # "sin-Sinh": "Sinhala",
316
- # "slv-Latn": "Slovene / Slovenian",
317
- # "sna-Latn": "Shona",
318
- # "som-Latn": "Somali",
319
- # "spa-Latn": "Spanish",
320
- # "spa-Latn-eu": "Spanish (Iberian)",
321
- # "sqi-Latn": "Albanian",
322
- # "sro-Latn": "Sardinian (Campidanese)",
323
- # "srp-Latn": "Serbian (Latin)",
324
- # "srp-Cyrl": "Serbian (Cyrillic)",
325
- # "swa-Latn": "Swahili",
326
- # "swa-Latn-red": "Swahili (reduced)",
327
- # "swe-Latn": "Swedish",
328
- # "tam-Taml": "Tamil",
329
- # "tam-Taml-red": "Tamil (reduced)",
330
- # "tel-Telu": "Telugu",
331
- # "tgk-Cyrl": "Tajik",
332
- # "tgl-Latn": "Tagalog",
333
- # "tgl-Latn-red": "Tagalog (reduced)",
334
- # "tha-Thai": "Thai",
335
- # "tir-Ethi": "Tigrinya",
336
- # "tir-Ethi-pp": "Tigrinya (more phonemic)",
337
- # "tir-Ethi-red": "Tigrinya (reduced)",
338
- # "tok-Latn": "Toki Pona",
339
- # "tpi-Latn": "Tok Pisin",
340
- # "tuk-Cyrl": "Turkmen (Cyrillic)",
341
- # "tuk-Latn": "Turkmen (Latin)",
342
- # "tur-Latn": "Turkish (Latin)",
343
- # "tur-Latn-bab": "Turkish (Latin—Babel)",
344
- # "tur-Latn-red": "Turkish (reduced)",
345
- # "ukr-Cyrl": "Ukrainian",
346
- # "urd-Arab": "Urdu",
347
- # "uig-Arab": "Uyghur (Perso-Arabic)",
348
- # "uzb-Cyrl": "Uzbek (Cyrillic)",
349
- # "uzb-Latn": "Uzbek (Latin)",
350
- # "vie-Latn": "Vietnamese",
351
- # "wuu-Latn": "Shanghainese Wu (Wiktionary)",
352
- # "xho-Latn": "Xhosa",
353
- # "yor-Latn": "Yoruba",
354
- # "yue-Latn": "Cantonese (Jyutping)",
355
- # "yue-Hant": "Cantonese (Character)",
356
- # "zha-Latn": "Zhuang",
357
- # "zul-Latn": "Zulu",
358
- # }
359
-
360
- # if language_code not in supported_languages:
361
- # for key in supported_languages:
362
- # if key.startswith(language_code):
363
- # warnings.warn(
364
- # f"Unsupported language code: {language_code}; trying to use similar key {key}"
365
- # )
366
- # language_code = key
367
- # break
368
- # else:
369
- # raise ValueError(
370
- # f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
371
- # )
372
-
373
- # if not string.strip():
374
- # return ""
375
-
376
- # return _epitran_obj(language_code).transliterate(string)
377
-
378
- if __name__ == "__main__":
379
- pass
380
- # print("Starting...")
381
- # print(to_ipa__epitran("Hello", "eng-Latn"))
382
- # print("Two more...")
383
- # print(to_ipa__epitran("Hello", "eng-Latn"))
384
- # print(to_ipa__epitran("Hello", "eng-Latn"))
385
- # print("Київ", to_ipa("Київ", "ua"))
386
- # print("Київ", to_ipa("Київ", "uk"))
387
- # test_cases = [
388
- # 'Привет Иван как у тебя делая',
389
- # 'любимые занятия надо делать часто',
390
- # 'Съешь ещё этих мягких французских булок да выпей чаю',
391
- # 'Хай',
392
- # 'хай',
393
- # 'Хэллоу',
394
- # 'хэллоу',
395
- # 'друг с другом',
396
- # 'с пути фай',
397
- # ]
398
- # for test_case in test_cases:
399
- # print((ipa := to_ipa__epitran(test_case, 'rus-Cyrl')), to_ipa__espeak(test_case, 'ru'), ipa2lat__ipapy(ipa), ipa2lat__dict(ipa), sep=' || ')
File without changes
File without changes