PyPI - stark-engine - Versions diffs - 4.2.0__tar.gz → 4.2.2__tar.gz - Mend

stark-engine 4.2.0 (tar.gz) → 4.2.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{stark_engine-4.2.0 → stark_engine-4.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: stark-engine
-Version: 4.2.0
+Version: 4.2.2
 Summary: S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants.
 License: CC BY-NC-SA 4.0
 License-File: LICENSE.md

{stark_engine-4.2.0 → stark_engine-4.2.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "stark-engine"
-version = "4.2.0"
+version = "4.2.2"
 description = "S.T.A.R.K - Speech and Text Algorithmic Recognition Kit. Modern framework for creating powerfull voice assistants."
 authors = ["MarkParker5 <mark@parker-programs.com>"]
 license = "CC BY-NC-SA 4.0"

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/dictionary.py RENAMED Viewed

@@ -14,7 +14,11 @@ from stark.tools.levenshtein import (
     levenshtein_similarity,
     levenshtein_similarity_substring,
 )
-from stark.tools.phonetic.ipa import phonetic
+from stark.tools.phonetic.transcription import (
+    transcription,
+    IpaProvider,
+    EspeakIpaProvider,
+)
 from stark.tools.phonetic.simplephone import simplephone
 from stark.tools.strtools import find_substring_in_words_map, split_indices
@@ -52,8 +56,13 @@ class Dictionary:
     Phonetic-aware dictionary with metadata storage.
     """
-    def __init__(self, storage: DictionaryStorageProtocol):
+    def __init__(
+        self,
+        storage: DictionaryStorageProtocol,
+        ipa_provider: IpaProvider = EspeakIpaProvider(),
+    ):
         self.storage: DictionaryStorageProtocol = storage
+        self.ipa_provider: IpaProvider = ipa_provider
     # ----------------------
     # Write methods
@@ -65,7 +74,9 @@ class Dictionary:
         Add a single entry to the dictionary.
         Phonetic conversion happens internally (mandatory).
         """
-        phonetic_str = phonetic(name, language_code=language_code)
+        phonetic_str = transcription(
+            name, language_code=language_code, ipa_provider=self.ipa_provider
+        )
         simple_phonetic = simplephone(phonetic_str) or ""
         item = DictionaryItem(
             name=name,
@@ -125,7 +136,9 @@ class Dictionary:
                 else r.item.phonetic,
                 s2=sentence
                 if r.item.language_code == language_code
-                else phonetic(sentence, language_code),
+                else transcription(
+                    sentence, language_code, ipa_provider=self.ipa_provider
+                ),
                 ignore_prefix=True,
             )[0][1],  # TODO: review
             reverse=True,
@@ -141,7 +154,14 @@ class Dictionary:
         """
         Lookup dictionary items by name_candidate and language_code using LookupMode and LookupField.
         """
-        simple_phonetic = simplephone(phonetic(name_candidate, language_code)) or ""
+        simple_phonetic = (
+            simplephone(
+                transcription(
+                    name_candidate, language_code, ipa_provider=self.ipa_provider
+                )
+            )
+            or ""
+        )
         logger.debug(
             f"Looking up '{name_candidate}' with simple phonetic '{simple_phonetic}' under mode {mode}, field {field}"
         )
@@ -170,7 +190,13 @@ class Dictionary:
                     yield from filter(
                         lambda item: levenshtein_match(
                             s1=item.simple_phonetic,
-                            s2=simplephone(phonetic(name_candidate, language_code))
+                            s2=simplephone(
+                                transcription(
+                                    name_candidate,
+                                    language_code,
+                                    ipa_provider=self.ipa_provider,
+                                )
+                            )
                             or "",
                             threshold=0.8,
                             proximity_graph=SIMPLEPHONE_PROXIMITY_GRAPH,
@@ -253,7 +279,12 @@ class Dictionary:
             case LookupMode.FUZZY:
                 if field == LookupField.PHONETIC:
                     simple_phonetic = (
-                        simplephone(phonetic(sentence, language_code)) or ""
+                        simplephone(
+                            transcription(
+                                sentence, language_code, ipa_provider=self.ipa_provider
+                            )
+                        )
+                        or ""
                     )
                     for item in self.storage.iterate():
                         for span, _ in levenshtein_search_substring(
@@ -343,7 +374,11 @@ class Dictionary:
                 span=span,
                 text=sentence[span.slice],
                 simple_phonetic=simplephone(
-                    phonetic(sentence[span.slice], language_code)
+                    transcription(
+                        sentence[span.slice],
+                        language_code,
+                        ipa_provider=self.ipa_provider,
+                    )
                 )
                 or "",
             )
@@ -405,7 +440,9 @@ class Dictionary:
             key=lambda item: levenshtein_similarity(
                 s1=name_candidate
                 if item.language_code == language_code
-                else phonetic(name_candidate, language_code),
+                else transcription(
+                    name_candidate, language_code, ipa_provider=self.ipa_provider
+                ),
                 s2=item.name if item.language_code == language_code else item.phonetic,
             ),
             reverse=True,

stark_engine-4.2.2/stark/tools/phonetic/transcription/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+from .protocol import IpaProvider
+from .espeak import EspeakIpaProvider
+from .ipa2lat import ipa2lat
+from functools import lru_cache
+@lru_cache
+def transcription(
+    string: str,
+    language_code: str,
+    ipa_provider: IpaProvider = EspeakIpaProvider(),
+) -> str:
+    """
+    Transcribe a string, word by word, into a simplified latin spelling.
+    The input is split on whitespace; every word is converted to IPA by
+    ``ipa_provider`` and then reduced to latin letters by ``ipa2lat``. The
+    per-word results are joined back together with single spaces.
+    Calls are memoised with ``lru_cache``, so every argument (including the
+    provider object) must be hashable.
+    Args:
+        string: The text to transcribe.
+        language_code: Language code handed to the IPA provider.
+        ipa_provider: Object implementing ``to_ipa`` (default: the single
+            ``EspeakIpaProvider`` instance created when this function is defined).
+    Returns:
+        The simplified latin transcription of ``string``.
+    """
+    latin_words: list[str] = []
+    for word in string.split():
+        ipa_word = ipa_provider.to_ipa(word, language_code)
+        latin_words.append(ipa2lat(ipa_word))
+    return " ".join(latin_words)

stark_engine-4.2.2/stark/tools/phonetic/transcription/epitran.py ADDED Viewed

@@ -0,0 +1,191 @@
+from typing import Any
+import warnings
+class EpitranIpaProvider:
+    """
+    IPA provider that delegates to the ``epitran`` package.
+    Epitran objects are created lazily, one per resolved language code, and
+    kept in a per-instance cache.
+    """
+    # Epitran code -> language (script). Defined once on the class instead of
+    # being rebuilt on every ``to_ipa`` call. Only the keys drive behaviour; the
+    # values appear solely in the "unsupported language" error message.
+    _SUPPORTED_LANGUAGES: dict[str, str] = {
+        "aar-Latn": "Afar",
+        "afr-Latn": "Afrikanns",
+        "aii-Syrc": "Assyrian Neo-Aramaic",
+        "amh-Ethi": "Amharic",
+        "amh-Ethi-pp": "Amharic (more phonetic)",
+        "amh-Ethi-red": "Amharic (reduced)",
+        "ara-Arab": "Literary Arabic",
+        "ava-Cyrl": "Avaric",
+        "aze-Cyrl": "Azerbaijani (Cyrillic)",
+        "aze-Latn": "Azerbaijani (Latin)",
+        "ben-Beng": "Bengali",
+        "ben-Beng-red": "Bengali (reduced)",
+        "ben-Beng-east": "East Bengali",
+        "bho-Deva": "Bhojpuri",
+        "bxk-Latn": "Bukusu",
+        "cat-Latn": "Catalan",
+        "ceb-Latn": "Cebuano",
+        "ces-Latn": "Czech",
+        "cjy-Latn": "Jin (Wiktionary)",
+        "ckb-Arab": "Sorani",
+        "cmn-Hans": "Mandarin (Simplified)*",
+        "cmn-Hant": "Mandarin (Traditional)*",
+        "cmn-Latn": "Mandarin (Pinyin)*",
+        "csb-Latn": "Kashubian",
+        "deu-Latn": "German",
+        "deu-Latn-np": "German†",
+        "deu-Latn-nar": "German (more phonetic)",
+        "eng-Latn": "English‡",
+        "epo-Latn": "Esperanto",
+        "est-Latn": "Estonian",
+        "fas-Arab": "Farsi (Perso-Arabic)",
+        "fin-Latn": "Finnish",
+        "fra-Latn": "French",
+        "fra-Latn-np": "French†",
+        "fra-Latn-p": "French (more phonetic)",
+        "ful-Latn": "Fulah",
+        "gan-Latn": "Gan (Wiktionary)",
+        "glg-Latn": "Galician",
+        "got-Goth": "Gothic",
+        "got-Latn": "Gothic (Latin)",
+        "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
+        "hat-Latn-bab": "Haitian (Latin-Babel)",
+        "hau-Latn": "Hausa",
+        "hin-Deva": "Hindi",
+        "hmn-Latn": "Hmong",
+        "hrv-Latn": "Croatian",
+        "hsn-Latn": "Xiang (Wiktionary)",
+        "hun-Latn": "Hungarian",
+        "ilo-Latn": "Ilocano",
+        "ind-Latn": "Indonesian",
+        "ita-Latn": "Italian",
+        "jam-Latn": "Jamaican",
+        "jav-Latn": "Javanese",
+        "jpn-Hira": "Japanese (Hiragana)",
+        "jpn-Hira-red": "red\tJapanese (Hiragana, reduced)",
+        "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
+        "jpn-Kana": "Japanese (Katakana)",
+        "jpn-Kana-red": "red\tJapanese (Katakana, reduced)",
+        "kat-Geor": "Georgian",
+        "kaz-Cyrl": "Kazakh (Cyrillic)",
+        "kaz-Cyrl-bab": "bab\tKazakh (Cyrillic—Babel)",
+        "kaz-Latn": "Kazakh (Latin)",
+        "kbd-Cyrl": "Kabardian",
+        "khm-Khmr": "Khmer",
+        "kin-Latn": "Kinyarwanda",
+        "kir-Arab": "Kyrgyz (Perso-Arabic)",
+        "kir-Cyrl": "Kyrgyz (Cyrillic)",
+        "kir-Latn": "Kyrgyz (Latin)",
+        "kmr-Latn": "Kurmanji",
+        "kmr-Latn-red": "Kurmanji (reduced)",
+        "kor-Hang": "Korean",
+        "lao-Laoo": "Lao",
+        "lao-Laoo-prereform": "Lao (Before spelling reform)",
+        "lav-Latn": "Latvian",
+        "lez-Cyrl": "Lezgian",
+        "lij-Latn": "Ligurian",
+        "lit-Latn": "Lithuanian",
+        "lsm-Latn": "Saamia",
+        "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
+        "lug-Latn": "Ganda / Luganda",
+        "mal-Mlym": "Malayalam",
+        "mar-Deva": "Marathi",
+        "mlt-Latn": "Maltese",
+        "mon-Cyrl-bab": "Mongolian (Cyrillic)",
+        "mri-Latn": "Maori",
+        "msa-Latn": "Malay",
+        "mya-Mymr": "Burmese",
+        "nan-Latn": "Hokkien (pe̍h-oē-jī)",
+        "nan-Latn-tl": "Hokkien (Tâi-lô)",
+        "nld-Latn": "Dutch",
+        "nya-Latn": "Chichewa",
+        "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
+        "ood-Latn-sax": "Tohono O'odham (Saxton)",
+        "ori-Orya": "Odia",
+        "orm-Latn": "Oromo",
+        "pan-Guru": "Punjabi (Eastern)",
+        "pol-Latn": "Polish",
+        "por-Latn": "Portuguese",
+        "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
+        "ron-Latn": "Romanian",
+        "run-Latn": "Rundi",
+        "rus-Cyrl": "Russian",
+        "sag-Latn": "Sango",
+        "sin-Sinh": "Sinhala",
+        "slv-Latn": "Slovene / Slovenian",
+        "sna-Latn": "Shona",
+        "som-Latn": "Somali",
+        "spa-Latn": "Spanish",
+        "spa-Latn-eu": "Spanish (Iberian)",
+        "sqi-Latn": "Albanian",
+        "sro-Latn": "Sardinian (Campidanese)",
+        "srp-Latn": "Serbian (Latin)",
+        "srp-Cyrl": "Serbian (Cyrillic)",
+        "swa-Latn": "Swahili",
+        "swa-Latn-red": "Swahili (reduced)",
+        "swe-Latn": "Swedish",
+        "tam-Taml": "Tamil",
+        "tam-Taml-red": "Tamil (reduced)",
+        "tel-Telu": "Telugu",
+        "tgk-Cyrl": "Tajik",
+        "tgl-Latn": "Tagalog",
+        "tgl-Latn-red": "Tagalog (reduced)",
+        "tha-Thai": "Thai",
+        "tir-Ethi": "Tigrinya",
+        "tir-Ethi-pp": "Tigrinya (more phonemic)",
+        "tir-Ethi-red": "Tigrinya (reduced)",
+        "tok-Latn": "Toki Pona",
+        "tpi-Latn": "Tok Pisin",
+        "tuk-Cyrl": "Turkmen (Cyrillic)",
+        "tuk-Latn": "Turkmen (Latin)",
+        "tur-Latn": "Turkish (Latin)",
+        "tur-Latn-bab": "Turkish (Latin—Babel)",
+        "tur-Latn-red": "Turkish (reduced)",
+        "ukr-Cyrl": "Ukrainian",
+        "urd-Arab": "Urdu",
+        "uig-Arab": "Uyghur (Perso-Arabic)",
+        "uzb-Cyrl": "Uzbek (Cyrillic)",
+        "uzb-Latn": "Uzbek (Latin)",
+        "vie-Latn": "Vietnamese",
+        "wuu-Latn": "Shanghainese Wu (Wiktionary)",
+        "xho-Latn": "Xhosa",
+        "yor-Latn": "Yoruba",
+        "yue-Latn": "Cantonese (Jyutping)",
+        "yue-Hant": "Cantonese (Character)",
+        "zha-Latn": "Zhuang",
+        "zul-Latn": "Zulu",
+    }
+    def __init__(self) -> None:
+        # Resolved language code -> Epitran object, filled on first use.
+        self._cache: dict[str, Any] = {}
+    def _epitran_obj(self, language_code: str) -> Any:
+        """Return the cached Epitran object for ``language_code``, creating it on first use."""
+        if language_code not in self._cache:
+            # Imported on first use rather than at module import time.
+            from epitran import Epitran
+            self._cache[language_code] = Epitran(language_code)
+        return self._cache[language_code]
+    def _resolve_language_code(self, language_code: str) -> str:
+        """
+        Map a caller-supplied language code onto a key of ``_SUPPORTED_LANGUAGES``.
+        ``"ru"`` is aliased to ``"rus-Cyrl"``. Any other code that is not an exact
+        key falls back to the first key that starts with it (with a warning).
+        Raises:
+            ValueError: If no supported key matches.
+        """
+        if language_code == "ru":
+            language_code = "rus-Cyrl"
+        if language_code in self._SUPPORTED_LANGUAGES:
+            return language_code
+        for key in self._SUPPORTED_LANGUAGES:
+            if key.startswith(language_code):
+                warnings.warn(
+                    f"Unsupported language code: {language_code}; trying to use similar key {key}"
+                )
+                return key
+        raise ValueError(
+            f"Unsupported language code: {language_code}; supported languages: {self._SUPPORTED_LANGUAGES}"
+        )
+    def to_ipa(self, string: str, language_code: str) -> str:
+        """
+        Convert ``string`` to IPA with Epitran.
+        Args:
+            string: Text to transliterate.
+            language_code: An Epitran code such as ``"deu-Latn"``, the alias
+                ``"ru"``, or a prefix of a supported code.
+        Returns:
+            The IPA transliteration, or ``""`` when ``string`` is empty or
+            whitespace-only.
+        Raises:
+            ValueError: If ``language_code`` cannot be resolved to a supported code.
+        """
+        # Resolve first so an unsupported code still raises, even for blank input.
+        resolved_code = self._resolve_language_code(language_code)
+        # The blank-input guard now applies to every resolved code; previously it
+        # was only reached when the code had to be resolved through a prefix match.
+        if not string.strip():
+            return ""
+        return self._epitran_obj(resolved_code).transliterate(string)

stark_engine-4.2.0/stark/tools/phonetic/espeak_ng.py → stark_engine-4.2.2/stark/tools/phonetic/transcription/espeak.py RENAMED Viewed

@@ -126,38 +126,20 @@ espeak: EspeakNG | None = None
 _espeak_lock = threading.Lock()
-def text_to_ipa(text: str, lang: str, check_chars: bool = True) -> str:
-    with _espeak_lock:
-        global espeak
-        if espeak is None:
-            espeak = EspeakNG(lang)
-        espeak.set_lang(lang)
-        ipa = espeak.text_to_ipa(text, remove_stress=True)
-        if check_chars:
-            for char in {"(", ")", "[", "]"}:
-                assert char not in ipa, (
-                    f"Unexpected character '{char}' in IPA '{ipa}' with lang '{lang}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
-                )
-        return ipa
-if __name__ == "__main__":
-    data = [
-        ("en", "Hello World"),
-        ("uk", "Привіт світ"),
-        ("fr", "Bonjour le monde"),
-        ("ru", "Привет мир"),
-        ("en", "Hello World"),
-        ("en", "Hello World"),
-        ("ru", "Привет мир"),
-        ("ru", "Привет мир"),
-        ("uk", "Привіт світ"),
-        ("uk", "Привіт світ"),
-        ("uk", "Привіт світ"),
-        ("en", "Hello, World"),
-        ("uk", "Привіт світ"),
-        ("en", "Hello! World"),
-        ("uk", "Привіт, світ"),
-    ]
-    for lang, text in data:
-        print(f"{lang.upper()}: '{text}' -> '{text_to_ipa(text, lang)}'")
+class EspeakIpaProvider:
+    """
+    IPA provider backed by the module-level ``espeak`` (EspeakNG) instance.
+    All instances share that single engine; access to it is serialised with
+    ``_espeak_lock``.
+    """
+    def __init__(self, check_chars: bool = True):
+        # When true, ``to_ipa`` rejects output containing parentheses or brackets.
+        self.check_chars = check_chars
+    def to_ipa(self, string: str, language_code: str) -> str:
+        """
+        Convert ``string`` to IPA (stress marks removed) for ``language_code``.
+        Raises:
+            AssertionError: If ``check_chars`` is enabled and the produced IPA
+                contains ``(``, ``)``, ``[`` or ``]``.
+        """
+        global espeak
+        with _espeak_lock:
+            # The shared engine is created on first use, then switched to the
+            # requested language before every conversion.
+            if espeak is None:
+                espeak = EspeakNG(language_code)
+            espeak.set_lang(language_code)
+            ipa = espeak.text_to_ipa(string, remove_stress=True)
+        if self.check_chars:
+            # Raised explicitly instead of using ``assert`` so the check is not
+            # stripped under ``python -O``; the exception type is unchanged.
+            # A tuple keeps the reported character deterministic.
+            for char in ("(", ")", "[", "]"):
+                if char in ipa:
+                    raise AssertionError(
+                        f"Unexpected character '{char}' in IPA '{ipa}' with lang '{language_code}'. Check if the language is supported by eSpeak NG. You can disable this check by setting check_chars=False."
+                    )
+        return ipa

stark_engine-4.2.2/stark/tools/phonetic/transcription/ipa2lat.py ADDED Viewed

@@ -0,0 +1,151 @@
+import warnings
+# IPA symbol -> simplified latin replacement, consumed by ipa2lat.
+# ipa2lat applies the entries one after another with str.replace, in insertion
+# order, so the order of this table is part of its behaviour. An empty value
+# removes the symbol from the output.
+_mapping = {
+    # Vowels
+    "i": "i",
+    "y": "i",
+    "ɨ": "i",
+    "ʉ": "u",
+    "ɯ": "u",
+    "u": "u",
+    "ɪ": "i",
+    "ʏ": "i",
+    "ʊ": "u",
+    "e": "e",
+    "ø": "e",
+    "ɘ": "e",
+    "ɵ": "o",
+    "ɤ": "o",
+    "o": "o",
+    "ə": "e",
+    "ɛ": "e",
+    "œ": "e",
+    "ɜ": "e",
+    "ɞ": "e",
+    "ʌ": "a",
+    "ɔ": "o",
+    "æ": "a",
+    "ɐ": "a",
+    "a": "a",
+    "ɶ": "a",
+    "ä": "a",
+    "ɑ": "a",
+    "ɒ": "o",
+    # Pulmonic Consonants
+    "p": "p",
+    "b": "b",
+    "t": "t",
+    "d": "d",
+    "ʈ": "t",
+    "ɖ": "d",
+    "c": "k",
+    "ɟ": "j",
+    "k": "k",
+    "g": "g",
+    "q": "k",
+    "ɢ": "g",
+    "ɡ": "g",
+    "m": "m",
+    "ɱ": "m",
+    "n": "n",
+    "ɳ": "n",
+    "ɲ": "nj",
+    "ŋ": "ng",
+    "ʋ": "v",
+    "ɹ": "r",
+    "ɻ": "r",
+    "j": "j",
+    "ɰ": "w",
+    "ʙ": "b",
+    "r": "r",
+    "ʀ": "r",
+    "ɾ": "r",
+    "ɸ": "f",
+    "β": "v",
+    "f": "f",
+    "v": "v",
+    "θ": "th",
+    "ð": "dh",
+    "s": "s",
+    "z": "z",
+    "ʃ": "sh",
+    "ʒ": "zh",
+    "ʂ": "sh",
+    "ʐ": "zh",
+    "ç": "h",
+    "ʝ": "j",
+    "x": "h",
+    "ʑ": "z",
+    "ɣ": "gh",
+    "χ": "h",
+    "ʁ": "gh",
+    "ħ": "h",
+    "ʕ": "a",
+    "h": "h",
+    # Clicks
+    # NOTE(review): the two "!" replacements below are outside a-z, so ipa2lat
+    # emits its "Unknown symbol" warning for them.
+    "ʘ": "o",
+    "ǀ": "l",
+    "ǃ": "!",
+    "ǂ": "!",
+    "ǁ": "l",
+    # Implosives and Ejectives
+    "ɓ": "b",
+    "ɗ": "d",
+    "ʄ": "j",
+    "ɠ": "g",
+    "ʛ": "g",
+    # Suprasegmentals
+    "ˈ": "",
+    "ˌ": "",
+    "ː": "",
+    "ˑ": "",
+    "|": "",
+    "‖": "",
+    ".": "",
+    "ʼ": "",
+    # Tones and word accents
+    "̋": "",
+    "́": "",
+    "̄": "",
+    "̀": "",
+    "̏": "",
+    "̌": "",
+    "̂": "",
+    "᷄": "",
+    "᷅": "",
+    "᷈": "",
+    "᷉": "",
+    # Other symbols and diacritics
+    "ʲ": "",
+    "ʷ": "w",
+    "ʱ": "h",
+    "ʰ": "h",
+    "ʴ": "r",
+    "ʳ": "r",
+    "ˠ": "g",
+    "ʡ": "a",
+    "ʢ": "a",
+    "ɭ": "l",
+    "_": "",
+    '"': "",
+    " ": "",
+}
+def ipa2lat(ipa_string: str) -> str:
+    """
+    Reduce an IPA string to a simplified latin spelling.
+    Every key of ``_mapping`` is replaced by its value, in table order. Each
+    character of the result that is not a lowercase a-z letter triggers a
+    warning but is left in the returned string.
+    Returns:
+        The converted string, or ``""`` for empty input.
+    """
+    if not ipa_string:
+        return ""
+    latin = ipa_string
+    for ipa_symbol, replacement in _mapping.items():
+        latin = latin.replace(ipa_symbol, replacement)
+    leftovers = (ch for ch in latin if ch not in "abcdefghijklmnopqrstuvwxyz")
+    for ch in leftovers:
+        warnings.warn(f'ipa2lat: Unknown symbol: "{ch}" in {latin} ({ipa_string})')
+    return latin

stark_engine-4.2.2/stark/tools/phonetic/transcription/protocol.py ADDED Viewed

@@ -0,0 +1,5 @@
+from typing import Protocol
+class IpaProvider(Protocol):
+    """Structural interface for objects that can convert text to IPA."""
+    def to_ipa(self, string: str, language_code: str) -> str:
+        """Return the IPA form of ``string`` for the given ``language_code``."""

stark_engine-4.2.2/stark/tools/sliding_window_parser.py ADDED Viewed

@@ -0,0 +1,139 @@
+import asyncio
+from typing import Awaitable, Callable
+from stark.core.patterns.parsing import ParseError
+from stark.tools.common.span import Span
+def _token_span_to_char_span(tokens: list[str], span: Span, phrase: str) -> Span:
+    """
+    Translate a span of token indices into a span of character offsets in ``phrase``.
+    Token positions are recovered by walking ``phrase`` from the left: whitespace
+    is skipped, then ``len(token)`` characters are taken for each token in turn.
+    Returns ``Span(0, 0)`` when ``tokens`` is empty or ``span`` does not satisfy
+    ``0 <= start <= end <= len(tokens)`` with ``start < len(tokens)``. An empty
+    token span (``start == end``) maps to an empty character span located at the
+    start of token ``start``.
+    """
+    token_count = len(tokens)
+    if token_count == 0:
+        return Span(0, 0)
+    if not (0 <= span.start <= span.end <= token_count):
+        return Span(0, 0)
+    phrase_len = len(phrase)
+    bounds: list[tuple[int, int]] = []
+    cursor = 0
+    for token in tokens:
+        # Move past the whitespace that separates this token from the previous one.
+        while cursor < phrase_len and phrase[cursor].isspace():
+            cursor += 1
+        token_end = cursor + len(token)
+        bounds.append((cursor, token_end))
+        cursor = token_end
+    if not (bounds and span.start < len(bounds) and span.end <= len(bounds)):
+        return Span(0, 0)
+    first_char = bounds[span.start][0]
+    if span.end > span.start:
+        last_char = bounds[span.end - 1][1]
+    else:
+        last_char = bounds[span.start][0]
+    return Span(first_char, last_char)
+async def _binary_cookie_trim[T](
+    tokens: list[str],
+    start: int,
+    end: int,
+    parser: Callable[[str], Awaitable[T]],
+    baseline_value: T,
+    phrase: str,
+) -> tuple[Span, str, T]:
+    """
+    Shrink the token window ``[start, end)`` while it still parses to ``baseline_value``.
+    Two binary searches are run: the first moves the left edge as far right as
+    possible, the second then moves the right edge as far left as possible. A
+    ``ParseError`` raised by ``parser`` counts as "does not match".
+    NOTE(review): binary search presumes that shrinking past the tightest window
+    always stops matching - confirm this holds for the parsers in use.
+    Returns:
+        ``(char_span, substring, baseline_value)`` where ``char_span`` is the
+        trimmed window as character offsets in ``phrase`` and ``substring`` is
+        ``phrase`` sliced by it.
+    """
+    async def matches_baseline(lo: int, hi: int):
+        # Parse tokens[lo:hi]; a ParseError is treated as a None result.
+        try:
+            parsed = await parser(" ".join(tokens[lo:hi]))
+        except ParseError:
+            parsed = None
+        return parsed == baseline_value
+    # Phase 1: largest start index for which tokens[left:end] still matches.
+    left = start
+    lo, hi = start, end - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if await matches_baseline(mid, end):
+            left = mid
+            lo = mid + 1
+        else:
+            hi = mid - 1
+    # Phase 2: smallest end index for which tokens[left:right] still matches.
+    right = end
+    lo, hi = left + 1, end
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if await matches_baseline(left, mid):
+            right = mid
+            hi = mid - 1
+        else:
+            lo = mid + 1
+    char_span = _token_span_to_char_span(tokens, Span(left, right), phrase)
+    return char_span, phrase[char_span.start : char_span.end], baseline_value
+async def sliding_window_parse[T](
+    phrase: str,
+    parser: Callable[[str], Awaitable[T]],
+    min_window: int = 1,
+    max_window: int | None = None,
+    concurrency: int | None = None,
+    find_one: bool = True,
+) -> list[tuple[Span, str, T]] | None:
+    """
+    Find sub-phrases of ``phrase`` that ``parser`` accepts.
+    The phrase is split on whitespace and windows of tokens are tried from the
+    largest size down to ``min_window``, left to right within each size. A window
+    counts as a hit when ``parser`` returns a non-``None`` value without raising
+    ``ParseError``; each hit is then trimmed with ``_binary_cookie_trim``.
+    Args:
+        phrase: Text to search.
+        parser: Async callable that parses a candidate string.
+        min_window: Smallest window size, in tokens.
+        max_window: Largest window size, in tokens (default: all tokens).
+        concurrency: If positive, parser calls made for window probing go through
+            an ``asyncio.Semaphore`` of this size. Windows are awaited one at a
+            time here, so the semaphore is held by at most one call at once.
+        find_one: Return after the first hit instead of collecting all hits.
+    Returns:
+        A list of ``(char_span, substring, value)`` tuples (one element when
+        ``find_one`` is true), or ``None`` when ``phrase`` has no tokens or
+        ``parser`` is ``None``.
+    Raises:
+        ParseError: If no window parses.
+    """
+    tokens: list[str] = phrase.split()
+    n: int = len(tokens)
+    if n == 0 or parser is None:
+        # Existing contract: nothing to parse yields None rather than ParseError.
+        return None
+    if max_window is None:
+        max_window = n
+    sem = (
+        asyncio.Semaphore(concurrency)
+        if concurrency is not None and concurrency > 0
+        else None
+    )
+    async def try_window(i: int, j: int) -> T | None:
+        # Parse tokens[i:j]; a ParseError is reported as None.
+        text = " ".join(tokens[i:j])
+        try:
+            if sem is None:
+                return await parser(text)
+            async with sem:
+                return await parser(text)
+        except ParseError:
+            return None
+    # Slide a window of decreasing size over the tokens, left to right.
+    # Once a window parses, trim it to the minimal window.
+    results: list[tuple[Span, str, T]] = []
+    for window_size in range(min(max_window, n), min_window - 1, -1):
+        for start in range(0, n - window_size + 1):
+            end = start + window_size
+            res = await try_window(start, end)
+            if res is None:
+                continue
+            char_span, substr, value = await _binary_cookie_trim(
+                tokens, start, end, parser, res, phrase
+            )
+            result = (char_span, substr, value)
+            if find_one:
+                return [result]
+            results.append(result)
+            # TODO: limit next windows left edge to char_span.end
+    if results:
+        return results
+    # If no valid window is found, raise an error.
+    raise ParseError(f"No valid window found using parser={parser} in phrase={phrase}")

stark_engine-4.2.0/stark/tools/phonetic/ipa.py DELETED Viewed

@@ -1,399 +0,0 @@
-from functools import lru_cache
-import warnings
-from stark.tools.phonetic import espeak_ng
-@lru_cache
-def phonetic(string: str, language_code: str):
-    """
-    Converts a string to simplified latin transcription via phonetic (ipa) transliteration.
-    """
-    return " ".join(
-        _ipa2lat(_to_ipa(word, language_code)) for word in string.split()
-    )  # TODO: try calling _to_ipa for the entire sentence
-def _to_ipa(string: str, language_code: str) -> str:
-    return _to_ipa__espeak_bin(string, language_code)
-def _ipa2lat(ipa_string: str) -> str:
-    """Converts IPA to a simplified latin transcription."""
-    return _ipa2lat__dict(ipa_string)
-# ----- Implementations: -----
-_mapping = {
-    # Vowels
-    "i": "i",
-    "y": "i",
-    "ɨ": "i",
-    "ʉ": "u",
-    "ɯ": "u",
-    "u": "u",
-    "ɪ": "i",
-    "ʏ": "i",
-    "ʊ": "u",
-    "e": "e",
-    "ø": "e",
-    "ɘ": "e",
-    "ɵ": "o",
-    "ɤ": "o",
-    "o": "o",
-    "ə": "e",
-    "ɛ": "e",
-    "œ": "e",
-    "ɜ": "e",
-    "ɞ": "e",
-    "ʌ": "a",
-    "ɔ": "o",
-    "æ": "a",
-    "ɐ": "a",
-    "a": "a",
-    "ɶ": "a",
-    "ä": "a",
-    "ɑ": "a",
-    "ɒ": "o",
-    # Pulmonic Consonants
-    "p": "p",
-    "b": "b",
-    "t": "t",
-    "d": "d",
-    "ʈ": "t",
-    "ɖ": "d",
-    "c": "k",
-    "ɟ": "j",
-    "k": "k",
-    "g": "g",
-    "q": "k",
-    "ɢ": "g",
-    "ɡ": "g",
-    "m": "m",
-    "ɱ": "m",
-    "n": "n",
-    "ɳ": "n",
-    "ɲ": "nj",
-    "ŋ": "ng",
-    "ʋ": "v",
-    "ɹ": "r",
-    "ɻ": "r",
-    "j": "j",
-    "ɰ": "w",
-    "ʙ": "b",
-    "r": "r",
-    "ʀ": "r",
-    "ɾ": "r",
-    "ɸ": "f",
-    "β": "v",
-    "f": "f",
-    "v": "v",
-    "θ": "th",
-    "ð": "dh",
-    "s": "s",
-    "z": "z",
-    "ʃ": "sh",
-    "ʒ": "zh",
-    "ʂ": "sh",
-    "ʐ": "zh",
-    "ç": "h",
-    "ʝ": "j",
-    "x": "h",
-    "ʑ": "z",
-    "ɣ": "gh",
-    "χ": "h",
-    "ʁ": "gh",
-    "ħ": "h",
-    "ʕ": "a",
-    "h": "h",
-    # Clicks
-    "ʘ": "o",
-    "ǀ": "l",
-    "ǃ": "!",
-    "ǂ": "!",
-    "ǁ": "l",
-    # Implosives and Ejectives
-    "ɓ": "b",
-    "ɗ": "d",
-    "ʄ": "j",
-    "ɠ": "g",
-    "ʛ": "g",
-    # Suprasegmentals
-    "ˈ": "",
-    "ˌ": "",
-    "ː": "",
-    "ˑ": "",
-    "|": "",
-    "‖": "",
-    ".": "",
-    "ʼ": "",
-    # Tones and word accents
-    "̋": "",
-    "́": "",
-    "̄": "",
-    "̀": "",
-    "̏": "",
-    "̌": "",
-    "̂": "",
-    "᷄": "",
-    "᷅": "",
-    "᷈": "",
-    "᷉": "",
-    # Other symbols and diacritics
-    "ʲ": "",
-    "ʷ": "w",
-    "ʱ": "h",
-    "ʰ": "h",
-    "ʴ": "r",
-    "ʳ": "r",
-    "ˠ": "g",
-    "ʡ": "a",
-    "ʢ": "a",
-    "ɭ": "l",
-    "_": "",
-    '"': "",
-    " ": "",
-}
-def _ipa2lat__dict(ipa_string: str) -> str:
-    if not ipa_string:
-        return ""
-    string = ipa_string[:]
-    for ipa, simple in _mapping.items():
-        string = string.replace(ipa, simple)
-    for symbol in string:
-        if symbol not in "abcdefghijklmnopqrstuvwxyz":
-            warnings.warn(
-                f'SuggestionsManager._ipa_to_latin: Unknown symbol: "{symbol}" in {string} ({ipa_string})'
-            )
-    return string
-def _to_ipa__espeak_bin(string: str, language_code: str) -> str:
-    return espeak_ng.text_to_ipa(string, language_code)
-# def to_ipa__espeak_cli(string: str, language_code: str) -> str:
-#     import re
-#     import subprocess
-#     result = subprocess.run(
-#         ["espeak-ng", "--ipa", f"-v{language_code}", "-q", string],
-#         capture_output=True,
-#         text=True,
-#     )
-#     return re.compile(r"\(.*?\)").sub("", result.stdout.strip()).strip()
-# @lru_cache
-# def _epitran_obj(language_code: str) -> Epitran:
-#     from epitran import Epitran
-#     return Epitran(language_code)  # this one is long, about an entire second
-# def to_ipa__epitran(string: str, language_code: str) -> str:
-#     # if language_code.startswith("en"):
-#     #     raise NotImplementedError(
-#     #         "IPA to Epitran conversion for English is not implemented yet."
-#     #     )
-#     if language_code == "ru":
-#         language_code = "rus-Cyrl"
-#     # Code:	Language (Script)
-#     supported_languages = {
-#         "aar-Latn": "Afar",
-#         "afr-Latn": "Afrikanns",
-#         "aii-Syrc": "Assyrian Neo-Aramaic",
-#         "amh-Ethi": "Amharic",
-#         "amh-Ethi-pp": "Amharic (more phonetic)",
-#         "amh-Ethi-red": "Amharic (reduced)",
-#         "ara-Arab": "Literary Arabic",
-#         "ava-Cyrl": "Avaric",
-#         "aze-Cyrl": "Azerbaijani (Cyrillic)",
-#         "aze-Latn": "Azerbaijani (Latin)",
-#         "ben-Beng": "Bengali",
-#         "ben-Beng-red": "Bengali (reduced)",
-#         "ben-Beng-east": "East Bengali",
-#         "bho-Deva": "Bhojpuri",
-#         "bxk-Latn": "Bukusu",
-#         "cat-Latn": "Catalan",
-#         "ceb-Latn": "Cebuano",
-#         "ces-Latn": "Czech",
-#         "cjy-Latn": "Jin (Wiktionary)",
-#         "ckb-Arab": "Sorani",
-#         "cmn-Hans": "Mandarin (Simplified)*",
-#         "cmn-Hant": "Mandarin (Traditional)*",
-#         "cmn-Latn": "Mandarin (Pinyin)*",
-#         "csb-Latn": "Kashubian",
-#         "deu-Latn": "German",
-#         "deu-Latn-np": "German†",
-#         "deu-Latn-nar": "German (more phonetic)",
-#         "eng-Latn": "English‡",
-#         "epo-Latn": "Esperanto",
-#         "est-Latn": "Estonian",
-#         "fas-Arab": "Farsi (Perso-Arabic)",
-#         "fin-Latn": "Finnish",
-#         "fra-Latn": "French",
-#         "fra-Latn-np": "French†",
-#         "fra-Latn-p": "French (more phonetic)",
-#         "ful-Latn": "Fulah",
-#         "gan-Latn": "Gan (Wiktionary)",
-#         "glg-Latn": "Galician",
-#         "got-Goth": "Gothic",
-#         "got-Latn": "Gothic (Latin)",
-#         "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
-#         "hat-Latn-bab": "Haitian (Latin-Babel)",
-#         "hau-Latn": "Hausa",
-#         "hin-Deva": "Hindi",
-#         "hmn-Latn": "Hmong",
-#         "hrv-Latn": "Croatian",
-#         "hsn-Latn": "Xiang (Wiktionary)",
-#         "hun-Latn": "Hungarian",
-#         "ilo-Latn": "Ilocano",
-#         "ind-Latn": "Indonesian",
-#         "ita-Latn": "Italian",
-#         "jam-Latn": "Jamaican",
-#         "jav-Latn": "Javanese",
-#         "jpn-Hira": "Japanese (Hiragana)",
-#         "jpn-Hira-red": "red	Japanese (Hiragana, reduced)",
-#         "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
-#         "jpn-Kana": "Japanese (Katakana)",
-#         "jpn-Kana-red": "red	Japanese (Katakana, reduced)",
-#         "kat-Geor": "Georgian",
-#         "kaz-Cyrl": "Kazakh (Cyrillic)",
-#         "kaz-Cyrl-bab": "bab	Kazakh (Cyrillic—Babel)",
-#         "kaz-Latn": "Kazakh (Latin)",
-#         "kbd-Cyrl": "Kabardian",
-#         "khm-Khmr": "Khmer",
-#         "kin-Latn": "Kinyarwanda",
-#         "kir-Arab": "Kyrgyz (Perso-Arabic)",
-#         "kir-Cyrl": "Kyrgyz (Cyrillic)",
-#         "kir-Latn": "Kyrgyz (Latin)",
-#         "kmr-Latn": "Kurmanji",
-#         "kmr-Latn-red": "Kurmanji (reduced)",
-#         "kor-Hang": "Korean",
-#         "lao-Laoo": "Lao",
-#         "lao-Laoo-prereform": "Lao (Before spelling reform)",
-#         "lav-Latn": "Latvian",
-#         "lez-Cyrl": "Lezgian",
-#         "lij-Latn": "Ligurian",
-#         "lit-Latn": "Lithuanian",
-#         "lsm-Latn": "Saamia",
-#         "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
-#         "lug-Latn": "Ganda / Luganda",
-#         "mal-Mlym": "Malayalam",
-#         "mar-Deva": "Marathi",
-#         "mlt-Latn": "Maltese",
-#         "mon-Cyrl-bab": "Mongolian (Cyrillic)",
-#         "mri-Latn": "Maori",
-#         "msa-Latn": "Malay",
-#         "mya-Mymr": "Burmese",
-#         "nan-Latn": "Hokkien (pe̍h-oē-jī)",
-#         "nan-Latn-tl": "Hokkien (Tâi-lô)",
-#         "nld-Latn": "Dutch",
-#         "nya-Latn": "Chichewa",
-#         "ood-Latn-alv": "Tohono O'odham (Alvarez–Hale)",
-#         "ood-Latn-sax": "Tohono O'odham (Saxton)",
-#         "ori-Orya": "Odia",
-#         "orm-Latn": "Oromo",
-#         "pan-Guru": "Punjabi (Eastern)",
-#         "pol-Latn": "Polish",
-#         "por-Latn": "Portuguese",
-#         "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
-#         "ron-Latn": "Romanian",
-#         "run-Latn": "Rundi",
-#         "rus-Cyrl": "Russian",
-#         "sag-Latn": "Sango",
-#         "sin-Sinh": "Sinhala",
-#         "slv-Latn": "Slovene / Slovenian",
-#         "sna-Latn": "Shona",
-#         "som-Latn": "Somali",
-#         "spa-Latn": "Spanish",
-#         "spa-Latn-eu": "Spanish (Iberian)",
-#         "sqi-Latn": "Albanian",
-#         "sro-Latn": "Sardinian (Campidanese)",
-#         "srp-Latn": "Serbian (Latin)",
-#         "srp-Cyrl": "Serbian (Cyrillic)",
-#         "swa-Latn": "Swahili",
-#         "swa-Latn-red": "Swahili (reduced)",
-#         "swe-Latn": "Swedish",
-#         "tam-Taml": "Tamil",
-#         "tam-Taml-red": "Tamil (reduced)",
-#         "tel-Telu": "Telugu",
-#         "tgk-Cyrl": "Tajik",
-#         "tgl-Latn": "Tagalog",
-#         "tgl-Latn-red": "Tagalog (reduced)",
-#         "tha-Thai": "Thai",
-#         "tir-Ethi": "Tigrinya",
-#         "tir-Ethi-pp": "Tigrinya (more phonemic)",
-#         "tir-Ethi-red": "Tigrinya (reduced)",
-#         "tok-Latn": "Toki Pona",
-#         "tpi-Latn": "Tok Pisin",
-#         "tuk-Cyrl": "Turkmen (Cyrillic)",
-#         "tuk-Latn": "Turkmen (Latin)",
-#         "tur-Latn": "Turkish (Latin)",
-#         "tur-Latn-bab": "Turkish (Latin—Babel)",
-#         "tur-Latn-red": "Turkish (reduced)",
-#         "ukr-Cyrl": "Ukrainian",
-#         "urd-Arab": "Urdu",
-#         "uig-Arab": "Uyghur (Perso-Arabic)",
-#         "uzb-Cyrl": "Uzbek (Cyrillic)",
-#         "uzb-Latn": "Uzbek (Latin)",
-#         "vie-Latn": "Vietnamese",
-#         "wuu-Latn": "Shanghainese Wu (Wiktionary)",
-#         "xho-Latn": "Xhosa",
-#         "yor-Latn": "Yoruba",
-#         "yue-Latn": "Cantonese (Jyutping)",
-#         "yue-Hant": "Cantonese (Character)",
-#         "zha-Latn": "Zhuang",
-#         "zul-Latn": "Zulu",
-#     }
-#     if language_code not in supported_languages:
-#         for key in supported_languages:
-#             if key.startswith(language_code):
-#                 warnings.warn(
-#                     f"Unsupported language code: {language_code}; trying to use similar key {key}"
-#                 )
-#                 language_code = key
-#                 break
-#         else:
-#             raise ValueError(
-#                 f"Unsupported language code: {language_code}; supported languages: {supported_languages}"
-#             )
-#         if not string.strip():
-#             return ""
-#     return _epitran_obj(language_code).transliterate(string)
-if __name__ == "__main__":
-    pass
-    # print("Starting...")
-    # print(to_ipa__epitran("Hello", "eng-Latn"))
-    # print("Two more...")
-    # print(to_ipa__epitran("Hello", "eng-Latn"))
-    # print(to_ipa__epitran("Hello", "eng-Latn"))
-    # print("Київ", to_ipa("Київ", "ua"))
-    # print("Київ", to_ipa("Київ", "uk"))
-    # test_cases = [
-    #     'Привет Иван как у тебя делая',
-    #     'любимые занятия надо делать часто',
-    #     'Съешь ещё этих мягких французских булок да выпей чаю',
-    #     'Хай',
-    #     'хай',
-    #     'Хэллоу',
-    #     'хэллоу',
-    #     'друг с другом',
-    #     'с пути фай',
-    # ]
-    # for test_case in test_cases:
-    #     print((ipa := to_ipa__epitran(test_case, 'rus-Cyrl')), to_ipa__espeak(test_case, 'ru'), ipa2lat__ipapy(ipa), ipa2lat__dict(ipa), sep=' || ')

{stark_engine-4.2.0 → stark_engine-4.2.2}/LICENSE.md RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/README.md RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/command.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_context.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/commands_manager.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/parsing.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/pattern.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/patterns/rules.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/number.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/object.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/slots.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/string.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/time_interval.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/core/types/word.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/blockage_detector.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/classproperty.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/dependencies.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/general/json_encoder.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/gcloud.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/protocols.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/silero.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/interfaces/vosk.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/common/span.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/!examples.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/models.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/nl_dictionary_name.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_memory.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/dictionary/storage/storage_sqlite.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyi RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/levenshtein/levenshtein.pyx RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/phonetic/simplephone.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/tools/strtools.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/__init__.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/mode.py RENAMED Viewed

File without changes

{stark_engine-4.2.0 → stark_engine-4.2.2}/stark/voice_assistant/voice_assistant.py RENAMED Viewed

File without changes

stark-engine 4.2.0__tar.gz → 4.2.2__tar.gz

stark-engine 4.2.0__tar.gz → 4.2.2__tar.gz