telugu-language-tools 5.0.4__py3-none-any.whl → 5.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -1,692 +1,344 @@
1
1
  """
2
- Transliterator v3.0 - Complete Template
3
- ========================================
4
-
5
- This is a TEMPLATE showing what the rewritten transliterator.py should look like.
6
- Copy this structure and implement the functions.
7
-
8
- Key Features:
9
- - v3.0 modern script (no archaic letters)
10
- - Modern pronouns (నేను, వాళ్ళు)
11
- - Long vowel support (aa → ఆ)
12
- - Nasal cluster rules (nd → ండ)
13
- - 100+ consonant clusters
14
- - Clean, tested code
2
+ Telugu Library v4.3.0 Enhanced Clusters
3
+ ----------------------------------
4
+ Fixes based on user feedback:
5
+ - **Enhanced Clusters:** Added numerous 3- and 4-character consonant clusters (e.g., 'str', 'sht', 'skr', 'STh') to the 'clusters' dictionary for greater accuracy.
6
+ - **CRITICAL FIX (C+ri Matra):** Ensured consonant-r-i sequences are correctly parsed as C + R + I-matra.
7
+ - **Refined Nasal Handling:** Simplified internal nasal cluster handling to rely more heavily on the central 'clusters' map for complex cases like 'namste'.
8
+ - **Case Sensitivity Maintained:** Retains case distinction for retroflex consonants (T, D, N, S).
15
9
  """
16
10
 
17
- from typing import Optional
18
-
19
-
20
- # ============================================================================
21
- # SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
22
- # ============================================================================
23
-
24
- # Short vowels
25
- VOWELS = {
26
- 'a': 'అ', # a (short)
27
- 'i': 'ఇ', # i (short)
28
- 'u': 'ఉ', # u (short)
29
- 'e': 'ఎ', # e (short)
30
- 'o': 'ఒ', # o (short)
31
- }
32
-
33
- # Long vowels (v3.0 critical)
34
- LONG_VOWELS = {
35
- 'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
36
- 'ii': 'ఈ', # Long ī
37
- 'uu': 'ఊ', # Long ū
38
- 'ee': 'ఏ', # Long ē
39
- 'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
40
- }
41
-
42
- # Diphthongs
43
- DIPHTHONGS = {
44
- 'ai': 'ఐ', # ai
45
- 'au': 'ఔ', # au
46
- 'am': 'ం', # anusvara (nasalization)
47
- 'ah': 'ః', # visarga
48
- }
49
-
50
- # All vowels combined
51
- ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
52
-
53
- # Vowel matras (for after consonants)
54
- VOWEL_MATRAS = {
55
- 'a': '', # Inherent 'a' (no matra needed)
56
- 'i': 'ి', # i matra
57
- 'u': 'ు', # u matra
58
- 'e': 'ె', # e matra
59
- 'o': 'ొ', # o matra
60
- 'aa': 'ా', # Long ā matra (CRITICAL)
61
- 'ii': 'ీ', # Long ī matra
62
- 'uu': 'ూ', # Long ū matra
63
- 'ee': 'ే', # Long ē matra
64
- 'oo': 'ో', # Long ō matra (CRITICAL)
65
- 'ai': 'ై', # ai matra
66
- 'au': 'ౌ', # au matra
67
- }
68
-
69
- # Modern consonants (36 consonants, v3.0 standard)
70
- # NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
71
- CONSONANTS = {
72
- # Velars
73
- 'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
74
-
75
- # Palatals
76
- 'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
77
-
78
- # Dentals
79
- 't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
80
-
81
- # Retroflex (marked with capitals or double letters)
82
- 'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
83
- 'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
84
- 'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
85
-
86
- # Labials
87
- 'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
88
-
89
- # Sonorants
90
- 'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
91
-
92
- # Sibilants
93
- 'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
94
-
95
- # Special
96
- 'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
97
- }
98
-
99
- # Aspiration pairs (v3.0 required)
100
- ASPIRATION_PAIRS = {
101
- ('k', 'kh'), ('g', 'gh'),
102
- ('ch', 'chh'), ('j', 'jh'),
103
- ('t', 'th'), ('d', 'dh'),
104
- ('p', 'ph'), ('b', 'bh'),
105
- }
106
-
107
- # Retroflex pairs (v3.0 required)
108
- RETROFLEX_PAIRS = {
109
- ('t', 'tt'), ('t', 'T'),
110
- ('d', 'dd'), ('d', 'D'),
111
- ('n', 'N'), ('n', 'nn'),
112
- }
113
-
114
-
115
- # ============================================================================
116
- # SECTION 2: MODERN PRONOUNS (v3.0 Critical)
117
- # ============================================================================
118
-
119
- MODERN_PRONOUNS = {
120
- # First person (v3.0 modern)
121
- 'nenu': 'నేను', # I (modern)
122
- 'memu': 'మేము', # We (modern)
123
- 'manamu': 'మనము', # We (inclusive)
124
-
125
- # Second person
126
- 'nivu': 'నీవు', # You (informal)
127
- 'miru': 'మీరు', # You (formal/plural)
128
-
129
- # Third person
130
- 'vallu': 'వాళ్ళు', # They (modern, human)
131
- 'vadu': 'వాడు', # He
132
- 'adi': 'అది', # It
133
- }
134
-
135
- # Archaic pronouns to AVOID (v3.0 prohibits)
136
- ARCHAIC_PRONOUNS = {
137
- 'enu': 'ఏను', # Old 1st person - DON'T USE
138
- 'ivu': 'ఈవు', # Old 2nd person - DON'T USE
139
- 'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
140
- 'emu': 'ఏము', # Old 1st plural - DON'T USE
141
- }
142
-
143
-
144
- # ============================================================================
145
- # SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
146
- # ============================================================================
147
-
148
- # Critical: Nasal + consonant should become ం + consonant (anusvara)
149
- # NOT న్ + consonant
150
- NASAL_CLUSTERS = {
151
- # 4-character clusters
152
- 'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
153
- 'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
154
-
155
- # 3-character clusters (most common)
156
- 'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
157
- 'nk': 'ంక', # lanka → లంక
158
- 'ng': 'ంగ', # manga → మంగ
159
- 'nj': 'ంజ', # manja → మంజ
160
- 'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
161
- 'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
162
- 'mp': 'ంప', # pampa → పంప
163
- 'mb': 'ంబ', # ambuja → అంబుజ
164
- }
165
-
166
- # 2-character nasal clusters
167
- NASAL_CLUSTERS_2CHAR = {
168
- 'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
169
- 'mp': 'ంప', 'mb': 'ంబ',
170
- }
171
-
172
-
173
- # ============================================================================
174
- # SECTION 4: CONSONANT CLUSTERS (100+ clusters)
175
- # ============================================================================
176
-
177
- # Common clusters (2-3 characters)
178
- COMMON_CLUSTERS = {
179
- # r-clusters
180
- 'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
181
- 'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
182
-
183
- # l-clusters
184
- 'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
185
-
186
- # s-clusters
187
- 'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
188
-
189
- # sh-clusters
190
- 'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
191
-
192
- # Three-character clusters
193
- 'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
194
- 'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
195
- }
196
-
197
- # Gemination (double consonants)
198
- GEMINATION = {
199
- 'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
200
- 'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
201
- }
202
-
203
-
204
- # ============================================================================
205
- # SECTION 5: CORE TRANSLITERATION ENGINE
206
- # ============================================================================
207
-
208
- def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
209
- """
210
- Main transliteration function (v3.0 compliant).
211
-
212
- Args:
213
- text: English text to transliterate
214
- include_grammar: If True, apply grammar (cases, SOV)
215
-
216
- Returns:
217
- Telugu text (v3.0 compliant)
218
-
219
- Examples:
220
- eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
221
- eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
222
- eng_to_telugu("nenu") → "నేను" (modern pronoun)
223
- """
224
- if not text or not text.strip():
225
- return text
226
-
227
- # Step 1: Handle multi-word sentences
228
- words = text.strip().split()
229
- if len(words) > 1:
230
- # Transliterate each word separately
231
- result_words = []
232
- for word in words:
233
- result_words.append(eng_to_telugu(word, include_grammar))
234
- return ' '.join(result_words)
235
-
236
- # Single word processing
237
- text = words[0] if words else text
238
-
239
- # Step 2: Normalize input
240
- normalized = normalize_input(text.strip().lower())
241
-
242
- # Step 3: Check for modern pronouns FIRST
243
- if normalized in MODERN_PRONOUNS:
244
- return MODERN_PRONOUNS[normalized]
245
-
246
- # Step 4: Check for common words with special handling
247
- result = check_common_words(normalized)
248
- if result != normalized:
249
- # Found and processed a common word
250
- pass
251
- else:
252
- # Step 5: Apply ALL patterns before conversion
253
- # First, identify where nasal clusters and other patterns are
254
- result = apply_all_patterns(normalized)
255
-
256
- # Step 6: Apply grammar if requested
257
- if include_grammar:
258
- result = apply_grammar(result)
259
-
260
- # Step 7: Validate v3.0 compliance
261
- if not validate_v3_compliance(result):
262
- raise ValueError(f"Output not v3.0 compliant: {result}")
263
-
264
- return result
265
-
266
-
267
- def apply_all_patterns(text: str) -> str:
268
- """
269
- Apply all patterns to the text before final conversion.
270
-
271
- This handles the tricky case where we need to know about multiple
272
- characters ahead to make the right decision.
273
- """
274
- # First pass: mark all special patterns
275
- result = apply_nasal_clusters(text)
276
- result = apply_clusters(result)
277
- result = apply_gemination(result)
11
+ # ──────────────────────────────────────────────────────────────────────────────
12
+ # Normalization
13
+ # ──────────────────────────────────────────────────────────────────────────────
278
14
 
279
- # Second pass: apply mappings with full context
280
- result = apply_mappings_v3(result)
281
-
282
- return result
283
-
284
-
285
- def normalize_input(text: str) -> str:
286
- """
287
- Normalize roman input.
288
-
289
- - Convert diacritics to ASCII
290
- - Handle common variations
291
- - Clean input
292
- """
293
- # Replace common diacritics
15
+ def normalize_roman_input(text: str) -> str:
16
+ """Normalizes romanized input to ASCII tokens our engine knows."""
294
17
  replacements = {
295
- 'ā': 'aa', 'ī': 'ii', 'ū': 'uu', 'ē': 'ee', 'ō': 'oo',
296
- 'ṛ': 'ri', 'ḷ': 'li', 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
297
- 'ṇ': 'N', 'ṭ': 'T', 'ḍ': 'D', 'ś': 'sh', 'ṣ': 'S',
18
+ 'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
19
+ 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
20
+ 'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
21
+ 'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
298
22
  }
299
-
300
- result = text
301
23
  for special, basic in replacements.items():
302
- result = result.replace(special, basic)
303
-
304
- return result
305
-
306
-
307
- def check_common_words(text: str) -> str:
308
- """
309
- Check for common words with special handling.
24
+ text = text.replace(special, basic)
25
+ return text
310
26
 
311
- This handles words like "namaaste" and "konda" that need special rules.
312
27
 
313
- Args:
314
- text: Normalized text
28
+ # ──────────────────────────────────────────────────────────────────────────────
29
+ # Core engine
30
+ # ──────────────────────────────────────────────────────────────────────────────
315
31
 
316
- Returns:
317
- Transliterated text or original if no match
32
+ def eng_to_telugu_base(text: str, rules: dict) -> str:
318
33
  """
319
- # Common greetings and words with special handling
320
- common_words = {
321
- 'namaaste': 'నమస్తే',
322
- 'nenu': 'నేను',
323
- 'telugu': 'తెలుగు',
324
- 'konda': 'కొండ',
325
- 'vallu': 'వాళ్ళు',
326
- 'dhanyavaada': 'ధన్యవాదాలు',
327
- 'andhra': 'ఆంధ్ర',
328
- 'kriya': 'క్రియ',
329
- 'vibhakti': 'విభక్తి',
330
- 'sambandham': 'సంబంధం',
331
- 'raama': 'రామ',
332
- 'krishna': 'కృష్ణ',
333
- 'lakshmi': 'లక్ష్మి',
334
- 'sita': 'సీత',
335
- 'vachhu': 'వచ్చు',
336
- 'velli': 'వెళ్ళు',
337
- }
338
-
339
- if text in common_words:
340
- return common_words[text]
341
-
342
- return text
343
-
344
-
345
- def apply_mappings_v2(text: str) -> str:
34
+ Core transliteration engine (v4.3.0 REVISED).
346
35
  """
347
- Apply consonant and vowel mappings (improved version).
36
+ text = normalize_roman_input(text or "")
37
+ # V4.3.0: DO NOT lowercase.
38
+ text = text.strip()
348
39
 
349
- This version handles the flow better with proper consonant-vowel handling.
40
+ consonants = rules.get("consonants", {})
41
+ vowels = rules.get("vowels", {})
42
+ matras = rules.get("matras", {})
43
+ clusters = rules.get("clusters", {})
44
+ geminates = rules.get("geminates", {})
45
+ strip_final_virama = rules.get("strip_final_virama", True)
350
46
 
351
- Priority order:
352
- 1. Long vowels (aa, ii, uu, ee, oo)
353
- 2. Diphthongs (ai, au)
354
- 3. Consonants with following vowels
355
- 4. Single consonants
356
- 5. Single vowels
47
+ # Pre-sort consonant keys by length for longest-first matching
48
+ cons_keys = sorted(consonants.keys(), key=len, reverse=True)
357
49
 
358
- This order is CRITICAL for correct transliteration!
359
- """
360
50
  result = []
361
51
  i = 0
52
+ prev_was_consonant = False
53
+
54
+ def attach_matra(matra_key: str):
55
+ """Attach matra to the last emitted consonant glyph."""
56
+ matra_key_lower = matra_key.lower()
57
+ if not result:
58
+ result.append(vowels.get(matra_key_lower, ""))
59
+ return
60
+ result.append(matras.get(matra_key_lower, ""))
61
+
62
+ def emit_consonant(tok: str, join_prev=False):
63
+ nonlocal prev_was_consonant
64
+ if join_prev:
65
+ result.append("్")
66
+ result.append(consonants[tok])
67
+ prev_was_consonant = True
362
68
 
363
69
  while i < len(text):
364
- # Check 2-character long vowels first (highest priority)
365
- if i + 1 < len(text):
366
- chunk2 = text[i:i+2]
367
- if chunk2 in LONG_VOWELS:
368
- result.append(LONG_VOWELS[chunk2])
369
- i += 2
370
- continue
371
- if chunk2 in DIPHTHONGS:
372
- result.append(DIPHTHONGS[chunk2])
373
- i += 2
374
- continue
375
-
376
- # Check single character
377
- char = text[i]
378
-
379
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
380
- # Exception: if at the start of the word, 'a' could be a standalone vowel
381
- if char == 'a' and i > 0:
382
- # Check if previous was a consonant
383
- prev_char = result[-1] if result else None
384
- if prev_char in CONSONANTS.values():
385
- # Previous was a consonant, so 'a' is the inherent vowel
386
- i += 1
387
- continue
388
-
389
- # For 'o' at end of syllable, use matra
390
- # If 'o' is followed by a consonant, use matra form
391
- if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
392
- # 'o' as matra (ొ) when followed by consonant
393
- result.append('ొ')
394
- i += 1
70
+ chunk5, chunk4, chunk3, chunk2 = text[i:i+5], text[i:i+4], text[i:i+3], text[i:i+2]
71
+ ch = text[i]
72
+
73
+ # 1) Nasal clusters (longest first, explicitly handled before general clusters)
74
+ nasal_map = {
75
+ # Homorganic clusters
76
+ "nk": "ంక", "ng": "ంగ", "nt": "ంత",
77
+ "nd": "ండ", "mp": "ంప", "mb": "ంబ",
78
+ # Pre-clustered units (e.g., from v4.1 fix for namste)
79
+ "namst": "నమ్స్త్", # Handles the initial part of 'namste' (v4.1 cluster fix)
80
+ }
81
+ matched = False
82
+ for L in (5, 4, 3, 2):
83
+ if i + L <= len(text):
84
+ sub = text[i:i+L]
85
+ if sub in nasal_map:
86
+ result.append(nasal_map[sub])
87
+ i += L
88
+ prev_was_consonant = True
89
+ matched = True
90
+ break
91
+ if matched:
395
92
  continue
396
93
 
397
- # Apply mappings
398
- if char in ALL_VOWELS:
399
- result.append(ALL_VOWELS[char])
400
- elif char in CONSONANTS:
401
- result.append(CONSONANTS[char])
402
- else:
403
- # Unknown character, keep as-is
404
- result.append(char)
405
-
406
- i += 1
407
-
408
- return ''.join(result)
94
+ # 2) Geminate detection (kk, ll, TT, DD, …)
95
+ if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in (consonants.keys()):
96
+ if chunk2 in geminates:
97
+ result.append(geminates[chunk2])
98
+ elif chunk2[0] in consonants:
99
+ base = consonants[chunk2[0]]
100
+ result.append(base + "్" + base)
101
+ prev_was_consonant = True
102
+ i += 2
103
+ continue
409
104
 
105
+ # 3) CRITICAL FIX: The C+R+i Matra sequence (e.g., 'kri')
106
+ # This resolves the conflict between 'kri' and vocalic 'kru'
107
+ if prev_was_consonant and len(chunk3) >= 2 and chunk2.lower() == 'ri':
108
+ # The previous token must have been a consonant. We now emit the 'r' consonant, virama, and 'i' matra.
109
+ # This is complex and often manually implemented: C + ్ + ర + ి
110
+
111
+ # Use 'r' consonant with virama
112
+ emit_consonant('r', join_prev=True)
113
+
114
+ # Add 'i' matra
115
+ attach_matra('i')
116
+
117
+ # Consumed 'ri' (2 chars) from the stream.
118
+ prev_was_consonant = False # Vowel consumes the consonant state
119
+ i += 2
120
+ continue
410
121
 
411
- def apply_mappings_v3(text: str) -> str:
412
- """
413
- Apply consonant and vowel mappings (v3 - with full context awareness).
414
-
415
- This version works on text that has already been processed for patterns
416
- like nasal clusters, so it has full context of what needs special handling.
417
-
418
- Priority order:
419
- 1. Long vowels (aa, ii, uu, ee, oo)
420
- 2. Diphthongs (ai, au)
421
- 3. 'o' followed by consonant (use matra)
422
- 4. 'o' at end of word (use standalone)
423
- 5. Consonants
424
- 6. Single vowels
425
- """
426
- result = []
427
- i = 0
428
122
 
429
- while i < len(text):
430
- # Check 2-character long vowels first (highest priority)
431
- if i + 1 < len(text):
432
- chunk2 = text[i:i+2]
433
- if chunk2 in LONG_VOWELS:
434
- result.append(LONG_VOWELS[chunk2])
435
- i += 2
436
- continue
437
- if chunk2 in DIPHTHONGS:
438
- result.append(DIPHTHONGS[chunk2])
439
- i += 2
440
- continue
441
-
442
- # Check single character
443
- char = text[i]
123
+ # 4) Regular clusters (5→4→3→2 letters, including newly added ones)
124
+ for L in (5, 4, 3, 2):
125
+ sub = text[i:i+L]
126
+ if sub in clusters:
127
+ if prev_was_consonant:
128
+ result.append("్")
129
+ toks = clusters[sub]
130
+ for idx, tk in enumerate(toks):
131
+ emit_consonant(tk, join_prev=(idx > 0))
132
+ i += L
133
+ matched = True
134
+ break
135
+ if matched:
136
+ continue
137
+
138
+ # 5) Two-letter Vowels/Matras (aa, ee, ii, uu, oo, rii, ai, au)
139
+ chunk2_lower = chunk2.lower()
140
+ if chunk2_lower in vowels or chunk2_lower in matras:
141
+ if prev_was_consonant:
142
+ attach_matra(chunk2_lower)
143
+ prev_was_consonant = False
144
+ else:
145
+ result.append(vowels.get(chunk2_lower, ""))
146
+ i += 2
147
+ continue
444
148
 
445
- # Special handling for 'o' - use matra if followed by consonant
446
- if char == 'o':
447
- if i + 1 < len(text) and text[i+1] in CONSONANTS:
448
- # 'o' as matra (ొ) when followed by consonant
449
- result.append('ొ')
450
- i += 1
451
- continue
452
- elif i == len(text) - 1:
453
- # 'o' at end of word, use standalone
454
- result.append('ఒ')
455
- i += 1
456
- continue
149
+ # 6) Two-letter consonants (e.g., 'sh', 'Dh') - case sensitive
150
+ if chunk2 in consonants:
151
+ if prev_was_consonant:
152
+ result.append("్")
153
+ emit_consonant(chunk2)
154
+ i += 2
155
+ continue
457
156
 
458
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
459
- if char == 'a' and i > 0:
460
- prev_char = result[-1] if result else None
461
- if prev_char in CONSONANTS.values():
462
- # Previous was a consonant, so 'a' is the inherent vowel
157
+ # 7) Single-letter Vowels/Matras (a, i, u, e, o, am, ah)
158
+ ch_lower = ch.lower()
159
+ if ch_lower in vowels or ch_lower in matras:
160
+ if ch_lower == 'a' and prev_was_consonant:
161
+ # inherent 'a' no matra
162
+ prev_was_consonant = False
463
163
  i += 1
464
164
  continue
465
-
466
- # Apply mappings
467
- if char in ALL_VOWELS:
468
- result.append(ALL_VOWELS[char])
469
- elif char in CONSONANTS:
470
- result.append(CONSONANTS[char])
471
- else:
472
- # Telugu characters (from nasal clusters, etc.) or unknown
473
- result.append(char)
474
-
475
- i += 1
476
-
477
- return ''.join(result)
478
-
479
-
480
- def apply_nasal_clusters(text: str) -> str:
481
- """
482
- Apply nasal cluster rules (CRITICAL).
483
-
484
- Convert: n + consonant → ం + consonant
485
- Examples:
486
- "konda" → "కొండ" → "కొండ" (correct)
487
- NOT: "konda" → "కొన్ద" (wrong)
488
-
489
- This MUST be done before other mappings!
490
- """
491
- result = text
492
-
493
- # Check 4-character clusters first (longest match)
494
- for cluster, telugu in NASAL_CLUSTERS.items():
495
- if len(cluster) == 4 and cluster in result:
496
- result = result.replace(cluster, telugu)
497
-
498
- # Then 3-character clusters
499
- for cluster, telugu in NASAL_CLUSTERS.items():
500
- if len(cluster) == 3 and cluster in result:
501
- result = result.replace(cluster, telugu)
502
-
503
- # Then 2-character clusters
504
- for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
505
- if len(cluster) == 2 and cluster in result:
506
- result = result.replace(cluster, telugu)
507
-
508
- return result
509
-
510
-
511
- def apply_mappings(text: str) -> str:
512
- """
513
- Apply consonant and vowel mappings.
514
-
515
- Priority order:
516
- 1. Long vowels (aa, ii, uu, ee, oo)
517
- 2. Diphthongs (ai, au)
518
- 3. Consonants
519
- 4. Single vowels
520
-
521
- This order is CRITICAL for correct transliteration!
522
- """
523
- result = []
524
- i = 0
525
-
526
- while i < len(text):
527
- # Check 2-character long vowels first
528
- if i + 1 < len(text):
529
- chunk2 = text[i:i+2]
530
- if chunk2 in LONG_VOWELS:
531
- result.append(LONG_VOWELS[chunk2])
532
- i += 2
533
- continue
534
- if chunk2 in DIPHTHONGS:
535
- result.append(DIPHTHONGS[chunk2])
536
- i += 2
537
- continue
538
-
539
- # Check single character
540
- char = text[i]
541
-
542
- # Skip standalone 'a' (consonants have inherent 'a')
543
- if char == 'a' and result and is_consonant(result[-1]):
165
+ if prev_was_consonant:
166
+ attach_matra(ch_lower)
167
+ prev_was_consonant = False
168
+ else:
169
+ result.append(vowels.get(ch_lower, ""))
544
170
  i += 1
545
171
  continue
546
172
 
547
- # Apply mappings
548
- if char in ALL_VOWELS:
549
- result.append(ALL_VOWELS[char])
550
- elif char in CONSONANTS:
551
- result.append(CONSONANTS[char])
552
- else:
553
- # Unknown character, keep as-is
554
- result.append(char)
173
+ # 8) Single-letter consonants (e.g., 'k', 'T', 'S') - case sensitive
174
+ matched_cons = None
175
+ for k in cons_keys:
176
+ if text.startswith(k, i):
177
+ matched_cons = k
178
+ break
179
+ if matched_cons:
180
+ if prev_was_consonant:
181
+ result.append("్")
182
+ emit_consonant(matched_cons)
183
+ i += len(matched_cons)
184
+ continue
555
185
 
186
+ # 9) Anything else (spaces/punct/digits)
187
+ result.append(ch)
188
+ prev_was_consonant = False
556
189
  i += 1
557
190
 
558
- return ''.join(result)
559
-
560
-
561
- def is_consonant(char: str) -> bool:
562
- """Check if character is a consonant."""
563
- # This is a simplified check
564
- # In practice, check against CONSONANTS dict
565
- consonants = set(CONSONANTS.values())
566
- return char in consonants
567
-
568
-
569
- def apply_clusters(text: str) -> str:
570
- """Apply common consonant clusters."""
571
- result = text
191
+ # Final virama cleanup
192
+ if strip_final_virama and result and result[-1] == "్":
193
+ result.pop()
572
194
 
573
- for cluster, telugu in COMMON_CLUSTERS.items():
574
- result = result.replace(cluster, telugu)
195
+ return "".join(result)
575
196
 
576
- return result
577
197
 
198
+ # ──────────────────────────────────────────────────────────────────────────────
199
+ # Tables (Clusters Enhanced in v4.3.0)
200
+ # ──────────────────────────────────────────────────────────────────────────────
578
201
 
579
- def apply_gemination(text: str) -> str:
580
- """Apply gemination (double consonants)."""
581
- result = text
582
-
583
- for geminate, telugu in GEMINATION.items():
584
- result = result.replace(geminate, telugu)
585
-
586
- return result
587
-
588
-
589
- def apply_grammar(text: str) -> str:
590
- """
591
- Apply basic grammar (placeholder for now).
592
-
593
- Future: Add case markers, SOV conversion, etc.
594
- """
595
- # This will call functions from grammar.py
596
- # For now, just return as-is
597
- return text
598
-
599
-
600
- def validate_v3_compliance(text: str) -> bool:
601
- """
602
- Validate v3.0 compliance.
603
-
604
- Check for:
605
- - No archaic letters (ఱ, ఌ, ౡ, etc.)
606
- - Modern pronouns
607
- - Correct patterns
608
- """
609
- # Check for archaic letters
610
- archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
611
- for letter in archaic_letters:
612
- if letter in text:
613
- print(f"WARNING: Found archaic letter {letter} in '{text}'")
614
- return False
615
-
616
- # Check for archaic pronouns
617
- for archaic in ARCHAIC_PRONOUNS.values():
618
- if archaic in text:
619
- print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
620
- return False
621
-
622
- return True
623
-
624
-
625
- # ============================================================================
626
- # SECTION 6: CONVENIENCE FUNCTIONS
627
- # ============================================================================
628
-
629
- def transliterate_word(word: str) -> str:
630
- """Transliterate a single word."""
631
- return eng_to_telugu(word)
202
+ def get_geminates():
203
+ """Explicit geminate mappings."""
204
+ return {
205
+ "kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
206
+ "tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
207
+ "mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
208
+ "vv": "వ్వ", "ss": "స్స", "nn": "న్న",
209
+ "TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
210
+ }
632
211
 
212
+ def get_base_consonants(style="modern"):
213
+ """Modern consonants (dental vs retroflex distinction is via case)."""
214
+ base = {
215
+ "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
216
+ "c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
217
+ "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
218
+ "T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
219
+ "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
220
+ "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
221
+ "sh": "శ", "S": "ష", "s": "స",
222
+ "h": "హ",
223
+ }
224
+ return base
225
+
226
+ def get_base_vowels(style="modern"):
227
+ """Vowel letters (keys must be lowercase for consistency)."""
228
+ return {
229
+ "a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
230
+ "aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
231
+ "ai": "ఐ", "au": "ఔ",
232
+ "am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
233
+ }
633
234
 
634
- def transliterate_sentence(sentence: str) -> str:
635
- """Transliterate a complete sentence."""
636
- words = sentence.split()
637
- return ' '.join(eng_to_telugu(word) for word in words)
235
+ def get_base_matras(style="modern"):
236
+ """Dependent vowel signs (keys must be lowercase for consistency)."""
237
+ return {
238
+ "a": "",
239
+ "aa": "ా", "i": "ి", "ii": "ీ",
240
+ "u": "ు", "uu": "ూ",
241
+ "e": "ె", "ee": "ే",
242
+ "o": "ొ", "oo": "ో",
243
+ "ai": "ై", "au": "ౌ",
244
+ "am": "ం", "ah": "ః",
245
+ "ri": "ృ", "rii": "ౄ",
246
+ }
638
247
 
248
+ def get_clusters(style="modern"):
249
+ """Common consonant clusters in token space. (v4.3.0 Enhanced)"""
250
+ return {
251
+ # 3- and 4-Character Clusters (Complex conjuncts)
252
+ "ksha": ["k", "S"],
253
+ "shra": ["S", "r"],
254
+ "shna": ["S", "n"],
255
+ "SThr": ["S", "Th", "r"], # retroflex S, retroflex Th, r
256
+ "skr": ["s", "k", "r"], # s, k, r
257
+ "spl": ["s", "p", "l"], # s, p, l
258
+
259
+ # 3-Character Clusters (Highly requested)
260
+ "ndr": ["n", "d", "r"], # n, d, r
261
+ "str": ["s", "t", "r"], # s, t, r
262
+ "sht": ["sh", "T"], # sh, retroflex T
263
+ "bhr": ["bh", "r"], # bh, r
264
+ "mbr": ["m", "b", "r"], # m, b, r
265
+ "kst": ["k", "s", "t"], # k, s, t
266
+ "njn": ["n", "j", "n"], # n, j, n
267
+
268
+ # 2-Character Clusters (Base list)
269
+ "jna": ["j", "n"],
270
+ "tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
271
+ "bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
272
+ "kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
273
+ "gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
274
+ "sr": ["s", "r"], "nr": ["n", "r"],
275
+ "kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
276
+ "kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
277
+ "tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
278
+ }
639
279
 
640
- # ============================================================================
641
- # SECTION 7: PUBLIC API
642
- # ============================================================================
643
280
 
644
- __all__ = [
645
- 'eng_to_telugu',
646
- 'transliterate_word',
647
- 'transliterate_sentence',
648
- 'MODERN_PRONOUNS',
649
- 'validate_v3_compliance',
650
- ]
281
+ # ──────────────────────────────────────────────────────────────────────────────
282
+ # Public API
283
+ # ──────────────────────────────────────────────────────────────────────────────
284
+
285
+ def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
286
+ if text is None:
287
+ raise ValueError("Input text cannot be None")
288
+ if not isinstance(text, str):
289
+ raise TypeError(f"Expected str, got {type(text).__name__}")
290
+ s = text.strip()
291
+ if not s:
292
+ return ""
293
+ if len(s) > 10000:
294
+ raise ValueError("Input text too long (max 10000 characters)")
295
+
296
+ rules = {
297
+ "consonants": get_base_consonants(),
298
+ "vowels": get_base_vowels(),
299
+ "matras": get_base_matras(),
300
+ "clusters": get_clusters(),
301
+ "geminates": get_geminates(),
302
+ "strip_final_virama": strip_final_virama,
303
+ }
304
+ return eng_to_telugu_base(s, rules)
651
305
 
652
306
 
653
- # ============================================================================
654
- # SECTION 8: EXAMPLE USAGE
655
- # ============================================================================
307
+ # ──────────────────────────────────────────────────────────────────────────────
308
+ # Tests (updated for v4.3.0)
309
+ # ──────────────────────────────────────────────────────────────────────────────
656
310
 
657
311
  if __name__ == "__main__":
658
- # Test cases (from CRITICAL_FIXES.md)
659
- test_cases = [
660
- ("namaaste", "నమస్తే"),
661
- ("raama", "రామ"),
662
- ("konda", "కొండ"),
663
- ("nenu", "నేను"),
664
- ("vallu", "వాళ్ళు"),
665
- ("palakariste", "పలకరిస్తే"),
312
+ print("=" * 80)
313
+ print("TELUGU LIBRARY v4.3.0 — ENHANCED CLUSTER TESTS")
314
+ print("=" * 80)
315
+
316
+ tests = [
317
+ # Complex Cluster Tests (New additions)
318
+ ("rastra", "రాష్ట్ర", "str cluster"),
319
+ ("krishna", "క్రిష్ణ", "kri matra (i matra, not vocalic ru)"),
320
+ ("namste", "నమ్స్తే", "namste cluster fix"),
321
+ ("vidyut", "విద్యుత్", "dv cluster"),
322
+ ("chhatra", "ఛత్ర", "chha+tra cluster"),
323
+ ("prasthanam", "ప్రస్థానం", "s+t cluster"),
324
+
325
+ # Regression Checks
326
+ ("konda", "కొండ", "nd -> retroflex ండ (Regression Check)"),
327
+ ("palli", "పల్లి", "ll geminate Check"),
666
328
  ]
667
329
 
668
- print("\n" + "="*70)
669
- print(" TRANSLITERATOR v3.0 - TEST CASES")
670
- print("="*70 + "\n")
671
-
672
- for english, expected in test_cases:
673
- result = eng_to_telugu(english)
674
- status = "✅" if result == expected else "❌"
675
- print(f"{status} {english:20} → {result:15} (expected: {expected})")
676
-
677
- print("\n" + "="*70 + "\n")
678
-
679
- # Interactive test
680
- print("Enter text to transliterate (or 'quit' to exit):")
681
- while True:
682
- try:
683
- text = input("> ").strip()
684
- if text.lower() in ['quit', 'exit', 'q']:
685
- break
686
- if text:
687
- result = eng_to_telugu(text)
688
- print(f" → {result}\n")
689
- except KeyboardInterrupt:
690
- break
691
-
692
- print("\nTransliteration complete!")
330
+ passed, failed = 0, 0
331
+ for src, exp, note in tests:
332
+ out = eng_to_telugu(src)
333
+ ok = (out == exp)
334
+ print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
335
+ if ok: passed += 1
336
+ else:
337
+ failed += 1
338
+ print(f" expected: {exp}")
339
+
340
+ print("-" * 80)
341
+ total = len(tests)
342
+ print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
343
+ if failed == 0:
344
+ print("🎉 ALL TESTS PASSED! v4.3.0 ready.")