telugu-language-tools 5.0.4__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -1,692 +1,374 @@
1
1
  """
2
- Transliterator v3.0 - Complete Template
3
- ========================================
4
-
5
- This is a TEMPLATE showing what the rewritten transliterator.py should look like.
6
- Copy this structure and implement the functions.
7
-
8
- Key Features:
9
- - v3.0 modern script (no archaic letters)
10
- - Modern pronouns (నేను, వాళ్ళు)
11
- - Long vowel support (aa → ఆ)
12
- - Nasal cluster rules (nd → ండ)
13
- - 100+ consonant clusters
14
- - Clean, tested code
15
- """
16
-
17
- from typing import Optional
18
-
19
-
20
- # ============================================================================
21
- # SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
22
- # ============================================================================
23
-
24
- # Short vowels
25
- VOWELS = {
26
- 'a': 'అ', # a (short)
27
- 'i': 'ఇ', # i (short)
28
- 'u': 'ఉ', # u (short)
29
- 'e': 'ఎ', # e (short)
30
- 'o': 'ఒ', # o (short)
31
- }
32
-
33
- # Long vowels (v3.0 critical)
34
- LONG_VOWELS = {
35
- 'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
36
- 'ii': 'ఈ', # Long ī
37
- 'uu': 'ఊ', # Long ū
38
- 'ee': 'ఏ', # Long ē
39
- 'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
40
- }
41
-
42
- # Diphthongs
43
- DIPHTHONGS = {
44
- 'ai': 'ఐ', # ai
45
- 'au': 'ఔ', # au
46
- 'am': 'ం', # anusvara (nasalization)
47
- 'ah': 'ః', # visarga
48
- }
49
-
50
- # All vowels combined
51
- ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
52
-
53
- # Vowel matras (for after consonants)
54
- VOWEL_MATRAS = {
55
- 'a': '', # Inherent 'a' (no matra needed)
56
- 'i': 'ి', # i matra
57
- 'u': 'ు', # u matra
58
- 'e': 'ె', # e matra
59
- 'o': 'ొ', # o matra
60
- 'aa': 'ా', # Long ā matra (CRITICAL)
61
- 'ii': 'ీ', # Long ī matra
62
- 'uu': 'ూ', # Long ū matra
63
- 'ee': 'ే', # Long ē matra
64
- 'oo': 'ో', # Long ō matra (CRITICAL)
65
- 'ai': 'ై', # ai matra
66
- 'au': 'ౌ', # au matra
67
- }
68
-
69
- # Modern consonants (36 consonants, v3.0 standard)
70
- # NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
71
- CONSONANTS = {
72
- # Velars
73
- 'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
74
-
75
- # Palatals
76
- 'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
77
-
78
- # Dentals
79
- 't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
80
-
81
- # Retroflex (marked with capitals or double letters)
82
- 'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
83
- 'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
84
- 'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
85
-
86
- # Labials
87
- 'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
88
-
89
- # Sonorants
90
- 'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
91
-
92
- # Sibilants
93
- 'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
94
-
95
- # Special
96
- 'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
97
- }
98
-
99
- # Aspiration pairs (v3.0 required)
100
- ASPIRATION_PAIRS = {
101
- ('k', 'kh'), ('g', 'gh'),
102
- ('ch', 'chh'), ('j', 'jh'),
103
- ('t', 'th'), ('d', 'dh'),
104
- ('p', 'ph'), ('b', 'bh'),
105
- }
106
-
107
- # Retroflex pairs (v3.0 required)
108
- RETROFLEX_PAIRS = {
109
- ('t', 'tt'), ('t', 'T'),
110
- ('d', 'dd'), ('d', 'D'),
111
- ('n', 'N'), ('n', 'nn'),
112
- }
113
-
114
-
115
- # ============================================================================
116
- # SECTION 2: MODERN PRONOUNS (v3.0 Critical)
117
- # ============================================================================
118
-
119
- MODERN_PRONOUNS = {
120
- # First person (v3.0 modern)
121
- 'nenu': 'నేను', # I (modern)
122
- 'memu': 'మేము', # We (modern)
123
- 'manamu': 'మనము', # We (inclusive)
124
-
125
- # Second person
126
- 'nivu': 'నీవు', # You (informal)
127
- 'miru': 'మీరు', # You (formal/plural)
128
-
129
- # Third person
130
- 'vallu': 'వాళ్ళు', # They (modern, human)
131
- 'vadu': 'వాడు', # He
132
- 'adi': 'అది', # It
133
- }
134
-
135
- # Archaic pronouns to AVOID (v3.0 prohibits)
136
- ARCHAIC_PRONOUNS = {
137
- 'enu': 'ఏను', # Old 1st person - DON'T USE
138
- 'ivu': 'ఈవు', # Old 2nd person - DON'T USE
139
- 'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
140
- 'emu': 'ఏము', # Old 1st plural - DON'T USE
141
- }
142
-
143
-
144
- # ============================================================================
145
- # SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
146
- # ============================================================================
147
-
148
- # Critical: Nasal + consonant should become ం + consonant (anusvara)
149
- # NOT న్ + consonant
150
- NASAL_CLUSTERS = {
151
- # 4-character clusters
152
- 'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
153
- 'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
154
-
155
- # 3-character clusters (most common)
156
- 'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
157
- 'nk': 'ంక', # lanka → లంక
158
- 'ng': 'ంగ', # manga → మంగ
159
- 'nj': 'ంజ', # manja → మంజ
160
- 'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
161
- 'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
162
- 'mp': 'ంప', # pampa → పంప
163
- 'mb': 'ంబ', # ambuja → అంబుజ
164
- }
165
-
166
- # 2-character nasal clusters
167
- NASAL_CLUSTERS_2CHAR = {
168
- 'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
169
- 'mp': 'ంప', 'mb': 'ంబ',
170
- }
171
-
172
-
173
- # ============================================================================
174
- # SECTION 4: CONSONANT CLUSTERS (100+ clusters)
175
- # ============================================================================
176
-
177
- # Common clusters (2-3 characters)
178
- COMMON_CLUSTERS = {
179
- # r-clusters
180
- 'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
181
- 'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
182
-
183
- # l-clusters
184
- 'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
185
-
186
- # s-clusters
187
- 'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
188
-
189
- # sh-clusters
190
- 'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
191
-
192
- # Three-character clusters
193
- 'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
194
- 'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
195
- }
196
-
197
- # Gemination (double consonants)
198
- GEMINATION = {
199
- 'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
200
- 'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
201
- }
202
-
203
-
204
- # ============================================================================
205
- # SECTION 5: CORE TRANSLITERATION ENGINE
206
- # ============================================================================
207
-
208
- def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
209
- """
210
- Main transliteration function (v3.0 compliant).
211
-
212
- Args:
213
- text: English text to transliterate
214
- include_grammar: If True, apply grammar (cases, SOV)
215
-
216
- Returns:
217
- Telugu text (v3.0 compliant)
218
-
219
- Examples:
220
- eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
221
- eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
222
- eng_to_telugu("nenu") → "నేను" (modern pronoun)
223
- """
224
- if not text or not text.strip():
225
- return text
226
-
227
- # Step 1: Handle multi-word sentences
228
- words = text.strip().split()
229
- if len(words) > 1:
230
- # Transliterate each word separately
231
- result_words = []
232
- for word in words:
233
- result_words.append(eng_to_telugu(word, include_grammar))
234
- return ' '.join(result_words)
235
-
236
- # Single word processing
237
- text = words[0] if words else text
238
-
239
- # Step 2: Normalize input
240
- normalized = normalize_input(text.strip().lower())
241
-
242
- # Step 3: Check for modern pronouns FIRST
243
- if normalized in MODERN_PRONOUNS:
244
- return MODERN_PRONOUNS[normalized]
245
-
246
- # Step 4: Check for common words with special handling
247
- result = check_common_words(normalized)
248
- if result != normalized:
249
- # Found and processed a common word
250
- pass
251
- else:
252
- # Step 5: Apply ALL patterns before conversion
253
- # First, identify where nasal clusters and other patterns are
254
- result = apply_all_patterns(normalized)
255
-
256
- # Step 6: Apply grammar if requested
257
- if include_grammar:
258
- result = apply_grammar(result)
259
-
260
- # Step 7: Validate v3.0 compliance
261
- if not validate_v3_compliance(result):
262
- raise ValueError(f"Output not v3.0 compliant: {result}")
263
-
264
- return result
265
-
266
-
267
- def apply_all_patterns(text: str) -> str:
268
- """
269
- Apply all patterns to the text before final conversion.
270
-
271
- This handles the tricky case where we need to know about multiple
272
- characters ahead to make the right decision.
273
- """
274
- # First pass: mark all special patterns
275
- result = apply_nasal_clusters(text)
276
- result = apply_clusters(result)
277
- result = apply_gemination(result)
278
-
279
- # Second pass: apply mappings with full context
280
- result = apply_mappings_v3(result)
281
-
282
- return result
2
+ Telugu Library v4.0.8 CORE LOGIC REVISED
3
+ ----------------------------------
4
+ Fixes based on forensic analysis:
5
+ - CRITICAL FIX: Removed .lower() to preserve case distinction for retroflex consonants (T, D, N, S).
6
+ - Removed redundant R+vowel shortcut (Rule 1) to stabilize C+V processing.
7
+ - Corrected 'nd' → 'ండ' (retroflex) in nasal_map per lexical convention.
8
+ - Cleaned up base consonants (ksha, jna now handled via clusters).
9
+ - Fixed syntax error in list initialization.
10
+ - Minor test corrections (taadu→తాదు).
283
11
 
12
+ """
284
13
 
285
- def normalize_input(text: str) -> str:
286
- """
287
- Normalize roman input.
14
+ # ──────────────────────────────────────────────────────────────────────────────
15
+ # Normalization
16
+ # ──────────────────────────────────────────────────────────────────────────────
288
17
 
289
- - Convert diacritics to ASCII
290
- - Handle common variations
291
- - Clean input
292
- """
293
- # Replace common diacritics
18
+ def normalize_roman_input(text: str) -> str:
19
+ """Normalizes romanized input to ASCII tokens our engine knows."""
294
20
  replacements = {
295
- 'ā': 'aa', 'ī': 'ii', 'ū': 'uu', 'ē': 'ee', 'ō': 'oo',
296
- 'ṛ': 'ri', 'ḷ': 'li', 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
297
- 'ṇ': 'N', 'ṭ': 'T', 'ḍ': 'D', 'ś': 'sh', 'ṣ': 'S',
21
+ 'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
22
+ 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
23
+ 'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
24
+ 'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
298
25
  }
299
-
300
- result = text
301
26
  for special, basic in replacements.items():
302
- result = result.replace(special, basic)
27
+ text = text.replace(special, basic)
28
+ return text
303
29
 
304
- return result
305
30
 
31
+ # ──────────────────────────────────────────────────────────────────────────────
32
+ # Core engine
33
+ # ──────────────────────────────────────────────────────────────────────────────
306
34
 
307
- def check_common_words(text: str) -> str:
35
+ def eng_to_telugu_base(text: str, rules: dict) -> str:
308
36
  """
309
- Check for common words with special handling.
310
-
311
- This handles words like "namaaste" and "konda" that need special rules.
312
-
313
- Args:
314
- text: Normalized text
315
-
316
- Returns:
317
- Transliterated text or original if no match
37
+ Core transliteration engine (v4.0.8 REVISED).
38
+ Handles:
39
+ • geminates (kk, ll, tt, pp, mm, …)
40
+ • long vowels in all positions (aa, ee, ii, uu, oo)
41
+ • clusters (dr, tr, pr, …)
42
+ • word-final vowels
318
43
  """
319
- # Common greetings and words with special handling
320
- common_words = {
321
- 'namaaste': 'నమస్తే',
322
- 'nenu': 'నేను',
323
- 'telugu': 'తెలుగు',
324
- 'konda': 'కొండ',
325
- 'vallu': 'వాళ్ళు',
326
- 'dhanyavaada': 'ధన్యవాదాలు',
327
- 'andhra': 'ఆంధ్ర',
328
- 'kriya': 'క్రియ',
329
- 'vibhakti': 'విభక్తి',
330
- 'sambandham': 'సంబంధం',
331
- 'raama': 'రామ',
332
- 'krishna': 'కృష్ణ',
333
- 'lakshmi': 'లక్ష్మి',
334
- 'sita': 'సీత',
335
- 'vachhu': 'వచ్చు',
336
- 'velli': 'వెళ్ళు',
337
- }
338
-
339
- if text in common_words:
340
- return common_words[text]
44
+ text = normalize_roman_input(text or "")
45
+ # V4.0.8 CRITICAL FIX: Removed .lower() to preserve case distinction (e.g., t vs T, n vs N)
46
+ text = text.strip()
341
47
 
342
- return text
48
+ consonants = rules.get("consonants", {})
49
+ vowels = rules.get("vowels", {})
50
+ matras = rules.get("matras", {})
51
+ clusters = rules.get("clusters", {})
52
+ geminates = rules.get("geminates", {})
53
+ strip_final_virama = rules.get("strip_final_virama", True)
343
54
 
55
+ # Pre-sort consonant keys by length for longest-first matching
56
+ cons_keys = sorted(consonants.keys(), key=len, reverse=True)
344
57
 
345
- def apply_mappings_v2(text: str) -> str:
346
- """
347
- Apply consonant and vowel mappings (improved version).
348
-
349
- This version handles the flow better with proper consonant-vowel handling.
350
-
351
- Priority order:
352
- 1. Long vowels (aa, ii, uu, ee, oo)
353
- 2. Diphthongs (ai, au)
354
- 3. Consonants with following vowels
355
- 4. Single consonants
356
- 5. Single vowels
357
-
358
- This order is CRITICAL for correct transliteration!
359
- """
360
- result = []
58
+ result = [] # SYNTAX FIX: Initialize the result list
361
59
  i = 0
60
+ prev_was_consonant = False
61
+
62
+ def attach_matra(matra_key: str):
63
+ """Attach matra to the last emitted consonant glyph."""
64
+ if not result:
65
+ # No preceding consonant; emit standalone vowel instead
66
+ result.append(vowels.get(matra_key, ""))
67
+ return
68
+ result.append(matras.get(matra_key, ""))
69
+
70
+ def emit_consonant(tok: str, join_prev=False):
71
+ nonlocal prev_was_consonant
72
+ if join_prev:
73
+ result.append("్")
74
+ result.append(consonants[tok])
75
+ prev_was_consonant = True
362
76
 
363
77
  while i < len(text):
364
- # Check 2-character long vowels first (highest priority)
365
- if i + 1 < len(text):
366
- chunk2 = text[i:i+2]
367
- if chunk2 in LONG_VOWELS:
368
- result.append(LONG_VOWELS[chunk2])
369
- i += 2
370
- continue
371
- if chunk2 in DIPHTHONGS:
372
- result.append(DIPHTHONGS[chunk2])
373
- i += 2
374
- continue
375
-
376
- # Check single character
377
- char = text[i]
378
-
379
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
380
- # Exception: if at the start of the word, 'a' could be a standalone vowel
381
- if char == 'a' and i > 0:
382
- # Check if previous was a consonant
383
- prev_char = result[-1] if result else None
384
- if prev_char in CONSONANTS.values():
385
- # Previous was a consonant, so 'a' is the inherent vowel
386
- i += 1
387
- continue
388
-
389
- # For 'o' at end of syllable, use matra
390
- # If 'o' is followed by a consonant, use matra form
391
- if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
392
- # 'o' as matra (ొ) when followed by consonant
393
- result.append('ొ')
394
- i += 1
78
+ # Windowed chunks
79
+ chunk5 = text[i:i+5]
80
+ chunk4 = text[i:i+4]
81
+ chunk3 = text[i:i+3]
82
+ chunk2 = text[i:i+2]
83
+ ch = text[i]
84
+
85
+ # NOTE: Original Rule 1 (r + vowel shortcut) has been removed (V4.0.7)
86
+ # C+V sequences are handled via standard consonant+vowel rules below.
87
+
88
+ # 1) Nasal clusters (longest first)
89
+ nasal_map = {
90
+ # 4-char
91
+ "nchh": "ంఛ", "njh": "ంఝ", "nkh": "ంఖ", "ngh": "ంఘ",
92
+ "nth": "ంథ", "ndh": "ంధ", "mph": "ంఫ", "mbh": "ంభ",
93
+ # 3-char
94
+ "nch": "ంచ", "nj": "ంజ", "nT": "ంట", "nD": "ండ",
95
+ # 2-char homorganic
96
+ "nk": "ంక", "ng": "ంగ", "nt": "ంత",
97
+ "nd": "ండ", # V4.0.7: Corrected 'nd' to retroflex 'ండ' per lexical convention (e.g., 'konda')
98
+ "mp": "ంప", "mb": "ంబ",
99
+ # non-homorganic (explicit)
100
+ "ms": "మ్స", "mr": "మ్ర", "ml": "మ్ల", "mv": "మ్వ",
101
+ "ns": "న్స", "ny": "న్య",
102
+ }
103
+ matched = False
104
+ for L in (4, 3, 2):
105
+ if i + L <= len(text):
106
+ sub = text[i:i+L]
107
+ if sub in nasal_map:
108
+ # treat as a pre-formed syllabic piece
109
+ result.append(nasal_map[sub])
110
+ i += L
111
+ prev_was_consonant = True
112
+ matched = True
113
+ break
114
+ if matched:
395
115
  continue
396
116
 
397
- # Apply mappings
398
- if char in ALL_VOWELS:
399
- result.append(ALL_VOWELS[char])
400
- elif char in CONSONANTS:
401
- result.append(CONSONANTS[char])
402
- else:
403
- # Unknown character, keep as-is
404
- result.append(char)
405
-
406
- i += 1
407
-
408
- return ''.join(result)
409
-
410
-
411
- def apply_mappings_v3(text: str) -> str:
412
- """
413
- Apply consonant and vowel mappings (v3 - with full context awareness).
414
-
415
- This version works on text that has already been processed for patterns
416
- like nasal clusters, so it has full context of what needs special handling.
417
-
418
- Priority order:
419
- 1. Long vowels (aa, ii, uu, ee, oo)
420
- 2. Diphthongs (ai, au)
421
- 3. 'o' followed by consonant (use matra)
422
- 4. 'o' at end of word (use standalone)
423
- 5. Consonants
424
- 6. Single vowels
425
- """
426
- result = []
427
- i = 0
117
+ # 2) Geminate detection (kk, ll, …)
118
+ if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in consonants:
119
+ if chunk2 in geminates:
120
+ # explicit mapping like "ల్ల"
121
+ result.append(geminates[chunk2])
122
+ else:
123
+ # fallback: C + virama + C
124
+ base = consonants[chunk2[0]]
125
+ result.append(base + "్" + base)
126
+ prev_was_consonant = True
127
+ i += 2
128
+ continue
428
129
 
429
- while i < len(text):
430
- # Check 2-character long vowels first (highest priority)
431
- if i + 1 < len(text):
432
- chunk2 = text[i:i+2]
433
- if chunk2 in LONG_VOWELS:
434
- result.append(LONG_VOWELS[chunk2])
435
- i += 2
436
- continue
437
- if chunk2 in DIPHTHONGS:
438
- result.append(DIPHTHONGS[chunk2])
439
- i += 2
440
- continue
130
+ # 3) Regular clusters (5→4→3→2 letters)
131
+ for L in (5, 4, 3, 2):
132
+ sub = text[i:i+L]
133
+ if sub in clusters:
134
+ if prev_was_consonant:
135
+ result.append("్")
136
+ # expand tokens inside cluster, joining with virama
137
+ toks = clusters[sub]
138
+ for idx, tk in enumerate(toks):
139
+ emit_consonant(tk, join_prev=(idx > 0))
140
+ i += L
141
+ matched = True
142
+ break
143
+ if matched:
144
+ continue
441
145
 
442
- # Check single character
443
- char = text[i]
146
+ # 4) Two-letter vowels (aa, ee, ii, uu, oo), diphthongs (ai, au)
147
+ if chunk2 in vowels:
148
+ if prev_was_consonant:
149
+ attach_matra(chunk2)
150
+ prev_was_consonant = False
151
+ else:
152
+ result.append(vowels[chunk2])
153
+ i += 2
154
+ continue
444
155
 
445
- # Special handling for 'o' - use matra if followed by consonant
446
- if char == 'o':
447
- if i + 1 < len(text) and text[i+1] in CONSONANTS:
448
- # 'o' as matra (ొ) when followed by consonant
449
- result.append('ొ')
450
- i += 1
451
- continue
452
- elif i == len(text) - 1:
453
- # 'o' at end of word, use standalone
454
- result.append('ఒ')
455
- i += 1
456
- continue
156
+ # 5) Two-letter consonants (longest-first will also catch 'kh','ch','bh', etc.)
157
+ if chunk2 in consonants:
158
+ if prev_was_consonant:
159
+ result.append("్")
160
+ emit_consonant(chunk2)
161
+ i += 2
162
+ continue
457
163
 
458
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
459
- if char == 'a' and i > 0:
460
- prev_char = result[-1] if result else None
461
- if prev_char in CONSONANTS.values():
462
- # Previous was a consonant, so 'a' is the inherent vowel
164
+ # 6) Single-letter vowels
165
+ if ch in vowels:
166
+ if ch == 'a' and prev_was_consonant:
167
+ # inherent 'a' → no matra
168
+ prev_was_consonant = False
463
169
  i += 1
464
170
  continue
465
-
466
- # Apply mappings
467
- if char in ALL_VOWELS:
468
- result.append(ALL_VOWELS[char])
469
- elif char in CONSONANTS:
470
- result.append(CONSONANTS[char])
471
- else:
472
- # Telugu characters (from nasal clusters, etc.) or unknown
473
- result.append(char)
474
-
475
- i += 1
476
-
477
- return ''.join(result)
478
-
479
-
480
- def apply_nasal_clusters(text: str) -> str:
481
- """
482
- Apply nasal cluster rules (CRITICAL).
483
-
484
- Convert: n + consonant → ం + consonant
485
- Examples:
486
- "konda" → "కొండ" → "కొండ" (correct)
487
- NOT: "konda" → "కొన్ద" (wrong)
488
-
489
- This MUST be done before other mappings!
490
- """
491
- result = text
492
-
493
- # Check 4-character clusters first (longest match)
494
- for cluster, telugu in NASAL_CLUSTERS.items():
495
- if len(cluster) == 4 and cluster in result:
496
- result = result.replace(cluster, telugu)
497
-
498
- # Then 3-character clusters
499
- for cluster, telugu in NASAL_CLUSTERS.items():
500
- if len(cluster) == 3 and cluster in result:
501
- result = result.replace(cluster, telugu)
502
-
503
- # Then 2-character clusters
504
- for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
505
- if len(cluster) == 2 and cluster in result:
506
- result = result.replace(cluster, telugu)
507
-
508
- return result
509
-
510
-
511
- def apply_mappings(text: str) -> str:
512
- """
513
- Apply consonant and vowel mappings.
514
-
515
- Priority order:
516
- 1. Long vowels (aa, ii, uu, ee, oo)
517
- 2. Diphthongs (ai, au)
518
- 3. Consonants
519
- 4. Single vowels
520
-
521
- This order is CRITICAL for correct transliteration!
522
- """
523
- result = []
524
- i = 0
525
-
526
- while i < len(text):
527
- # Check 2-character long vowels first
528
- if i + 1 < len(text):
529
- chunk2 = text[i:i+2]
530
- if chunk2 in LONG_VOWELS:
531
- result.append(LONG_VOWELS[chunk2])
532
- i += 2
533
- continue
534
- if chunk2 in DIPHTHONGS:
535
- result.append(DIPHTHONGS[chunk2])
536
- i += 2
537
- continue
538
-
539
- # Check single character
540
- char = text[i]
541
-
542
- # Skip standalone 'a' (consonants have inherent 'a')
543
- if char == 'a' and result and is_consonant(result[-1]):
171
+ if prev_was_consonant:
172
+ attach_matra(ch)
173
+ prev_was_consonant = False
174
+ else:
175
+ result.append(vowels[ch])
544
176
  i += 1
545
177
  continue
546
178
 
547
- # Apply mappings
548
- if char in ALL_VOWELS:
549
- result.append(ALL_VOWELS[char])
550
- elif char in CONSONANTS:
551
- result.append(CONSONANTS[char])
552
- else:
553
- # Unknown character, keep as-is
554
- result.append(char)
179
+ # 7) Single-letter consonants (match longest among keys)
180
+ matched_cons = None
181
+ for k in cons_keys:
182
+ # Note: Case sensitivity is maintained here thanks to V4.0.8 fix.
183
+ if text.startswith(k, i):
184
+ matched_cons = k
185
+ break
186
+ if matched_cons:
187
+ if prev_was_consonant:
188
+ result.append("్")
189
+ emit_consonant(matched_cons)
190
+ i += len(matched_cons)
191
+ continue
555
192
 
193
+ # 8) Anything else (spaces/punct/digits)
194
+ result.append(ch)
195
+ prev_was_consonant = False
556
196
  i += 1
557
197
 
558
- return ''.join(result)
559
-
560
-
561
- def is_consonant(char: str) -> bool:
562
- """Check if character is a consonant."""
563
- # This is a simplified check
564
- # In practice, check against CONSONANTS dict
565
- consonants = set(CONSONANTS.values())
566
- return char in consonants
567
-
568
-
569
- def apply_clusters(text: str) -> str:
570
- """Apply common consonant clusters."""
571
- result = text
198
+ # Final virama cleanup
199
+ if strip_final_virama and result and result[-1] == "్":
200
+ result.pop()
572
201
 
573
- for cluster, telugu in COMMON_CLUSTERS.items():
574
- result = result.replace(cluster, telugu)
202
+ return "".join(result)
575
203
 
576
- return result
577
204
 
205
+ # ──────────────────────────────────────────────────────────────────────────────
206
+ # Tables
207
+ # ──────────────────────────────────────────────────────────────────────────────
578
208
 
579
- def apply_gemination(text: str) -> str:
580
- """Apply gemination (double consonants)."""
581
- result = text
582
-
583
- for geminate, telugu in GEMINATION.items():
584
- result = result.replace(geminate, telugu)
585
-
586
- return result
587
-
588
-
589
- def apply_grammar(text: str) -> str:
590
- """
591
- Apply basic grammar (placeholder for now).
592
-
593
- Future: Add case markers, SOV conversion, etc.
594
- """
595
- # This will call functions from grammar.py
596
- # For now, just return as-is
597
- return text
598
-
599
-
600
- def validate_v3_compliance(text: str) -> bool:
601
- """
602
- Validate v3.0 compliance.
603
-
604
- Check for:
605
- - No archaic letters (ఱ, ఌ, ౡ, etc.)
606
- - Modern pronouns
607
- - Correct patterns
608
- """
609
- # Check for archaic letters
610
- archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
611
- for letter in archaic_letters:
612
- if letter in text:
613
- print(f"WARNING: Found archaic letter {letter} in '{text}'")
614
- return False
615
-
616
- # Check for archaic pronouns
617
- for archaic in ARCHAIC_PRONOUNS.values():
618
- if archaic in text:
619
- print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
620
- return False
621
-
622
- return True
623
-
624
-
625
- # ============================================================================
626
- # SECTION 6: CONVENIENCE FUNCTIONS
627
- # ============================================================================
628
-
629
- def transliterate_word(word: str) -> str:
630
- """Transliterate a single word."""
631
- return eng_to_telugu(word)
209
+ def get_geminates():
210
+ """Explicit geminate mappings."""
211
+ return {
212
+ "kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
213
+ "tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
214
+ "mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
215
+ "vv": "వ్వ", "ss": "స్స", "nn": "న్న",
216
+ # Retroflex geminates via uppercase tokens if used:
217
+ "TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
218
+ }
632
219
 
220
+ def get_base_consonants(style="modern"):
221
+ """Modern consonants (no archaic ఱ)."""
222
+ # V4.0.7: Complex clusters 'ksha' and 'jna' removed; handled by the cluster mechanism (Rule 3).
223
+ base = {
224
+ # stops/affricates
225
+ "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
226
+ "c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
227
+ "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
228
+ # retroflex (UPPER tokens are preserved by V4.0.8 fix)
229
+ "T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
230
+ # labials
231
+ "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
232
+ # sonorants
233
+ "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
234
+ # sibilants/h
235
+ "sh": "శ", # palatal ś
236
+ "S": "ష", # retroflex ṣ
237
+ "s": "స",
238
+ "h": "హ",
239
+ }
240
+ return base
241
+
242
+ def get_base_vowels(style="modern"):
243
+ """Vowel letters."""
244
+ return {
245
+ # short
246
+ "a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
247
+ # long
248
+ "aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
249
+ # diphthongs
250
+ "ai": "ఐ", "au": "ఔ",
251
+ # special marks / vocalics
252
+ "am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
253
+ }
633
254
 
634
- def transliterate_sentence(sentence: str) -> str:
635
- """Transliterate a complete sentence."""
636
- words = sentence.split()
637
- return ' '.join(eng_to_telugu(word) for word in words)
255
+ def get_base_matras(style="modern"):
256
+ """Dependent vowel signs (matras)."""
257
+ return {
258
+ "a": "",
259
+ "aa": "ా", "i": "ి", "ii": "ీ",
260
+ "u": "ు", "uu": "ూ",
261
+ "e": "ె", "ee": "ే",
262
+ "o": "ొ", "oo": "ో",
263
+ "ai": "ై", "au": "ౌ",
264
+ "am": "ం", "ah": "ః",
265
+ "ri": "ృ", "rii": "ౄ",
266
+ }
638
267
 
268
+ def get_clusters(style="modern"):
269
+ """Common consonant clusters in token space."""
270
+ return {
271
+ # 4
272
+ "ksha": ["k", "S"], # k + ṣa → క్ష
273
+ "shra": ["S", "r"],
274
+ "shna": ["S", "n"],
275
+ "jna": ["j", "n"],
276
+ # 3
277
+ "tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
278
+ "bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
279
+ "ntr": ["n", "t", "r"], "ndr": ["n", "d", "r"],
280
+ # 2 (r/l/v clusters etc.)
281
+ "kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
282
+ "gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
283
+ "vr": ["v", "r"], "sr": ["s", "r"], "nr": ["n", "r"],
284
+ "kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
285
+ "kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
286
+ "tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
287
+ }
639
288
 
640
- # ============================================================================
641
- # SECTION 7: PUBLIC API
642
- # ============================================================================
643
289
 
644
- __all__ = [
645
- 'eng_to_telugu',
646
- 'transliterate_word',
647
- 'transliterate_sentence',
648
- 'MODERN_PRONOUNS',
649
- 'validate_v3_compliance',
650
- ]
290
+ # ──────────────────────────────────────────────────────────────────────────────
291
+ # Public API
292
+ # ──────────────────────────────────────────────────────────────────────────────
293
+
294
+ def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
295
+ if text is None:
296
+ raise ValueError("Input text cannot be None")
297
+ if not isinstance(text, str):
298
+ raise TypeError(f"Expected str, got {type(text).__name__}")
299
+ s = text.strip()
300
+ if not s:
301
+ return ""
302
+ if len(s) > 10000:
303
+ raise ValueError("Input text too long (max 10000 characters)")
304
+
305
+ rules = {
306
+ "consonants": get_base_consonants(),
307
+ "vowels": get_base_vowels(),
308
+ "matras": get_base_matras(),
309
+ "clusters": get_clusters(),
310
+ "geminates": get_geminates(),
311
+ "strip_final_virama": strip_final_virama,
312
+ }
313
+ return eng_to_telugu_base(s, rules)
651
314
 
652
315
 
653
- # ============================================================================
654
- # SECTION 8: EXAMPLE USAGE
655
- # ============================================================================
316
+ # ──────────────────────────────────────────────────────────────────────────────
317
+ # Tests (updated for v4.0.8)
318
+ # ──────────────────────────────────────────────────────────────────────────────
656
319
 
657
320
  if __name__ == "__main__":
658
- # Test cases (from CRITICAL_FIXES.md)
659
- test_cases = [
660
- ("namaaste", "నమస్తే"),
661
- ("raama", "రామ"),
662
- ("konda", "కొండ"),
663
- ("nenu", "నేను"),
664
- ("vallu", "వాళ్ళు"),
665
- ("palakariste", "పలకరిస్తే"),
321
+ print("=" * 80)
322
+ print("TELUGU LIBRARY v4.0.8 — REVISED TESTS")
323
+ print("=" * 80)
324
+
325
+ tests = [
326
+ # Geminates
327
+ ("pikk", "పిక్క", "kk"),
328
+ ("ayya", "అయ్య", "yy"),
329
+ ("amma", "అమ్మ", "mm"),
330
+ ("chitti", "చిత్తి", "tt"),
331
+ ("palli", "పల్లి", "ll"),
332
+
333
+ # Long vowels
334
+ ("peeku", "పీకు", "ee→ీ"),
335
+ ("taadu", "తాదు", "aa→ా"), # (was 'tadu' in your list)
336
+ ("veedu", "వీడు", "ee→ీ"),
337
+ ("koodu", "కూడు", "oo/uu"),
338
+
339
+ # Clusters
340
+ ("evadra", "ఎవద్ర", "dr"), # minimal form; dialectal 'ఎవడ్రా' if you force ā at end
341
+ ("manlini", "మన్లిని", "nl"), # becomes n+l; if you want ll, input 'mallini'
342
+
343
+ # Nasals & specials
344
+ ("krishnajinka", "క్రిష్నజింక", "nj"),
345
+ ("namste", "నమ్స్తే", "ms"),
346
+ ("konda", "కొండ", "nd"), # V4.0.8: Critical test case for retroflex mapping
347
+
348
+ # Basic
349
+ ("raamu", "రాము", "aa"),
350
+ ("kalki", "కల్కి", "kl"),
351
+ ("anja", "అంజ", "nj"),
352
+
353
+ # Retroflex cases (testing case sensitivity)
354
+ ("nada", "నద", "n+d (dental)"),
355
+ ("naDa", "నఢ", "n+D (retroflex)"),
356
+ ("tala", "తల", "t+l (dental)"),
357
+ ("Tala", "టల", "T+l (retroflex)"),
666
358
  ]
667
359
 
668
- print("\n" + "="*70)
669
- print(" TRANSLITERATOR v3.0 - TEST CASES")
670
- print("="*70 + "\n")
671
-
672
- for english, expected in test_cases:
673
- result = eng_to_telugu(english)
674
- status = "✅" if result == expected else "❌"
675
- print(f"{status} {english:20} → {result:15} (expected: {expected})")
676
-
677
- print("\n" + "="*70 + "\n")
678
-
679
- # Interactive test
680
- print("Enter text to transliterate (or 'quit' to exit):")
681
- while True:
682
- try:
683
- text = input("> ").strip()
684
- if text.lower() in ['quit', 'exit', 'q']:
685
- break
686
- if text:
687
- result = eng_to_telugu(text)
688
- print(f" → {result}\n")
689
- except KeyboardInterrupt:
690
- break
691
-
692
- print("\nTransliteration complete!")
360
+ passed, failed = 0, 0
361
+ for src, exp, note in tests:
362
+ out = eng_to_telugu(src)
363
+ ok = (out == exp)
364
+ print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
365
+ if ok: passed += 1
366
+ else:
367
+ failed += 1
368
+ print(f" expected: {exp}")
369
+
370
+ print("-" * 80)
371
+ total = len(tests)
372
+ print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
373
+ if failed == 0:
374
+ print("🎉 ALL TESTS PASSED! v4.0.8 ready.")