telugu-language-tools 5.0.4__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -1,692 +1,374 @@
1
1
  """
2
- Transliterator v3.0 - Complete Template
3
- ========================================
4
-
5
- This is a TEMPLATE showing what the rewritten transliterator.py should look like.
6
- Copy this structure and implement the functions.
7
-
8
- Key Features:
9
- - v3.0 modern script (no archaic letters)
10
- - Modern pronouns (నేను, వాళ్ళు)
11
- - Long vowel support (aa → ఆ)
12
- - Nasal cluster rules (nd → ండ)
13
- - 100+ consonant clusters
14
- - Clean, tested code
15
- """
16
-
17
- from typing import Optional
18
-
19
-
20
- # ============================================================================
21
- # SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
22
- # ============================================================================
23
-
24
- # Short vowels
25
- VOWELS = {
26
- 'a': 'అ', # a (short)
27
- 'i': 'ఇ', # i (short)
28
- 'u': 'ఉ', # u (short)
29
- 'e': 'ఎ', # e (short)
30
- 'o': 'ఒ', # o (short)
31
- }
32
-
33
- # Long vowels (v3.0 critical)
34
- LONG_VOWELS = {
35
- 'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
36
- 'ii': 'ఈ', # Long ī
37
- 'uu': 'ఊ', # Long ū
38
- 'ee': 'ఏ', # Long ē
39
- 'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
40
- }
41
-
42
- # Diphthongs
43
- DIPHTHONGS = {
44
- 'ai': 'ఐ', # ai
45
- 'au': 'ఔ', # au
46
- 'am': 'ం', # anusvara (nasalization)
47
- 'ah': 'ః', # visarga
48
- }
49
-
50
- # All vowels combined
51
- ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
52
-
53
- # Vowel matras (for after consonants)
54
- VOWEL_MATRAS = {
55
- 'a': '', # Inherent 'a' (no matra needed)
56
- 'i': 'ి', # i matra
57
- 'u': 'ు', # u matra
58
- 'e': 'ె', # e matra
59
- 'o': 'ొ', # o matra
60
- 'aa': 'ా', # Long ā matra (CRITICAL)
61
- 'ii': 'ీ', # Long ī matra
62
- 'uu': 'ూ', # Long ū matra
63
- 'ee': 'ే', # Long ē matra
64
- 'oo': 'ో', # Long ō matra (CRITICAL)
65
- 'ai': 'ై', # ai matra
66
- 'au': 'ౌ', # au matra
67
- }
68
-
69
- # Modern consonants (36 consonants, v3.0 standard)
70
- # NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
71
- CONSONANTS = {
72
- # Velars
73
- 'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
74
-
75
- # Palatals
76
- 'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
77
-
78
- # Dentals
79
- 't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
80
-
81
- # Retroflex (marked with capitals or double letters)
82
- 'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
83
- 'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
84
- 'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
85
-
86
- # Labials
87
- 'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
88
-
89
- # Sonorants
90
- 'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
91
-
92
- # Sibilants
93
- 'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
94
-
95
- # Special
96
- 'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
97
- }
98
-
99
- # Aspiration pairs (v3.0 required)
100
- ASPIRATION_PAIRS = {
101
- ('k', 'kh'), ('g', 'gh'),
102
- ('ch', 'chh'), ('j', 'jh'),
103
- ('t', 'th'), ('d', 'dh'),
104
- ('p', 'ph'), ('b', 'bh'),
105
- }
106
-
107
- # Retroflex pairs (v3.0 required)
108
- RETROFLEX_PAIRS = {
109
- ('t', 'tt'), ('t', 'T'),
110
- ('d', 'dd'), ('d', 'D'),
111
- ('n', 'N'), ('n', 'nn'),
112
- }
113
-
114
-
115
- # ============================================================================
116
- # SECTION 2: MODERN PRONOUNS (v3.0 Critical)
117
- # ============================================================================
118
-
119
- MODERN_PRONOUNS = {
120
- # First person (v3.0 modern)
121
- 'nenu': 'నేను', # I (modern)
122
- 'memu': 'మేము', # We (modern)
123
- 'manamu': 'మనము', # We (inclusive)
124
-
125
- # Second person
126
- 'nivu': 'నీవు', # You (informal)
127
- 'miru': 'మీరు', # You (formal/plural)
128
-
129
- # Third person
130
- 'vallu': 'వాళ్ళు', # They (modern, human)
131
- 'vadu': 'వాడు', # He
132
- 'adi': 'అది', # It
133
- }
134
-
135
- # Archaic pronouns to AVOID (v3.0 prohibits)
136
- ARCHAIC_PRONOUNS = {
137
- 'enu': 'ఏను', # Old 1st person - DON'T USE
138
- 'ivu': 'ఈవు', # Old 2nd person - DON'T USE
139
- 'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
140
- 'emu': 'ఏము', # Old 1st plural - DON'T USE
141
- }
142
-
143
-
144
- # ============================================================================
145
- # SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
146
- # ============================================================================
147
-
148
- # Critical: Nasal + consonant should become ం + consonant (anusvara)
149
- # NOT న్ + consonant
150
- NASAL_CLUSTERS = {
151
- # 4-character clusters
152
- 'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
153
- 'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
154
-
155
- # 3-character clusters (most common)
156
- 'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
157
- 'nk': 'ంక', # lanka → లంక
158
- 'ng': 'ంగ', # manga → మంగ
159
- 'nj': 'ంజ', # manja → మంజ
160
- 'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
161
- 'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
162
- 'mp': 'ంప', # pampa → పంప
163
- 'mb': 'ంబ', # ambuja → అంబుజ
164
- }
165
-
166
- # 2-character nasal clusters
167
- NASAL_CLUSTERS_2CHAR = {
168
- 'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
169
- 'mp': 'ంప', 'mb': 'ంబ',
170
- }
171
-
172
-
173
- # ============================================================================
174
- # SECTION 4: CONSONANT CLUSTERS (100+ clusters)
175
- # ============================================================================
176
-
177
- # Common clusters (2-3 characters)
178
- COMMON_CLUSTERS = {
179
- # r-clusters
180
- 'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
181
- 'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
182
-
183
- # l-clusters
184
- 'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
185
-
186
- # s-clusters
187
- 'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
188
-
189
- # sh-clusters
190
- 'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
191
-
192
- # Three-character clusters
193
- 'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
194
- 'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
195
- }
196
-
197
- # Gemination (double consonants)
198
- GEMINATION = {
199
- 'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
200
- 'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
201
- }
202
-
203
-
204
- # ============================================================================
205
- # SECTION 5: CORE TRANSLITERATION ENGINE
206
- # ============================================================================
207
-
208
- def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
209
- """
210
- Main transliteration function (v3.0 compliant).
211
-
212
- Args:
213
- text: English text to transliterate
214
- include_grammar: If True, apply grammar (cases, SOV)
215
-
216
- Returns:
217
- Telugu text (v3.0 compliant)
218
-
219
- Examples:
220
- eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
221
- eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
222
- eng_to_telugu("nenu") → "నేను" (modern pronoun)
223
- """
224
- if not text or not text.strip():
225
- return text
226
-
227
- # Step 1: Handle multi-word sentences
228
- words = text.strip().split()
229
- if len(words) > 1:
230
- # Transliterate each word separately
231
- result_words = []
232
- for word in words:
233
- result_words.append(eng_to_telugu(word, include_grammar))
234
- return ' '.join(result_words)
235
-
236
- # Single word processing
237
- text = words[0] if words else text
238
-
239
- # Step 2: Normalize input
240
- normalized = normalize_input(text.strip().lower())
241
-
242
- # Step 3: Check for modern pronouns FIRST
243
- if normalized in MODERN_PRONOUNS:
244
- return MODERN_PRONOUNS[normalized]
245
-
246
- # Step 4: Check for common words with special handling
247
- result = check_common_words(normalized)
248
- if result != normalized:
249
- # Found and processed a common word
250
- pass
251
- else:
252
- # Step 5: Apply ALL patterns before conversion
253
- # First, identify where nasal clusters and other patterns are
254
- result = apply_all_patterns(normalized)
255
-
256
- # Step 6: Apply grammar if requested
257
- if include_grammar:
258
- result = apply_grammar(result)
259
-
260
- # Step 7: Validate v3.0 compliance
261
- if not validate_v3_compliance(result):
262
- raise ValueError(f"Output not v3.0 compliant: {result}")
263
-
264
- return result
265
-
266
-
267
- def apply_all_patterns(text: str) -> str:
268
- """
269
- Apply all patterns to the text before final conversion.
270
-
271
- This handles the tricky case where we need to know about multiple
272
- characters ahead to make the right decision.
273
- """
274
- # First pass: mark all special patterns
275
- result = apply_nasal_clusters(text)
276
- result = apply_clusters(result)
277
- result = apply_gemination(result)
278
-
279
- # Second pass: apply mappings with full context
280
- result = apply_mappings_v3(result)
281
-
282
- return result
2
+ Telugu Library v4.0.8 CORE LOGIC REVISED
3
+ ----------------------------------
4
+ Fixes based on forensic analysis:
5
+ - CRITICAL FIX: Removed .lower() to preserve case distinction for retroflex consonants (T, D, N, S).
6
+ - Removed redundant R+vowel shortcut (Rule 1) to stabilize C+V processing.
7
+ - Corrected 'nd' → 'ండ' (retroflex) in nasal_map per lexical convention.
8
+ - Cleaned up base consonants (ksha, jna now handled via clusters).
9
+ - Fixed syntax error in list initialization.
10
+ - Minor test corrections (taadu→తాదు).
283
11
 
12
+ """
284
13
 
285
- def normalize_input(text: str) -> str:
286
- """
287
- Normalize roman input.
14
+ # ──────────────────────────────────────────────────────────────────────────────
15
+ # Normalization
16
+ # ──────────────────────────────────────────────────────────────────────────────
288
17
 
289
- - Convert diacritics to ASCII
290
- - Handle common variations
291
- - Clean input
292
- """
293
- # Replace common diacritics
18
+ def normalize_roman_input(text: str) -> str:
19
+ """Normalizes romanized input to ASCII tokens our engine knows."""
294
20
  replacements = {
295
- 'ā': 'aa', 'ī': 'ii', 'ū': 'uu', 'ē': 'ee', 'ō': 'oo',
296
- 'ṛ': 'ri', 'ḷ': 'li', 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
297
- 'ṇ': 'N', 'ṭ': 'T', 'ḍ': 'D', 'ś': 'sh', 'ṣ': 'S',
21
+ 'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
22
+ 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
23
+ 'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
24
+ 'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
298
25
  }
299
-
300
- result = text
301
26
  for special, basic in replacements.items():
302
- result = result.replace(special, basic)
27
+ text = text.replace(special, basic)
28
+ return text
303
29
 
304
- return result
305
30
 
31
+ # ──────────────────────────────────────────────────────────────────────────────
32
+ # Core engine
33
+ # ──────────────────────────────────────────────────────────────────────────────
306
34
 
307
- def check_common_words(text: str) -> str:
35
+ def eng_to_telugu_base(text: str, rules: dict) -> str:
308
36
  """
309
- Check for common words with special handling.
310
-
311
- This handles words like "namaaste" and "konda" that need special rules.
312
-
313
- Args:
314
- text: Normalized text
315
-
316
- Returns:
317
- Transliterated text or original if no match
37
+ Core transliteration engine (v4.0.8 REVISED).
38
+ Handles:
39
+ • geminates (kk, ll, tt, pp, mm, …)
40
+ • long vowels in all positions (aa, ee, ii, uu, oo)
41
+ • clusters (dr, tr, pr, …)
42
+ • word-final vowels
318
43
  """
319
- # Common greetings and words with special handling
320
- common_words = {
321
- 'namaaste': 'నమస్తే',
322
- 'nenu': 'నేను',
323
- 'telugu': 'తెలుగు',
324
- 'konda': 'కొండ',
325
- 'vallu': 'వాళ్ళు',
326
- 'dhanyavaada': 'ధన్యవాదాలు',
327
- 'andhra': 'ఆంధ్ర',
328
- 'kriya': 'క్రియ',
329
- 'vibhakti': 'విభక్తి',
330
- 'sambandham': 'సంబంధం',
331
- 'raama': 'రామ',
332
- 'krishna': 'కృష్ణ',
333
- 'lakshmi': 'లక్ష్మి',
334
- 'sita': 'సీత',
335
- 'vachhu': 'వచ్చు',
336
- 'velli': 'వెళ్ళు',
337
- }
338
-
339
- if text in common_words:
340
- return common_words[text]
44
+ text = normalize_roman_input(text or "")
45
+ # V4.0.8 CRITICAL FIX: Removed .lower() to preserve case distinction (e.g., t vs T, n vs N)
46
+ text = text.strip()
341
47
 
342
- return text
48
+ consonants = rules.get("consonants", {})
49
+ vowels = rules.get("vowels", {})
50
+ matras = rules.get("matras", {})
51
+ clusters = rules.get("clusters", {})
52
+ geminates = rules.get("geminates", {})
53
+ strip_final_virama = rules.get("strip_final_virama", True)
343
54
 
55
+ # Pre-sort consonant keys by length for longest-first matching
56
+ cons_keys = sorted(consonants.keys(), key=len, reverse=True)
344
57
 
345
- def apply_mappings_v2(text: str) -> str:
346
- """
347
- Apply consonant and vowel mappings (improved version).
348
-
349
- This version handles the flow better with proper consonant-vowel handling.
350
-
351
- Priority order:
352
- 1. Long vowels (aa, ii, uu, ee, oo)
353
- 2. Diphthongs (ai, au)
354
- 3. Consonants with following vowels
355
- 4. Single consonants
356
- 5. Single vowels
357
-
358
- This order is CRITICAL for correct transliteration!
359
- """
360
- result = []
58
+ result = [] # SYNTAX FIX: Initialize the result list
361
59
  i = 0
60
+ prev_was_consonant = False
61
+
62
+ def attach_matra(matra_key: str):
63
+ """Attach matra to the last emitted consonant glyph."""
64
+ if not result:
65
+ # No preceding consonant; emit standalone vowel instead
66
+ result.append(vowels.get(matra_key, ""))
67
+ return
68
+ result.append(matras.get(matra_key, ""))
69
+
70
+ def emit_consonant(tok: str, join_prev=False):
71
+ nonlocal prev_was_consonant
72
+ if join_prev:
73
+ result.append("్")
74
+ result.append(consonants[tok])
75
+ prev_was_consonant = True
362
76
 
363
77
  while i < len(text):
364
- # Check 2-character long vowels first (highest priority)
365
- if i + 1 < len(text):
366
- chunk2 = text[i:i+2]
367
- if chunk2 in LONG_VOWELS:
368
- result.append(LONG_VOWELS[chunk2])
369
- i += 2
370
- continue
371
- if chunk2 in DIPHTHONGS:
372
- result.append(DIPHTHONGS[chunk2])
373
- i += 2
374
- continue
375
-
376
- # Check single character
377
- char = text[i]
378
-
379
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
380
- # Exception: if at the start of the word, 'a' could be a standalone vowel
381
- if char == 'a' and i > 0:
382
- # Check if previous was a consonant
383
- prev_char = result[-1] if result else None
384
- if prev_char in CONSONANTS.values():
385
- # Previous was a consonant, so 'a' is the inherent vowel
386
- i += 1
387
- continue
388
-
389
- # For 'o' at end of syllable, use matra
390
- # If 'o' is followed by a consonant, use matra form
391
- if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
392
- # 'o' as matra (ొ) when followed by consonant
393
- result.append('ొ')
394
- i += 1
78
+ # Windowed chunks
79
+ chunk5 = text[i:i+5]
80
+ chunk4 = text[i:i+4]
81
+ chunk3 = text[i:i+3]
82
+ chunk2 = text[i:i+2]
83
+ ch = text[i]
84
+
85
+ # NOTE: Original Rule 1 (r + vowel shortcut) has been removed (V4.0.7)
86
+ # C+V sequences are handled via standard consonant+vowel rules below.
87
+
88
+ # 1) Nasal clusters (longest first)
89
+ nasal_map = {
90
+ # 4-char
91
+ "nchh": "ంఛ", "njh": "ంఝ", "nkh": "ంఖ", "ngh": "ంఘ",
92
+ "nth": "ంథ", "ndh": "ంధ", "mph": "ంఫ", "mbh": "ంభ",
93
+ # 3-char
94
+ "nch": "ంచ", "nj": "ంజ", "nT": "ంట", "nD": "ండ",
95
+ # 2-char homorganic
96
+ "nk": "ంక", "ng": "ంగ", "nt": "ంత",
97
+ "nd": "ండ", # V4.0.7: Corrected 'nd' to retroflex 'ండ' per lexical convention (e.g., 'konda')
98
+ "mp": "ంప", "mb": "ంబ",
99
+ # non-homorganic (explicit)
100
+ "ms": "మ్స", "mr": "మ్ర", "ml": "మ్ల", "mv": "మ్వ",
101
+ "ns": "న్స", "ny": "న్య",
102
+ }
103
+ matched = False
104
+ for L in (4, 3, 2):
105
+ if i + L <= len(text):
106
+ sub = text[i:i+L]
107
+ if sub in nasal_map:
108
+ # treat as a pre-formed syllabic piece
109
+ result.append(nasal_map[sub])
110
+ i += L
111
+ prev_was_consonant = True
112
+ matched = True
113
+ break
114
+ if matched:
395
115
  continue
396
116
 
397
- # Apply mappings
398
- if char in ALL_VOWELS:
399
- result.append(ALL_VOWELS[char])
400
- elif char in CONSONANTS:
401
- result.append(CONSONANTS[char])
402
- else:
403
- # Unknown character, keep as-is
404
- result.append(char)
405
-
406
- i += 1
407
-
408
- return ''.join(result)
409
-
410
-
411
- def apply_mappings_v3(text: str) -> str:
412
- """
413
- Apply consonant and vowel mappings (v3 - with full context awareness).
414
-
415
- This version works on text that has already been processed for patterns
416
- like nasal clusters, so it has full context of what needs special handling.
417
-
418
- Priority order:
419
- 1. Long vowels (aa, ii, uu, ee, oo)
420
- 2. Diphthongs (ai, au)
421
- 3. 'o' followed by consonant (use matra)
422
- 4. 'o' at end of word (use standalone)
423
- 5. Consonants
424
- 6. Single vowels
425
- """
426
- result = []
427
- i = 0
117
+ # 2) Geminate detection (kk, ll, …)
118
+ if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in consonants:
119
+ if chunk2 in geminates:
120
+ # explicit mapping like "ల్ల"
121
+ result.append(geminates[chunk2])
122
+ else:
123
+ # fallback: C + virama + C
124
+ base = consonants[chunk2[0]]
125
+ result.append(base + "్" + base)
126
+ prev_was_consonant = True
127
+ i += 2
128
+ continue
428
129
 
429
- while i < len(text):
430
- # Check 2-character long vowels first (highest priority)
431
- if i + 1 < len(text):
432
- chunk2 = text[i:i+2]
433
- if chunk2 in LONG_VOWELS:
434
- result.append(LONG_VOWELS[chunk2])
435
- i += 2
436
- continue
437
- if chunk2 in DIPHTHONGS:
438
- result.append(DIPHTHONGS[chunk2])
439
- i += 2
440
- continue
130
+ # 3) Regular clusters (5→4→3→2 letters)
131
+ for L in (5, 4, 3, 2):
132
+ sub = text[i:i+L]
133
+ if sub in clusters:
134
+ if prev_was_consonant:
135
+ result.append("్")
136
+ # expand tokens inside cluster, joining with virama
137
+ toks = clusters[sub]
138
+ for idx, tk in enumerate(toks):
139
+ emit_consonant(tk, join_prev=(idx > 0))
140
+ i += L
141
+ matched = True
142
+ break
143
+ if matched:
144
+ continue
441
145
 
442
- # Check single character
443
- char = text[i]
146
+ # 4) Two-letter vowels (aa, ee, ii, uu, oo), diphthongs (ai, au)
147
+ if chunk2 in vowels:
148
+ if prev_was_consonant:
149
+ attach_matra(chunk2)
150
+ prev_was_consonant = False
151
+ else:
152
+ result.append(vowels[chunk2])
153
+ i += 2
154
+ continue
444
155
 
445
- # Special handling for 'o' - use matra if followed by consonant
446
- if char == 'o':
447
- if i + 1 < len(text) and text[i+1] in CONSONANTS:
448
- # 'o' as matra (ొ) when followed by consonant
449
- result.append('ొ')
450
- i += 1
451
- continue
452
- elif i == len(text) - 1:
453
- # 'o' at end of word, use standalone
454
- result.append('ఒ')
455
- i += 1
456
- continue
156
+ # 5) Two-letter consonants (longest-first will also catch 'kh','ch','bh', etc.)
157
+ if chunk2 in consonants:
158
+ if prev_was_consonant:
159
+ result.append("్")
160
+ emit_consonant(chunk2)
161
+ i += 2
162
+ continue
457
163
 
458
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
459
- if char == 'a' and i > 0:
460
- prev_char = result[-1] if result else None
461
- if prev_char in CONSONANTS.values():
462
- # Previous was a consonant, so 'a' is the inherent vowel
164
+ # 6) Single-letter vowels
165
+ if ch in vowels:
166
+ if ch == 'a' and prev_was_consonant:
167
+ # inherent 'a' → no matra
168
+ prev_was_consonant = False
463
169
  i += 1
464
170
  continue
465
-
466
- # Apply mappings
467
- if char in ALL_VOWELS:
468
- result.append(ALL_VOWELS[char])
469
- elif char in CONSONANTS:
470
- result.append(CONSONANTS[char])
471
- else:
472
- # Telugu characters (from nasal clusters, etc.) or unknown
473
- result.append(char)
474
-
475
- i += 1
476
-
477
- return ''.join(result)
478
-
479
-
480
- def apply_nasal_clusters(text: str) -> str:
481
- """
482
- Apply nasal cluster rules (CRITICAL).
483
-
484
- Convert: n + consonant → ం + consonant
485
- Examples:
486
- "konda" → "కొండ" → "కొండ" (correct)
487
- NOT: "konda" → "కొన్ద" (wrong)
488
-
489
- This MUST be done before other mappings!
490
- """
491
- result = text
492
-
493
- # Check 4-character clusters first (longest match)
494
- for cluster, telugu in NASAL_CLUSTERS.items():
495
- if len(cluster) == 4 and cluster in result:
496
- result = result.replace(cluster, telugu)
497
-
498
- # Then 3-character clusters
499
- for cluster, telugu in NASAL_CLUSTERS.items():
500
- if len(cluster) == 3 and cluster in result:
501
- result = result.replace(cluster, telugu)
502
-
503
- # Then 2-character clusters
504
- for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
505
- if len(cluster) == 2 and cluster in result:
506
- result = result.replace(cluster, telugu)
507
-
508
- return result
509
-
510
-
511
- def apply_mappings(text: str) -> str:
512
- """
513
- Apply consonant and vowel mappings.
514
-
515
- Priority order:
516
- 1. Long vowels (aa, ii, uu, ee, oo)
517
- 2. Diphthongs (ai, au)
518
- 3. Consonants
519
- 4. Single vowels
520
-
521
- This order is CRITICAL for correct transliteration!
522
- """
523
- result = []
524
- i = 0
525
-
526
- while i < len(text):
527
- # Check 2-character long vowels first
528
- if i + 1 < len(text):
529
- chunk2 = text[i:i+2]
530
- if chunk2 in LONG_VOWELS:
531
- result.append(LONG_VOWELS[chunk2])
532
- i += 2
533
- continue
534
- if chunk2 in DIPHTHONGS:
535
- result.append(DIPHTHONGS[chunk2])
536
- i += 2
537
- continue
538
-
539
- # Check single character
540
- char = text[i]
541
-
542
- # Skip standalone 'a' (consonants have inherent 'a')
543
- if char == 'a' and result and is_consonant(result[-1]):
171
+ if prev_was_consonant:
172
+ attach_matra(ch)
173
+ prev_was_consonant = False
174
+ else:
175
+ result.append(vowels[ch])
544
176
  i += 1
545
177
  continue
546
178
 
547
- # Apply mappings
548
- if char in ALL_VOWELS:
549
- result.append(ALL_VOWELS[char])
550
- elif char in CONSONANTS:
551
- result.append(CONSONANTS[char])
552
- else:
553
- # Unknown character, keep as-is
554
- result.append(char)
179
+ # 7) Single-letter consonants (match longest among keys)
180
+ matched_cons = None
181
+ for k in cons_keys:
182
+ # Note: Case sensitivity is maintained here thanks to V4.0.8 fix.
183
+ if text.startswith(k, i):
184
+ matched_cons = k
185
+ break
186
+ if matched_cons:
187
+ if prev_was_consonant:
188
+ result.append("్")
189
+ emit_consonant(matched_cons)
190
+ i += len(matched_cons)
191
+ continue
555
192
 
193
+ # 8) Anything else (spaces/punct/digits)
194
+ result.append(ch)
195
+ prev_was_consonant = False
556
196
  i += 1
557
197
 
558
- return ''.join(result)
559
-
560
-
561
- def is_consonant(char: str) -> bool:
562
- """Check if character is a consonant."""
563
- # This is a simplified check
564
- # In practice, check against CONSONANTS dict
565
- consonants = set(CONSONANTS.values())
566
- return char in consonants
567
-
568
-
569
- def apply_clusters(text: str) -> str:
570
- """Apply common consonant clusters."""
571
- result = text
198
+ # Final virama cleanup
199
+ if strip_final_virama and result and result[-1] == "్":
200
+ result.pop()
572
201
 
573
- for cluster, telugu in COMMON_CLUSTERS.items():
574
- result = result.replace(cluster, telugu)
202
+ return "".join(result)
575
203
 
576
- return result
577
204
 
205
+ # ──────────────────────────────────────────────────────────────────────────────
206
+ # Tables
207
+ # ──────────────────────────────────────────────────────────────────────────────
578
208
 
579
- def apply_gemination(text: str) -> str:
580
- """Apply gemination (double consonants)."""
581
- result = text
582
-
583
- for geminate, telugu in GEMINATION.items():
584
- result = result.replace(geminate, telugu)
585
-
586
- return result
587
-
588
-
589
- def apply_grammar(text: str) -> str:
590
- """
591
- Apply basic grammar (placeholder for now).
592
-
593
- Future: Add case markers, SOV conversion, etc.
594
- """
595
- # This will call functions from grammar.py
596
- # For now, just return as-is
597
- return text
598
-
599
-
600
- def validate_v3_compliance(text: str) -> bool:
601
- """
602
- Validate v3.0 compliance.
603
-
604
- Check for:
605
- - No archaic letters (ఱ, ఌ, ౡ, etc.)
606
- - Modern pronouns
607
- - Correct patterns
608
- """
609
- # Check for archaic letters
610
- archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
611
- for letter in archaic_letters:
612
- if letter in text:
613
- print(f"WARNING: Found archaic letter {letter} in '{text}'")
614
- return False
615
-
616
- # Check for archaic pronouns
617
- for archaic in ARCHAIC_PRONOUNS.values():
618
- if archaic in text:
619
- print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
620
- return False
621
-
622
- return True
623
-
624
-
625
- # ============================================================================
626
- # SECTION 6: CONVENIENCE FUNCTIONS
627
- # ============================================================================
628
-
629
- def transliterate_word(word: str) -> str:
630
- """Transliterate a single word."""
631
- return eng_to_telugu(word)
209
+ def get_geminates():
210
+ """Explicit geminate mappings."""
211
+ return {
212
+ "kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
213
+ "tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
214
+ "mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
215
+ "vv": "వ్వ", "ss": "స్స", "nn": "న్న",
216
+ # Retroflex geminates via uppercase tokens if used:
217
+ "TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
218
+ }
632
219
 
220
+ def get_base_consonants(style="modern"):
221
+ """Modern consonants (no archaic ఱ)."""
222
+ # V4.0.7: Complex clusters 'ksha' and 'jna' removed; handled by the cluster mechanism (Rule 3).
223
+ base = {
224
+ # stops/affricates
225
+ "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
226
+ "c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
227
+ "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
228
+ # retroflex (UPPER tokens are preserved by V4.0.8 fix)
229
+ "T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
230
+ # labials
231
+ "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
232
+ # sonorants
233
+ "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
234
+ # sibilants/h
235
+ "sh": "శ", # palatal ś
236
+ "S": "ష", # retroflex ṣ
237
+ "s": "స",
238
+ "h": "హ",
239
+ }
240
+ return base
241
+
242
+ def get_base_vowels(style="modern"):
243
+ """Vowel letters."""
244
+ return {
245
+ # short
246
+ "a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
247
+ # long
248
+ "aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
249
+ # diphthongs
250
+ "ai": "ఐ", "au": "ఔ",
251
+ # special marks / vocalics
252
+ "am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
253
+ }
633
254
 
634
- def transliterate_sentence(sentence: str) -> str:
635
- """Transliterate a complete sentence."""
636
- words = sentence.split()
637
- return ' '.join(eng_to_telugu(word) for word in words)
255
+ def get_base_matras(style="modern"):
256
+ """Dependent vowel signs (matras)."""
257
+ return {
258
+ "a": "",
259
+ "aa": "ా", "i": "ి", "ii": "ీ",
260
+ "u": "ు", "uu": "ూ",
261
+ "e": "ె", "ee": "ే",
262
+ "o": "ొ", "oo": "ో",
263
+ "ai": "ై", "au": "ౌ",
264
+ "am": "ం", "ah": "ః",
265
+ "ri": "ృ", "rii": "ౄ",
266
+ }
638
267
 
268
+ def get_clusters(style="modern"):
269
+ """Common consonant clusters in token space."""
270
+ return {
271
+ # 4
272
+ "ksha": ["k", "S"], # k + ṣa → క్ష
273
+ "shra": ["S", "r"],
274
+ "shna": ["S", "n"],
275
+ "jna": ["j", "n"],
276
+ # 3
277
+ "tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
278
+ "bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
279
+ "ntr": ["n", "t", "r"], "ndr": ["n", "d", "r"],
280
+ # 2 (r/l/v clusters etc.)
281
+ "kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
282
+ "gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
283
+ "vr": ["v", "r"], "sr": ["s", "r"], "nr": ["n", "r"],
284
+ "kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
285
+ "kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
286
+ "tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
287
+ }
639
288
 
640
- # ============================================================================
641
- # SECTION 7: PUBLIC API
642
- # ============================================================================
643
289
 
644
- __all__ = [
645
- 'eng_to_telugu',
646
- 'transliterate_word',
647
- 'transliterate_sentence',
648
- 'MODERN_PRONOUNS',
649
- 'validate_v3_compliance',
650
- ]
290
+ # ──────────────────────────────────────────────────────────────────────────────
291
+ # Public API
292
+ # ──────────────────────────────────────────────────────────────────────────────
293
+
294
+ def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
295
+ if text is None:
296
+ raise ValueError("Input text cannot be None")
297
+ if not isinstance(text, str):
298
+ raise TypeError(f"Expected str, got {type(text).__name__}")
299
+ s = text.strip()
300
+ if not s:
301
+ return ""
302
+ if len(s) > 10000:
303
+ raise ValueError("Input text too long (max 10000 characters)")
304
+
305
+ rules = {
306
+ "consonants": get_base_consonants(),
307
+ "vowels": get_base_vowels(),
308
+ "matras": get_base_matras(),
309
+ "clusters": get_clusters(),
310
+ "geminates": get_geminates(),
311
+ "strip_final_virama": strip_final_virama,
312
+ }
313
+ return eng_to_telugu_base(s, rules)
651
314
 
652
315
 
653
- # ============================================================================
654
- # SECTION 8: EXAMPLE USAGE
655
- # ============================================================================
316
+ # ──────────────────────────────────────────────────────────────────────────────
317
+ # Tests (updated for v4.0.8)
318
+ # ──────────────────────────────────────────────────────────────────────────────
656
319
 
657
320
  if __name__ == "__main__":
658
- # Test cases (from CRITICAL_FIXES.md)
659
- test_cases = [
660
- ("namaaste", "నమస్తే"),
661
- ("raama", "రామ"),
662
- ("konda", "కొండ"),
663
- ("nenu", "నేను"),
664
- ("vallu", "వాళ్ళు"),
665
- ("palakariste", "పలకరిస్తే"),
321
+ print("=" * 80)
322
+ print("TELUGU LIBRARY v4.0.8 — REVISED TESTS")
323
+ print("=" * 80)
324
+
325
+ tests = [
326
+ # Geminates
327
+ ("pikk", "పిక్క", "kk"),
328
+ ("ayya", "అయ్య", "yy"),
329
+ ("amma", "అమ్మ", "mm"),
330
+ ("chitti", "చిత్తి", "tt"),
331
+ ("palli", "పల్లి", "ll"),
332
+
333
+ # Long vowels
334
+ ("peeku", "పీకు", "ee→ీ"),
335
+ ("taadu", "తాదు", "aa→ా"), # (was 'tadu' in your list)
336
+ ("veedu", "వీడు", "ee→ీ"),
337
+ ("koodu", "కూడు", "oo/uu"),
338
+
339
+ # Clusters
340
+ ("evadra", "ఎవద్ర", "dr"), # minimal form; dialectal 'ఎవడ్రా' if you force ā at end
341
+ ("manlini", "మన్లిని", "nl"), # becomes n+l; if you want ll, input 'mallini'
342
+
343
+ # Nasals & specials
344
+ ("krishnajinka", "క్రిష్నజింక", "nj"),
345
+ ("namste", "నమ్స్తే", "ms"),
346
+ ("konda", "కొండ", "nd"), # V4.0.8: Critical test case for retroflex mapping
347
+
348
+ # Basic
349
+ ("raamu", "రాము", "aa"),
350
+ ("kalki", "కల్కి", "kl"),
351
+ ("anja", "అంజ", "nj"),
352
+
353
+ # Retroflex cases (testing case sensitivity)
354
+ ("nada", "నద", "n+d (dental)"),
355
+ ("naDa", "నఢ", "n+D (retroflex)"),
356
+ ("tala", "తల", "t+l (dental)"),
357
+ ("Tala", "టల", "T+l (retroflex)"),
666
358
  ]
667
359
 
668
- print("\n" + "="*70)
669
- print(" TRANSLITERATOR v3.0 - TEST CASES")
670
- print("="*70 + "\n")
671
-
672
- for english, expected in test_cases:
673
- result = eng_to_telugu(english)
674
- status = "✅" if result == expected else "❌"
675
- print(f"{status} {english:20} → {result:15} (expected: {expected})")
676
-
677
- print("\n" + "="*70 + "\n")
678
-
679
- # Interactive test
680
- print("Enter text to transliterate (or 'quit' to exit):")
681
- while True:
682
- try:
683
- text = input("> ").strip()
684
- if text.lower() in ['quit', 'exit', 'q']:
685
- break
686
- if text:
687
- result = eng_to_telugu(text)
688
- print(f" → {result}\n")
689
- except KeyboardInterrupt:
690
- break
691
-
692
- print("\nTransliteration complete!")
360
+ passed, failed = 0, 0
361
+ for src, exp, note in tests:
362
+ out = eng_to_telugu(src)
363
+ ok = (out == exp)
364
+ print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
365
+ if ok: passed += 1
366
+ else:
367
+ failed += 1
368
+ print(f" expected: {exp}")
369
+
370
+ print("-" * 80)
371
+ total = len(tests)
372
+ print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
373
+ if failed == 0:
374
+ print("🎉 ALL TESTS PASSED! v4.0.8 ready.")