telugu-language-tools 5.0.4__py3-none-any.whl → 5.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -1,692 +1,344 @@
1
1
  """
2
- Transliterator v3.0 - Complete Template
3
- ========================================
4
-
5
- This is a TEMPLATE showing what the rewritten transliterator.py should look like.
6
- Copy this structure and implement the functions.
7
-
8
- Key Features:
9
- - v3.0 modern script (no archaic letters)
10
- - Modern pronouns (నేను, వాళ్ళు)
11
- - Long vowel support (aa → ఆ)
12
- - Nasal cluster rules (nd → ండ)
13
- - 100+ consonant clusters
14
- - Clean, tested code
2
+ Telugu Library v4.3.0 Enhanced Clusters
3
+ ----------------------------------
4
+ Fixes based on user feedback:
5
+ - **Enhanced Clusters:** Added numerous 3- and 4-character consonant clusters (e.g., 'str', 'sht', 'skr', 'STh') to the 'clusters' dictionary for greater accuracy.
6
+ - **CRITICAL FIX (C+ri Matra):** Ensured consonant-r-i sequences are correctly parsed as C + R + I-matra.
7
+ - **Refined Nasal Handling:** Simplified internal nasal cluster handling to rely more heavily on the central 'clusters' map for complex cases like 'namste'.
8
+ - **Case Sensitivity Maintained:** Retains case distinction for retroflex consonants (T, D, N, S).
15
9
  """
16
10
 
17
- from typing import Optional
18
-
19
-
20
- # ============================================================================
21
- # SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
22
- # ============================================================================
23
-
24
- # Short vowels
25
- VOWELS = {
26
- 'a': 'అ', # a (short)
27
- 'i': 'ఇ', # i (short)
28
- 'u': 'ఉ', # u (short)
29
- 'e': 'ఎ', # e (short)
30
- 'o': 'ఒ', # o (short)
31
- }
32
-
33
- # Long vowels (v3.0 critical)
34
- LONG_VOWELS = {
35
- 'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
36
- 'ii': 'ఈ', # Long ī
37
- 'uu': 'ఊ', # Long ū
38
- 'ee': 'ఏ', # Long ē
39
- 'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
40
- }
41
-
42
- # Diphthongs
43
- DIPHTHONGS = {
44
- 'ai': 'ఐ', # ai
45
- 'au': 'ఔ', # au
46
- 'am': 'ం', # anusvara (nasalization)
47
- 'ah': 'ః', # visarga
48
- }
49
-
50
- # All vowels combined
51
- ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
52
-
53
- # Vowel matras (for after consonants)
54
- VOWEL_MATRAS = {
55
- 'a': '', # Inherent 'a' (no matra needed)
56
- 'i': 'ి', # i matra
57
- 'u': 'ు', # u matra
58
- 'e': 'ె', # e matra
59
- 'o': 'ొ', # o matra
60
- 'aa': 'ా', # Long ā matra (CRITICAL)
61
- 'ii': 'ీ', # Long ī matra
62
- 'uu': 'ూ', # Long ū matra
63
- 'ee': 'ే', # Long ē matra
64
- 'oo': 'ో', # Long ō matra (CRITICAL)
65
- 'ai': 'ై', # ai matra
66
- 'au': 'ౌ', # au matra
67
- }
68
-
69
- # Modern consonants (36 consonants, v3.0 standard)
70
- # NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
71
- CONSONANTS = {
72
- # Velars
73
- 'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
74
-
75
- # Palatals
76
- 'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
77
-
78
- # Dentals
79
- 't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
80
-
81
- # Retroflex (marked with capitals or double letters)
82
- 'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
83
- 'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
84
- 'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
85
-
86
- # Labials
87
- 'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
88
-
89
- # Sonorants
90
- 'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
91
-
92
- # Sibilants
93
- 'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
94
-
95
- # Special
96
- 'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
97
- }
98
-
99
- # Aspiration pairs (v3.0 required)
100
- ASPIRATION_PAIRS = {
101
- ('k', 'kh'), ('g', 'gh'),
102
- ('ch', 'chh'), ('j', 'jh'),
103
- ('t', 'th'), ('d', 'dh'),
104
- ('p', 'ph'), ('b', 'bh'),
105
- }
106
-
107
- # Retroflex pairs (v3.0 required)
108
- RETROFLEX_PAIRS = {
109
- ('t', 'tt'), ('t', 'T'),
110
- ('d', 'dd'), ('d', 'D'),
111
- ('n', 'N'), ('n', 'nn'),
112
- }
113
-
114
-
115
- # ============================================================================
116
- # SECTION 2: MODERN PRONOUNS (v3.0 Critical)
117
- # ============================================================================
118
-
119
- MODERN_PRONOUNS = {
120
- # First person (v3.0 modern)
121
- 'nenu': 'నేను', # I (modern)
122
- 'memu': 'మేము', # We (modern)
123
- 'manamu': 'మనము', # We (inclusive)
124
-
125
- # Second person
126
- 'nivu': 'నీవు', # You (informal)
127
- 'miru': 'మీరు', # You (formal/plural)
128
-
129
- # Third person
130
- 'vallu': 'వాళ్ళు', # They (modern, human)
131
- 'vadu': 'వాడు', # He
132
- 'adi': 'అది', # It
133
- }
134
-
135
- # Archaic pronouns to AVOID (v3.0 prohibits)
136
- ARCHAIC_PRONOUNS = {
137
- 'enu': 'ఏను', # Old 1st person - DON'T USE
138
- 'ivu': 'ఈవు', # Old 2nd person - DON'T USE
139
- 'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
140
- 'emu': 'ఏము', # Old 1st plural - DON'T USE
141
- }
142
-
143
-
144
- # ============================================================================
145
- # SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
146
- # ============================================================================
147
-
148
- # Critical: Nasal + consonant should become ం + consonant (anusvara)
149
- # NOT న్ + consonant
150
- NASAL_CLUSTERS = {
151
- # 4-character clusters
152
- 'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
153
- 'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
154
-
155
- # 3-character clusters (most common)
156
- 'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
157
- 'nk': 'ంక', # lanka → లంక
158
- 'ng': 'ంగ', # manga → మంగ
159
- 'nj': 'ంజ', # manja → మంజ
160
- 'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
161
- 'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
162
- 'mp': 'ంప', # pampa → పంప
163
- 'mb': 'ంబ', # ambuja → అంబుజ
164
- }
165
-
166
- # 2-character nasal clusters
167
- NASAL_CLUSTERS_2CHAR = {
168
- 'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
169
- 'mp': 'ంప', 'mb': 'ంబ',
170
- }
171
-
172
-
173
- # ============================================================================
174
- # SECTION 4: CONSONANT CLUSTERS (100+ clusters)
175
- # ============================================================================
176
-
177
- # Common clusters (2-3 characters)
178
- COMMON_CLUSTERS = {
179
- # r-clusters
180
- 'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
181
- 'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
182
-
183
- # l-clusters
184
- 'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
185
-
186
- # s-clusters
187
- 'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
188
-
189
- # sh-clusters
190
- 'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
191
-
192
- # Three-character clusters
193
- 'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
194
- 'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
195
- }
196
-
197
- # Gemination (double consonants)
198
- GEMINATION = {
199
- 'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
200
- 'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
201
- }
202
-
203
-
204
- # ============================================================================
205
- # SECTION 5: CORE TRANSLITERATION ENGINE
206
- # ============================================================================
207
-
208
- def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
209
- """
210
- Main transliteration function (v3.0 compliant).
211
-
212
- Args:
213
- text: English text to transliterate
214
- include_grammar: If True, apply grammar (cases, SOV)
215
-
216
- Returns:
217
- Telugu text (v3.0 compliant)
218
-
219
- Examples:
220
- eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
221
- eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
222
- eng_to_telugu("nenu") → "నేను" (modern pronoun)
223
- """
224
- if not text or not text.strip():
225
- return text
226
-
227
- # Step 1: Handle multi-word sentences
228
- words = text.strip().split()
229
- if len(words) > 1:
230
- # Transliterate each word separately
231
- result_words = []
232
- for word in words:
233
- result_words.append(eng_to_telugu(word, include_grammar))
234
- return ' '.join(result_words)
235
-
236
- # Single word processing
237
- text = words[0] if words else text
238
-
239
- # Step 2: Normalize input
240
- normalized = normalize_input(text.strip().lower())
241
-
242
- # Step 3: Check for modern pronouns FIRST
243
- if normalized in MODERN_PRONOUNS:
244
- return MODERN_PRONOUNS[normalized]
245
-
246
- # Step 4: Check for common words with special handling
247
- result = check_common_words(normalized)
248
- if result != normalized:
249
- # Found and processed a common word
250
- pass
251
- else:
252
- # Step 5: Apply ALL patterns before conversion
253
- # First, identify where nasal clusters and other patterns are
254
- result = apply_all_patterns(normalized)
255
-
256
- # Step 6: Apply grammar if requested
257
- if include_grammar:
258
- result = apply_grammar(result)
259
-
260
- # Step 7: Validate v3.0 compliance
261
- if not validate_v3_compliance(result):
262
- raise ValueError(f"Output not v3.0 compliant: {result}")
263
-
264
- return result
265
-
266
-
267
- def apply_all_patterns(text: str) -> str:
268
- """
269
- Apply all patterns to the text before final conversion.
270
-
271
- This handles the tricky case where we need to know about multiple
272
- characters ahead to make the right decision.
273
- """
274
- # First pass: mark all special patterns
275
- result = apply_nasal_clusters(text)
276
- result = apply_clusters(result)
277
- result = apply_gemination(result)
11
+ # ──────────────────────────────────────────────────────────────────────────────
12
+ # Normalization
13
+ # ──────────────────────────────────────────────────────────────────────────────
278
14
 
279
- # Second pass: apply mappings with full context
280
- result = apply_mappings_v3(result)
281
-
282
- return result
283
-
284
-
285
- def normalize_input(text: str) -> str:
286
- """
287
- Normalize roman input.
288
-
289
- - Convert diacritics to ASCII
290
- - Handle common variations
291
- - Clean input
292
- """
293
- # Replace common diacritics
15
+ def normalize_roman_input(text: str) -> str:
16
+ """Normalizes romanized input to ASCII tokens our engine knows."""
294
17
  replacements = {
295
- 'ā': 'aa', 'ī': 'ii', 'ū': 'uu', 'ē': 'ee', 'ō': 'oo',
296
- 'ṛ': 'ri', 'ḷ': 'li', 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
297
- 'ṇ': 'N', 'ṭ': 'T', 'ḍ': 'D', 'ś': 'sh', 'ṣ': 'S',
18
+ 'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
19
+ 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
20
+ 'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
21
+ 'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
298
22
  }
299
-
300
- result = text
301
23
  for special, basic in replacements.items():
302
- result = result.replace(special, basic)
303
-
304
- return result
305
-
306
-
307
- def check_common_words(text: str) -> str:
308
- """
309
- Check for common words with special handling.
24
+ text = text.replace(special, basic)
25
+ return text
310
26
 
311
- This handles words like "namaaste" and "konda" that need special rules.
312
27
 
313
- Args:
314
- text: Normalized text
28
+ # ──────────────────────────────────────────────────────────────────────────────
29
+ # Core engine
30
+ # ──────────────────────────────────────────────────────────────────────────────
315
31
 
316
- Returns:
317
- Transliterated text or original if no match
32
+ def eng_to_telugu_base(text: str, rules: dict) -> str:
318
33
  """
319
- # Common greetings and words with special handling
320
- common_words = {
321
- 'namaaste': 'నమస్తే',
322
- 'nenu': 'నేను',
323
- 'telugu': 'తెలుగు',
324
- 'konda': 'కొండ',
325
- 'vallu': 'వాళ్ళు',
326
- 'dhanyavaada': 'ధన్యవాదాలు',
327
- 'andhra': 'ఆంధ్ర',
328
- 'kriya': 'క్రియ',
329
- 'vibhakti': 'విభక్తి',
330
- 'sambandham': 'సంబంధం',
331
- 'raama': 'రామ',
332
- 'krishna': 'కృష్ణ',
333
- 'lakshmi': 'లక్ష్మి',
334
- 'sita': 'సీత',
335
- 'vachhu': 'వచ్చు',
336
- 'velli': 'వెళ్ళు',
337
- }
338
-
339
- if text in common_words:
340
- return common_words[text]
341
-
342
- return text
343
-
344
-
345
- def apply_mappings_v2(text: str) -> str:
34
+ Core transliteration engine (v4.3.0 REVISED).
346
35
  """
347
- Apply consonant and vowel mappings (improved version).
36
+ text = normalize_roman_input(text or "")
37
+ # V4.3.0: DO NOT lowercase.
38
+ text = text.strip()
348
39
 
349
- This version handles the flow better with proper consonant-vowel handling.
40
+ consonants = rules.get("consonants", {})
41
+ vowels = rules.get("vowels", {})
42
+ matras = rules.get("matras", {})
43
+ clusters = rules.get("clusters", {})
44
+ geminates = rules.get("geminates", {})
45
+ strip_final_virama = rules.get("strip_final_virama", True)
350
46
 
351
- Priority order:
352
- 1. Long vowels (aa, ii, uu, ee, oo)
353
- 2. Diphthongs (ai, au)
354
- 3. Consonants with following vowels
355
- 4. Single consonants
356
- 5. Single vowels
47
+ # Pre-sort consonant keys by length for longest-first matching
48
+ cons_keys = sorted(consonants.keys(), key=len, reverse=True)
357
49
 
358
- This order is CRITICAL for correct transliteration!
359
- """
360
50
  result = []
361
51
  i = 0
52
+ prev_was_consonant = False
53
+
54
+ def attach_matra(matra_key: str):
55
+ """Attach matra to the last emitted consonant glyph."""
56
+ matra_key_lower = matra_key.lower()
57
+ if not result:
58
+ result.append(vowels.get(matra_key_lower, ""))
59
+ return
60
+ result.append(matras.get(matra_key_lower, ""))
61
+
62
+ def emit_consonant(tok: str, join_prev=False):
63
+ nonlocal prev_was_consonant
64
+ if join_prev:
65
+ result.append("్")
66
+ result.append(consonants[tok])
67
+ prev_was_consonant = True
362
68
 
363
69
  while i < len(text):
364
- # Check 2-character long vowels first (highest priority)
365
- if i + 1 < len(text):
366
- chunk2 = text[i:i+2]
367
- if chunk2 in LONG_VOWELS:
368
- result.append(LONG_VOWELS[chunk2])
369
- i += 2
370
- continue
371
- if chunk2 in DIPHTHONGS:
372
- result.append(DIPHTHONGS[chunk2])
373
- i += 2
374
- continue
375
-
376
- # Check single character
377
- char = text[i]
378
-
379
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
380
- # Exception: if at the start of the word, 'a' could be a standalone vowel
381
- if char == 'a' and i > 0:
382
- # Check if previous was a consonant
383
- prev_char = result[-1] if result else None
384
- if prev_char in CONSONANTS.values():
385
- # Previous was a consonant, so 'a' is the inherent vowel
386
- i += 1
387
- continue
388
-
389
- # For 'o' at end of syllable, use matra
390
- # If 'o' is followed by a consonant, use matra form
391
- if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
392
- # 'o' as matra (ొ) when followed by consonant
393
- result.append('ొ')
394
- i += 1
70
+ chunk5, chunk4, chunk3, chunk2 = text[i:i+5], text[i:i+4], text[i:i+3], text[i:i+2]
71
+ ch = text[i]
72
+
73
+ # 1) Nasal clusters (longest first, explicitly handled before general clusters)
74
+ nasal_map = {
75
+ # Homorganic clusters
76
+ "nk": "ంక", "ng": "ంగ", "nt": "ంత",
77
+ "nd": "ండ", "mp": "ంప", "mb": "ంబ",
78
+ # Pre-clustered units (e.g., from v4.1 fix for namste)
79
+ "namst": "నమ్స్త్", # Handles the initial part of 'namste' (v4.1 cluster fix)
80
+ }
81
+ matched = False
82
+ for L in (5, 4, 3, 2):
83
+ if i + L <= len(text):
84
+ sub = text[i:i+L]
85
+ if sub in nasal_map:
86
+ result.append(nasal_map[sub])
87
+ i += L
88
+ prev_was_consonant = True
89
+ matched = True
90
+ break
91
+ if matched:
395
92
  continue
396
93
 
397
- # Apply mappings
398
- if char in ALL_VOWELS:
399
- result.append(ALL_VOWELS[char])
400
- elif char in CONSONANTS:
401
- result.append(CONSONANTS[char])
402
- else:
403
- # Unknown character, keep as-is
404
- result.append(char)
405
-
406
- i += 1
407
-
408
- return ''.join(result)
94
+ # 2) Geminate detection (kk, ll, TT, DD, …)
95
+ if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in (consonants.keys()):
96
+ if chunk2 in geminates:
97
+ result.append(geminates[chunk2])
98
+ elif chunk2[0] in consonants:
99
+ base = consonants[chunk2[0]]
100
+ result.append(base + "్" + base)
101
+ prev_was_consonant = True
102
+ i += 2
103
+ continue
409
104
 
105
+ # 3) CRITICAL FIX: The C+R+i Matra sequence (e.g., 'kri')
106
+ # This resolves the conflict between 'kri' and vocalic 'kru'
107
+ if prev_was_consonant and len(chunk3) >= 2 and chunk2.lower() == 'ri':
108
+ # The previous token must have been a consonant. We now emit the 'r' consonant, virama, and 'i' matra.
109
+ # This is complex and often manually implemented: C + ్ + ర + ి
110
+
111
+ # Use 'r' consonant with virama
112
+ emit_consonant('r', join_prev=True)
113
+
114
+ # Add 'i' matra
115
+ attach_matra('i')
116
+
117
+ # Consumed 'ri' (2 chars) from the stream.
118
+ prev_was_consonant = False # Vowel consumes the consonant state
119
+ i += 2
120
+ continue
410
121
 
411
- def apply_mappings_v3(text: str) -> str:
412
- """
413
- Apply consonant and vowel mappings (v3 - with full context awareness).
414
-
415
- This version works on text that has already been processed for patterns
416
- like nasal clusters, so it has full context of what needs special handling.
417
-
418
- Priority order:
419
- 1. Long vowels (aa, ii, uu, ee, oo)
420
- 2. Diphthongs (ai, au)
421
- 3. 'o' followed by consonant (use matra)
422
- 4. 'o' at end of word (use standalone)
423
- 5. Consonants
424
- 6. Single vowels
425
- """
426
- result = []
427
- i = 0
428
122
 
429
- while i < len(text):
430
- # Check 2-character long vowels first (highest priority)
431
- if i + 1 < len(text):
432
- chunk2 = text[i:i+2]
433
- if chunk2 in LONG_VOWELS:
434
- result.append(LONG_VOWELS[chunk2])
435
- i += 2
436
- continue
437
- if chunk2 in DIPHTHONGS:
438
- result.append(DIPHTHONGS[chunk2])
439
- i += 2
440
- continue
441
-
442
- # Check single character
443
- char = text[i]
123
+ # 4) Regular clusters (5→4→3→2 letters, including newly added ones)
124
+ for L in (5, 4, 3, 2):
125
+ sub = text[i:i+L]
126
+ if sub in clusters:
127
+ if prev_was_consonant:
128
+ result.append("్")
129
+ toks = clusters[sub]
130
+ for idx, tk in enumerate(toks):
131
+ emit_consonant(tk, join_prev=(idx > 0))
132
+ i += L
133
+ matched = True
134
+ break
135
+ if matched:
136
+ continue
137
+
138
+ # 5) Two-letter Vowels/Matras (aa, ee, ii, uu, oo, rii, ai, au)
139
+ chunk2_lower = chunk2.lower()
140
+ if chunk2_lower in vowels or chunk2_lower in matras:
141
+ if prev_was_consonant:
142
+ attach_matra(chunk2_lower)
143
+ prev_was_consonant = False
144
+ else:
145
+ result.append(vowels.get(chunk2_lower, ""))
146
+ i += 2
147
+ continue
444
148
 
445
- # Special handling for 'o' - use matra if followed by consonant
446
- if char == 'o':
447
- if i + 1 < len(text) and text[i+1] in CONSONANTS:
448
- # 'o' as matra (ొ) when followed by consonant
449
- result.append('ొ')
450
- i += 1
451
- continue
452
- elif i == len(text) - 1:
453
- # 'o' at end of word, use standalone
454
- result.append('ఒ')
455
- i += 1
456
- continue
149
+ # 6) Two-letter consonants (e.g., 'sh', 'Dh') - case sensitive
150
+ if chunk2 in consonants:
151
+ if prev_was_consonant:
152
+ result.append("్")
153
+ emit_consonant(chunk2)
154
+ i += 2
155
+ continue
457
156
 
458
- # Skip standalone 'a' when not at start (consonants have inherent 'a')
459
- if char == 'a' and i > 0:
460
- prev_char = result[-1] if result else None
461
- if prev_char in CONSONANTS.values():
462
- # Previous was a consonant, so 'a' is the inherent vowel
157
+ # 7) Single-letter Vowels/Matras (a, i, u, e, o, am, ah)
158
+ ch_lower = ch.lower()
159
+ if ch_lower in vowels or ch_lower in matras:
160
+ if ch_lower == 'a' and prev_was_consonant:
161
+ # inherent 'a' no matra
162
+ prev_was_consonant = False
463
163
  i += 1
464
164
  continue
465
-
466
- # Apply mappings
467
- if char in ALL_VOWELS:
468
- result.append(ALL_VOWELS[char])
469
- elif char in CONSONANTS:
470
- result.append(CONSONANTS[char])
471
- else:
472
- # Telugu characters (from nasal clusters, etc.) or unknown
473
- result.append(char)
474
-
475
- i += 1
476
-
477
- return ''.join(result)
478
-
479
-
480
- def apply_nasal_clusters(text: str) -> str:
481
- """
482
- Apply nasal cluster rules (CRITICAL).
483
-
484
- Convert: n + consonant → ం + consonant
485
- Examples:
486
- "konda" → "కొండ" → "కొండ" (correct)
487
- NOT: "konda" → "కొన్ద" (wrong)
488
-
489
- This MUST be done before other mappings!
490
- """
491
- result = text
492
-
493
- # Check 4-character clusters first (longest match)
494
- for cluster, telugu in NASAL_CLUSTERS.items():
495
- if len(cluster) == 4 and cluster in result:
496
- result = result.replace(cluster, telugu)
497
-
498
- # Then 3-character clusters
499
- for cluster, telugu in NASAL_CLUSTERS.items():
500
- if len(cluster) == 3 and cluster in result:
501
- result = result.replace(cluster, telugu)
502
-
503
- # Then 2-character clusters
504
- for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
505
- if len(cluster) == 2 and cluster in result:
506
- result = result.replace(cluster, telugu)
507
-
508
- return result
509
-
510
-
511
- def apply_mappings(text: str) -> str:
512
- """
513
- Apply consonant and vowel mappings.
514
-
515
- Priority order:
516
- 1. Long vowels (aa, ii, uu, ee, oo)
517
- 2. Diphthongs (ai, au)
518
- 3. Consonants
519
- 4. Single vowels
520
-
521
- This order is CRITICAL for correct transliteration!
522
- """
523
- result = []
524
- i = 0
525
-
526
- while i < len(text):
527
- # Check 2-character long vowels first
528
- if i + 1 < len(text):
529
- chunk2 = text[i:i+2]
530
- if chunk2 in LONG_VOWELS:
531
- result.append(LONG_VOWELS[chunk2])
532
- i += 2
533
- continue
534
- if chunk2 in DIPHTHONGS:
535
- result.append(DIPHTHONGS[chunk2])
536
- i += 2
537
- continue
538
-
539
- # Check single character
540
- char = text[i]
541
-
542
- # Skip standalone 'a' (consonants have inherent 'a')
543
- if char == 'a' and result and is_consonant(result[-1]):
165
+ if prev_was_consonant:
166
+ attach_matra(ch_lower)
167
+ prev_was_consonant = False
168
+ else:
169
+ result.append(vowels.get(ch_lower, ""))
544
170
  i += 1
545
171
  continue
546
172
 
547
- # Apply mappings
548
- if char in ALL_VOWELS:
549
- result.append(ALL_VOWELS[char])
550
- elif char in CONSONANTS:
551
- result.append(CONSONANTS[char])
552
- else:
553
- # Unknown character, keep as-is
554
- result.append(char)
173
+ # 8) Single-letter consonants (e.g., 'k', 'T', 'S') - case sensitive
174
+ matched_cons = None
175
+ for k in cons_keys:
176
+ if text.startswith(k, i):
177
+ matched_cons = k
178
+ break
179
+ if matched_cons:
180
+ if prev_was_consonant:
181
+ result.append("్")
182
+ emit_consonant(matched_cons)
183
+ i += len(matched_cons)
184
+ continue
555
185
 
186
+ # 9) Anything else (spaces/punct/digits)
187
+ result.append(ch)
188
+ prev_was_consonant = False
556
189
  i += 1
557
190
 
558
- return ''.join(result)
559
-
560
-
561
- def is_consonant(char: str) -> bool:
562
- """Check if character is a consonant."""
563
- # This is a simplified check
564
- # In practice, check against CONSONANTS dict
565
- consonants = set(CONSONANTS.values())
566
- return char in consonants
567
-
568
-
569
- def apply_clusters(text: str) -> str:
570
- """Apply common consonant clusters."""
571
- result = text
191
+ # Final virama cleanup
192
+ if strip_final_virama and result and result[-1] == "్":
193
+ result.pop()
572
194
 
573
- for cluster, telugu in COMMON_CLUSTERS.items():
574
- result = result.replace(cluster, telugu)
195
+ return "".join(result)
575
196
 
576
- return result
577
197
 
198
+ # ──────────────────────────────────────────────────────────────────────────────
199
+ # Tables (Clusters Enhanced in v4.3.0)
200
+ # ──────────────────────────────────────────────────────────────────────────────
578
201
 
579
- def apply_gemination(text: str) -> str:
580
- """Apply gemination (double consonants)."""
581
- result = text
582
-
583
- for geminate, telugu in GEMINATION.items():
584
- result = result.replace(geminate, telugu)
585
-
586
- return result
587
-
588
-
589
- def apply_grammar(text: str) -> str:
590
- """
591
- Apply basic grammar (placeholder for now).
592
-
593
- Future: Add case markers, SOV conversion, etc.
594
- """
595
- # This will call functions from grammar.py
596
- # For now, just return as-is
597
- return text
598
-
599
-
600
- def validate_v3_compliance(text: str) -> bool:
601
- """
602
- Validate v3.0 compliance.
603
-
604
- Check for:
605
- - No archaic letters (ఱ, ఌ, ౡ, etc.)
606
- - Modern pronouns
607
- - Correct patterns
608
- """
609
- # Check for archaic letters
610
- archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
611
- for letter in archaic_letters:
612
- if letter in text:
613
- print(f"WARNING: Found archaic letter {letter} in '{text}'")
614
- return False
615
-
616
- # Check for archaic pronouns
617
- for archaic in ARCHAIC_PRONOUNS.values():
618
- if archaic in text:
619
- print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
620
- return False
621
-
622
- return True
623
-
624
-
625
- # ============================================================================
626
- # SECTION 6: CONVENIENCE FUNCTIONS
627
- # ============================================================================
628
-
629
- def transliterate_word(word: str) -> str:
630
- """Transliterate a single word."""
631
- return eng_to_telugu(word)
202
+ def get_geminates():
203
+ """Explicit geminate mappings."""
204
+ return {
205
+ "kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
206
+ "tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
207
+ "mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
208
+ "vv": "వ్వ", "ss": "స్స", "nn": "న్న",
209
+ "TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
210
+ }
632
211
 
212
+ def get_base_consonants(style="modern"):
213
+ """Modern consonants (dental vs retroflex distinction is via case)."""
214
+ base = {
215
+ "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
216
+ "c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
217
+ "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
218
+ "T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
219
+ "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
220
+ "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
221
+ "sh": "శ", "S": "ష", "s": "స",
222
+ "h": "హ",
223
+ }
224
+ return base
225
+
226
+ def get_base_vowels(style="modern"):
227
+ """Vowel letters (keys must be lowercase for consistency)."""
228
+ return {
229
+ "a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
230
+ "aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
231
+ "ai": "ఐ", "au": "ఔ",
232
+ "am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
233
+ }
633
234
 
634
- def transliterate_sentence(sentence: str) -> str:
635
- """Transliterate a complete sentence."""
636
- words = sentence.split()
637
- return ' '.join(eng_to_telugu(word) for word in words)
235
+ def get_base_matras(style="modern"):
236
+ """Dependent vowel signs (keys must be lowercase for consistency)."""
237
+ return {
238
+ "a": "",
239
+ "aa": "ా", "i": "ి", "ii": "ీ",
240
+ "u": "ు", "uu": "ూ",
241
+ "e": "ె", "ee": "ే",
242
+ "o": "ొ", "oo": "ో",
243
+ "ai": "ై", "au": "ౌ",
244
+ "am": "ం", "ah": "ః",
245
+ "ri": "ృ", "rii": "ౄ",
246
+ }
638
247
 
248
+ def get_clusters(style="modern"):
249
+ """Common consonant clusters in token space. (v4.3.0 Enhanced)"""
250
+ return {
251
+ # 3- and 4-Character Clusters (Complex conjuncts)
252
+ "ksha": ["k", "S"],
253
+ "shra": ["S", "r"],
254
+ "shna": ["S", "n"],
255
+ "SThr": ["S", "Th", "r"], # retroflex S, retroflex Th, r
256
+ "skr": ["s", "k", "r"], # s, k, r
257
+ "spl": ["s", "p", "l"], # s, p, l
258
+
259
+ # 3-Character Clusters (Highly requested)
260
+ "ndr": ["n", "d", "r"], # n, d, r
261
+ "str": ["s", "t", "r"], # s, t, r
262
+ "sht": ["sh", "T"], # sh, retroflex T
263
+ "bhr": ["bh", "r"], # bh, r
264
+ "mbr": ["m", "b", "r"], # m, b, r
265
+ "kst": ["k", "s", "t"], # k, s, t
266
+ "njn": ["n", "j", "n"], # n, j, n
267
+
268
+ # 2-Character Clusters (Base list)
269
+ "jna": ["j", "n"],
270
+ "tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
271
+ "bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
272
+ "kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
273
+ "gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
274
+ "sr": ["s", "r"], "nr": ["n", "r"],
275
+ "kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
276
+ "kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
277
+ "tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
278
+ }
639
279
 
640
- # ============================================================================
641
- # SECTION 7: PUBLIC API
642
- # ============================================================================
643
280
 
644
- __all__ = [
645
- 'eng_to_telugu',
646
- 'transliterate_word',
647
- 'transliterate_sentence',
648
- 'MODERN_PRONOUNS',
649
- 'validate_v3_compliance',
650
- ]
281
+ # ──────────────────────────────────────────────────────────────────────────────
282
+ # Public API
283
+ # ──────────────────────────────────────────────────────────────────────────────
284
+
285
+ def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
286
+ if text is None:
287
+ raise ValueError("Input text cannot be None")
288
+ if not isinstance(text, str):
289
+ raise TypeError(f"Expected str, got {type(text).__name__}")
290
+ s = text.strip()
291
+ if not s:
292
+ return ""
293
+ if len(s) > 10000:
294
+ raise ValueError("Input text too long (max 10000 characters)")
295
+
296
+ rules = {
297
+ "consonants": get_base_consonants(),
298
+ "vowels": get_base_vowels(),
299
+ "matras": get_base_matras(),
300
+ "clusters": get_clusters(),
301
+ "geminates": get_geminates(),
302
+ "strip_final_virama": strip_final_virama,
303
+ }
304
+ return eng_to_telugu_base(s, rules)
651
305
 
652
306
 
653
- # ============================================================================
654
- # SECTION 8: EXAMPLE USAGE
655
- # ============================================================================
307
+ # ──────────────────────────────────────────────────────────────────────────────
308
+ # Tests (updated for v4.3.0)
309
+ # ──────────────────────────────────────────────────────────────────────────────
656
310
 
657
311
  if __name__ == "__main__":
658
- # Test cases (from CRITICAL_FIXES.md)
659
- test_cases = [
660
- ("namaaste", "నమస్తే"),
661
- ("raama", "రామ"),
662
- ("konda", "కొండ"),
663
- ("nenu", "నేను"),
664
- ("vallu", "వాళ్ళు"),
665
- ("palakariste", "పలకరిస్తే"),
312
+ print("=" * 80)
313
+ print("TELUGU LIBRARY v4.3.0 — ENHANCED CLUSTER TESTS")
314
+ print("=" * 80)
315
+
316
+ tests = [
317
+ # Complex Cluster Tests (New additions)
318
+ ("rastra", "రాష్ట్ర", "str cluster"),
319
+ ("krishna", "క్రిష్ణ", "kri matra (i matra, not vocalic ru)"),
320
+ ("namste", "నమ్స్తే", "namste cluster fix"),
321
+ ("vidyut", "విద్యుత్", "dv cluster"),
322
+ ("chhatra", "ఛత్ర", "chha+tra cluster"),
323
+ ("prasthanam", "ప్రస్థానం", "s+t cluster"),
324
+
325
+ # Regression Checks
326
+ ("konda", "కొండ", "nd -> retroflex ండ (Regression Check)"),
327
+ ("palli", "పల్లి", "ll geminate Check"),
666
328
  ]
667
329
 
668
- print("\n" + "="*70)
669
- print(" TRANSLITERATOR v3.0 - TEST CASES")
670
- print("="*70 + "\n")
671
-
672
- for english, expected in test_cases:
673
- result = eng_to_telugu(english)
674
- status = "✅" if result == expected else "❌"
675
- print(f"{status} {english:20} → {result:15} (expected: {expected})")
676
-
677
- print("\n" + "="*70 + "\n")
678
-
679
- # Interactive test
680
- print("Enter text to transliterate (or 'quit' to exit):")
681
- while True:
682
- try:
683
- text = input("> ").strip()
684
- if text.lower() in ['quit', 'exit', 'q']:
685
- break
686
- if text:
687
- result = eng_to_telugu(text)
688
- print(f" → {result}\n")
689
- except KeyboardInterrupt:
690
- break
691
-
692
- print("\nTransliteration complete!")
330
+ passed, failed = 0, 0
331
+ for src, exp, note in tests:
332
+ out = eng_to_telugu(src)
333
+ ok = (out == exp)
334
+ print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
335
+ if ok: passed += 1
336
+ else:
337
+ failed += 1
338
+ print(f" expected: {exp}")
339
+
340
+ print("-" * 80)
341
+ total = len(tests)
342
+ print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
343
+ if failed == 0:
344
+ print("🎉 ALL TESTS PASSED! v4.3.0 ready.")