telugu-language-tools 4.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,972 @@
1
+ """
2
+ Telugu Library v2.0 - Comprehensive Transliteration Engine
3
+ ===========================================================
4
+
5
+ Features:
6
+ - Old vs New Telugu Alphabet Support (Classical, Modern, Hybrid)
7
+ - Bidirectional Transliteration (English ↔ Telugu)
8
+ - Semantic Word Mapping (English ↔ Telugu meanings)
9
+ - Universal Search (works for both languages)
10
+
11
+ Examples:
12
+ eng_to_telugu("krishna") → కృష్ణ
13
+ telugu_to_eng("కృష్ణ") → krishna
14
+ semantic_match("who")["matches"] → ["ఎవరు", "ఎవరో"]
15
+ eng_to_telugu_with_style("rama", "modern") → రామ
16
+ eng_to_telugu_with_style("rama", "classical") → రామ
17
+ """
18
+
19
+ # ============================================================================
20
+ # PART 1: ENGLISH → TELUGU TRANSLITERATION
21
+ # ============================================================================
22
+
23
def normalize_roman_input(text: str) -> str:
    """Fold diacritic romanization letters down to plain ASCII digraphs.

    Only the lower-case letters listed in the table are rewritten; callers
    that want case-insensitive folding must lower-case the text first.

    Args:
        text: Romanized input, possibly containing diacritic letters.

    Returns:
        The text with every listed letter replaced. ``None`` and the empty
        string both yield ``""``.
    """
    if not text:
        return ""

    replacements = {
        "\u0101": "aa",  # ā
        "\u0113": "ee",  # ē
        "\u012b": "ii",  # ī
        "\u014d": "oo",  # ō
        "\u016b": "uu",  # ū
        "\u1e41": "m",   # ṁ (anusvara, dot above)
        "\u1e43": "m",   # ṃ (anusvara, dot below)
        "\u1e47": "n",   # ṇ
        "\u1e0d": "d",   # ḍ
        "\u1e6d": "t",   # ṭ
        "\u015b": "sh",  # ś
        "\u1e63": "sh",  # ṣ
        "\u1e5b": "ri",  # ṛ
    }
    for special, basic in replacements.items():
        text = text.replace(special, basic)
    return text


def eng_to_telugu_base(text: str, rules: dict) -> str:
    """Core transliteration engine driven by caller-supplied tables.

    The text is lower-cased, stripped and diacritic-folded, then scanned
    left to right. At each position the first matching rule wins:

    1. vocalic r after a bare consonant,
    2. consonant clusters (longest key first, up to five letters),
    3. two-letter vowels,
    4. two-letter consonants,
    5. single vowels,
    6. single consonants,
    7. anything else is copied through unchanged.

    Args:
        text: Romanized input. ``None`` is treated as the empty string.
        rules: Mapping with optional keys ``"consonants"``, ``"vowels"``
            (independent vowel forms), ``"matras"`` (dependent vowel signs),
            ``"clusters"`` (roman key -> list of consonant keys) and
            ``"strip_final_virama"`` (default ``True``).

    Returns:
        The transliterated string.
    """
    # Guard against None *before* calling string methods, and lower-case
    # before folding so that upper-case diacritics (e.g. "Ā") are folded too.
    text = normalize_roman_input((text or "").lower().strip())

    consonants = rules.get("consonants", {})
    vowels = rules.get("vowels", {})
    matras = rules.get("matras", {})
    clusters = rules.get("clusters", {})
    strip_final_virama = rules.get("strip_final_virama", True)

    virama = "\u0c4d"          # ్  joins two consonants
    vocalic_r_sign = "\u0c43"  # ృ  dependent form of vocalic r
    vowel_letters = "aeiou"
    consonant_letters = "bcdfghjklmnpqrstvwxyz"
    length = len(text)

    result = []
    i = 0
    # True while the last emitted unit is a consonant whose vowel slot is
    # still open (no matra written, inherent 'a' not yet consumed).
    prev_cons = False

    def letter_at(pos: int, alphabet: str) -> bool:
        """Return True when text[pos] exists and belongs to *alphabet*."""
        return 0 <= pos < length and text[pos] in alphabet

    def emit_consonant(key: str, join_prev: bool = False) -> None:
        """Append a consonant, preceded by a virama when joining an open one."""
        nonlocal prev_cons
        if join_prev and prev_cons:
            result.append(virama)
        result.append(consonants.get(key, key))
        prev_cons = True

    while i < length:
        single = text[i]
        chunk2 = text[i:i + 2]

        # --- Vocalic r -----------------------------------------------------
        # After an open consonant, "r" directly before a consonant, or
        # "ri"/"ru" before a consonant, is written with the vowel sign ృ.
        # The sign fills the vowel slot, so the spelling vowel is consumed
        # and the following consonant must NOT be joined with a virama.
        # "r" before any other vowel falls through and becomes a normal
        # conjunct (e.g. "vajram").
        if prev_cons and single == "r":
            if letter_at(i + 1, consonant_letters):
                consumed = 1
            elif letter_at(i + 1, "iu") and letter_at(i + 2, consonant_letters):
                consumed = 2
            else:
                consumed = 0
            if consumed:
                result.append(vocalic_r_sign)
                prev_cons = False
                i += consumed
                continue

        # --- Consonant clusters, longest key first -------------------------
        cluster_key = None
        for size in (5, 4, 3, 2):
            candidate = text[i:i + size]
            if candidate in clusters:
                cluster_key = candidate
                break
        if cluster_key is not None:
            if prev_cons:
                result.append(virama)
            for idx, member in enumerate(clusters[cluster_key]):
                emit_consonant(member, join_prev=(idx > 0))
            # Keys such as "ksha" end in a vowel letter. Leave that vowel in
            # the input so the ordinary vowel rules close the syllable;
            # otherwise the next consonant would be glued on with a virama
            # and a following long vowel ("kshaa") would be shortened.
            i += len(cluster_key.rstrip(vowel_letters)) or len(cluster_key)
            continue

        # --- Two-letter vowels ---------------------------------------------
        if len(chunk2) == 2 and chunk2 in vowels:
            # "am"/"ah" style keys end in a consonant letter; when a vowel
            # follows, that letter is really the onset of the next syllable
            # ("rama" is ra-ma, not ram-a).
            coda_is_onset = (
                chunk2[1] not in vowel_letters
                and letter_at(i + 2, vowel_letters)
            )
            # "ri" style keys start with a consonant letter; in the middle of
            # a word after a vowel they are consonant + vowel ("hari").
            onset_is_consonant = (
                chunk2[0] not in vowel_letters
                and not prev_cons
                and i > 0
                and letter_at(i - 1, vowel_letters + consonant_letters)
            )
            if not coda_is_onset and not onset_is_consonant:
                if prev_cons:
                    result.append(matras.get(chunk2, ""))
                else:
                    result.append(vowels[chunk2])
                prev_cons = False  # a vowel sound has been produced
                i += 2
                continue

        # --- Two-letter consonants -----------------------------------------
        if len(chunk2) == 2 and chunk2 in consonants:
            emit_consonant(chunk2, join_prev=True)
            i += 2
            continue

        # --- Single vowel --------------------------------------------------
        if single in vowels:
            if single == "a" and prev_cons:
                # 'a' after a consonant is the inherent vowel: write nothing.
                prev_cons = False
                i += 1
                continue
            if prev_cons:
                result.append(matras.get(single, ""))
            else:
                result.append(vowels[single])
            prev_cons = False
            i += 1
            continue

        # --- Single consonant ----------------------------------------------
        if single in consonants:
            emit_consonant(single, join_prev=True)
            i += 1
            continue

        # --- Unknown character: copy through -------------------------------
        result.append(single)
        prev_cons = False
        i += 1

    # A virama is only ever written immediately before a consonant, so this
    # is a safety net rather than a normal code path.
    if strip_final_virama and result and result[-1] == virama:
        result.pop()

    return "".join(result)
179
+
180
+
181
def get_base_consonants(style="modern"):
    """Return the roman-key -> Telugu consonant table for an alphabet style.

    Args:
        style: "modern" (default), "classical" or "hybrid". Any other value
            is treated as "modern".

    Returns:
        A new dict on every call.

        * "classical" keeps the archaic letters (ఱ, ఴ, ఙ, ఞ).
        * "modern" writes "rr" as the conjunct ర్ర and merges the velar and
          palatal nasals into న.
        * "hybrid" is the modern table with "rr" mapped to the archaic ఱ,
          matching the hybrid rules built in eng_to_telugu_old_new_options.
    """
    # Consonants shared by every style.
    common = {
        "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
        "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
        "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
        "tt": "ట", "tth": "ఠ", "dd": "డ", "ddh": "ఢ", "nn": "ణ",
        "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
        "y": "య", "l": "ల", "v": "వ", "w": "వ",
        "sh": "ష", "shh": "శ", "s": "స", "h": "హ",
    }

    if style == "classical":
        # OLD alphabet includes archaic letters.
        return {
            **common,
            "r": "ర",
            "rr": "ఱ",    # Retroflex R (archaic)
            "ll": "ళ",    # Retroflex L (still used)
            "lll": "ఴ",   # Voiced retroflex fricative (obsolete)
            "nga": "ఙ",   # Velar nasal (archaic)
            "nya": "ఞ",   # Palatal nasal (archaic)
            "nna": "ణ",   # Retroflex nasal
        }

    # NEW alphabet (modern/reformed); also the base for "hybrid".
    table = {
        **common,
        "r": "ర",      # Single R for both
        "rr": "ర్ర",    # Double R as conjunct
        "ll": "ళ",     # Retroflex L (retained)
        "nga": "న",    # Merged with dental N
        "nya": "న",    # Merged with dental N
        "nna": "ణ",    # Retroflex nasal (retained)
    }
    if style == "hybrid":
        # Hybrid = modern letters plus the archaic retroflex R.
        table["rr"] = "ఱ"
    return table
223
+
224
+
225
def get_base_vowels(style="modern"):
    """Return the roman-key -> independent Telugu vowel table.

    These are the forms written when a vowel does not follow a consonant.
    The anusvara and visarga are combining signs that need a base letter,
    so their independent forms are written on అ ("am" -> అం, "ah" -> అః)
    instead of as a bare sign.

    Args:
        style: "classical" adds the archaic vocalic-l vowels; every other
            value returns the modern table.

    Returns:
        A new dict on every call.
    """
    table = {
        "aa": "ఆ", "a": "అ",
        "ii": "ఈ", "i": "ఇ",
        "uu": "ఊ", "u": "ఉ",
        "ai": "ఐ", "au": "ఔ",
        "am": "అ\u0c02",  # అం – anusvara carried by అ
        "ah": "అ\u0c03",  # అః – visarga carried by అ
        "ri": "ఋ", "rii": "ౠ",
        "e": "ఎ",    # Short e
        "ee": "ఏ",   # Long e
        "o": "ఒ",    # Short o
        "oo": "ఓ",   # Long o
    }

    if style == "classical":
        table["li"] = "\u0c62"   # ౢ – vocalic l (archaic), as in the original table
        table["lii"] = "\u0c63"  # ౣ – long vocalic l (archaic), as in the original table

    return table
256
+
257
+
258
def get_base_matras(style="modern"):
    """Return the roman-key -> dependent vowel sign (matra) table.

    The entry for "a" is the empty string because the short 'a' is inherent
    in every consonant letter.

    Args:
        style: "classical" adds the archaic vocalic-l signs; every other
            value yields the shared table only.

    Returns:
        A new dict on every call.
    """
    signs = dict([
        ("aa", "ా"), ("a", ""),
        ("ii", "ీ"), ("i", "ి"),
        ("uu", "ూ"), ("u", "ు"),
        ("ee", "ే"), ("e", "ె"),
        ("oo", "ో"), ("o", "ొ"),
        ("ai", "ై"), ("au", "ౌ"),
        ("am", "ం"), ("ah", "ః"),
        ("ri", "ృ"), ("rii", "ౄ"),
    ])

    if style != "classical":
        return signs

    # Archaic vocalic-l matras exist only in the classical alphabet.
    signs.update({"li": "ౢ", "lii": "ౣ"})
    return signs
280
+
281
+
282
def get_clusters(style="modern"):
    """Return the table of recognised consonant clusters.

    Each key is the roman spelling looked up in the input; each value is the
    list of consonant keys that are emitted, joined by viramas.

    Args:
        style: Accepted for symmetry with the other table getters; the same
            table is returned for every style.

    Returns:
        A new dict on every call.
    """
    cluster_table = {}

    # Sibilant / special conjuncts (keys include the trailing inherent 'a').
    cluster_table["ksha"] = ["k", "sh"]
    cluster_table["jna"] = ["j", "n"]
    cluster_table["shna"] = ["sh", "n"]
    cluster_table["shra"] = ["sh", "r"]
    cluster_table["bhra"] = ["bh", "r"]
    cluster_table["gva"] = ["g", "v"]

    # Consonant + r conjuncts (two-letter keys).
    for first in ("k", "t", "d", "g", "p", "b", "v", "n", "s"):
        cluster_table[first + "r"] = [first, "r"]

    return cluster_table
304
+
305
+
306
def eng_to_telugu_with_style(text: str, style: str = "modern") -> str:
    """Transliterate English text using a chosen alphabet style.

    The whole input is first looked up in the semantic dictionary; on a hit
    the first listed Telugu meaning is returned. Otherwise the text is
    transliterated with the tables for the requested style.

    Args:
        text: English text.
        style: "modern" (default), "classical" or "hybrid". Any other value
            falls back to "modern" for every table and flag.

    Returns:
        Telugu text, or "" for empty / whitespace-only input.

    Raises:
        ValueError: If text is None or longer than 10000 characters.
        TypeError: If text is not a str.
    """
    # Input validation
    if text is None:
        raise ValueError("Input text cannot be None")
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text).__name__}")
    if not text or not text.strip():
        return ""
    if len(text) > 10000:
        raise ValueError("Input text too long (max 10000 characters)")

    # 1. Known word: return its first meaning.
    norm_text = normalize_for_matching(text)
    forward_dict = get_semantic_dictionary()
    if norm_text in forward_dict:
        return forward_dict[norm_text][0]

    # 2. Fall back to pure transliteration. Resolve the style once so that
    #    an unrecognised value is treated as "modern" consistently.
    effective_style = style if style in ("modern", "classical", "hybrid") else "modern"

    consonants = get_base_consonants(effective_style)
    if effective_style == "hybrid":
        # Same overrides as the hybrid option in eng_to_telugu_old_new_options,
        # so "hybrid" really differs from "modern".
        consonants = {**consonants, "rr": "ఱ", "ll": "ళ"}

    rules = {
        "consonants": consonants,
        "vowels": get_base_vowels(effective_style),
        "matras": get_base_matras(effective_style),
        "clusters": get_clusters("modern"),
        "strip_final_virama": effective_style != "classical",
    }
    return eng_to_telugu_base(text, rules)
345
+
346
+
347
def eng_to_telugu_old_new_options(text: str) -> list:
    """Transliterate text under each alphabet style and list the distinct results.

    The input is run through the base engine with the modern, classical and
    hybrid rule sets, in that order. When two styles produce the same Telugu
    string only the first one (with its description) is kept.

    Args:
        text: English text to transliterate.

    Returns:
        List of tuples: [(telugu_text, style_description), ...]
    """
    profiles = [
        (
            "Modern Telugu (Reformed)",
            {
                "consonants": get_base_consonants("modern"),
                "vowels": get_base_vowels("modern"),
                "matras": get_base_matras("modern"),
                "clusters": get_clusters("modern"),
                "strip_final_virama": True,
            },
        ),
        (
            "Classical Telugu (Pre-reform)",
            {
                "consonants": get_base_consonants("classical"),
                "vowels": get_base_vowels("classical"),
                "matras": get_base_matras("classical"),
                "clusters": get_clusters("classical"),
                "strip_final_virama": False,  # Old style kept virama
            },
        ),
        (
            "Hybrid (Modern with some archaic letters)",
            {
                "consonants": {**get_base_consonants("modern"), "rr": "ఱ", "ll": "ళ"},
                "vowels": get_base_vowels("modern"),
                "matras": get_base_matras("modern"),
                "clusters": get_clusters("modern"),
                "strip_final_virama": True,
            },
        ),
    ]

    # Dict insertion order keeps the first description seen for each output.
    first_label_by_output = {}
    for label, rule_set in profiles:
        rendered = eng_to_telugu_base(text, rule_set)
        first_label_by_output.setdefault(rendered, label)

    return list(first_label_by_output.items())
402
+
403
+
404
def compare_old_new_alphabets():
    """Print a fixed comparison table of the classical and modern alphabets.

    The table and the summary below it are static text written to stdout;
    nothing is returned.
    """
    rule = "=" * 80
    row_format = "{:15} {:20} {:20} {:12} {:30}"

    header_rows = [
        ("Letter", "Old (Classical)", "New (Modern)", "Status", "Notes"),
        ("-" * 15, "-" * 20, "-" * 20, "-" * 12, "-" * 30),
    ]

    vowel_rows = [
        ("a", "అ", "అ", "Unchanged", "Short vowel"),
        ("ā", "ఆ", "ఆ", "Unchanged", "Long vowel"),
        ("i", "ఇ", "ఇ", "Unchanged", "Short vowel"),
        ("ī", "ఈ", "ఈ", "Unchanged", "Long vowel"),
        ("u", "ఉ", "ఉ", "Unchanged", "Short vowel"),
        ("ū", "ఊ", "ఊ", "Unchanged", "Long vowel"),
        ("r̥", "ఋ", "ఋ", "Unchanged", "Vocalic R"),
        ("l̥", "ౢ", "(obsolete)", "Removed", "Vocalic L - archaic"),
        ("e", "ఎ", "ఎ", "Unchanged", "Short E"),
        ("ē", "ఏ", "ఏ", "Unchanged", "Long E"),
        ("ai", "ఐ", "ఐ", "Unchanged", "Diphthong"),
        ("o", "ఒ", "ఒ", "Unchanged", "Short O"),
        ("ō", "ఓ", "ఓ", "Unchanged", "Long O"),
        ("au", "ఔ", "ఔ", "Unchanged", "Diphthong"),
    ]

    # Blank row separating vowels from consonants.
    spacer_rows = [("", "", "", "", "")]

    consonant_rows = [
        ("ka", "క", "క", "Unchanged", "Velar"),
        ("ṅa", "ఙ", "(merged→న)", "Rare", "Velar nasal"),
        ("cha", "చ", "చ", "Unchanged", "Palatal"),
        ("ña", "ఞ", "(merged→న)", "Rare", "Palatal nasal"),
        ("ṭa", "ట", "ట", "Unchanged", "Retroflex"),
        ("ṇa", "ణ", "ణ", "Unchanged", "Retroflex nasal"),
        ("ta", "త", "త", "Unchanged", "Dental"),
        ("na", "న", "న", "Unchanged", "Dental nasal"),
        ("pa", "ప", "ప", "Unchanged", "Labial"),
        ("ya", "య", "య", "Unchanged", "Semivowel"),
        ("ra", "ర", "ర", "Unchanged", "Alveolar"),
        ("ṟa", "ఱ", "(rare)", "Archaic", "Retroflex R"),
        ("la", "ల", "ల", "Unchanged", "Dental lateral"),
        ("ḷa", "ళ", "ళ", "Unchanged", "Retroflex lateral"),
        ("ḻa", "ఴ", "(obsolete)", "Removed", "Fricative - Tamil loan"),
        ("va", "వ", "వ", "Unchanged", "Labial"),
        ("śa", "శ", "శ", "Unchanged", "Palatal sibilant"),
        ("ṣa", "ష", "ష", "Unchanged", "Retroflex sibilant"),
        ("sa", "స", "స", "Unchanged", "Dental sibilant"),
        ("ha", "హ", "హ", "Unchanged", "Glottal"),
    ]

    print("\n" + rule)
    print("OLD vs NEW TELUGU ALPHABET COMPARISON")
    print(rule)

    for cells in header_rows + vowel_rows + spacer_rows + consonant_rows:
        print(row_format.format(*cells))

    summary_lines = (
        "SUMMARY:",
        " • Modern Telugu has 56 letters (16 vowels + 36 consonants + 4 modifiers)",
        " • Classical Telugu had ~60+ letters including archaic forms",
        " • Letters ఱ (ṟa), ఴ (ḻa), ౘ, ౙ, ౚ are now obsolete or very rare",
        " • Nasals ఙ (ṅa) and ఞ (ña) mostly merged into న (na) in modern usage",
    )
    print("\n" + rule)
    for line in summary_lines:
        print(line)
    print(rule + "\n")
468
+
469
+
470
# Main function for backwards compatibility (v0.9)
def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
    """Transliterate English text to Telugu with the v0.9 rule tables.

    Maintained for backwards compatibility. The whole input is first looked
    up in the semantic dictionary; on a hit the first listed Telugu meaning
    is returned. Otherwise the text goes through eng_to_telugu_base with the
    reduced consonant/vowel tables defined here and the shared cluster table
    from get_clusters().

    Args:
        text: English text.
        strip_final_virama: Forwarded to the engine as the rule of the same
            name.

    Returns:
        Telugu text.

    Example:
        eng_to_telugu("krishna") → కృష్ణ   (semantic dictionary hit)
        eng_to_telugu("karma")   → కర్మ    (pure transliteration)
    """
    # 1. Check semantic dictionary first for known words
    norm_text = normalize_for_matching(text)
    forward_dict = get_semantic_dictionary()

    if norm_text in forward_dict:
        # Found a known word. Return its first meaning.
        return forward_dict[norm_text][0]

    # 2. No match found. Fallback to pure transliteration.

    consonants = {
        "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
        "ch": "చ", "jh": "ఝ", "j": "జ",
        "t": "త", "th": "థ", "d": "ద", "dh": "ధ",
        "n": "న", "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
        "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
        "sh": "ష", "s": "స", "h": "హ",
    }

    # Independent vowel forms. Anusvara and visarga are combining signs, so
    # their independent forms are carried by అ rather than written bare.
    vowels = {
        "aa": "ఆ", "a": "అ", "ii": "ఈ", "i": "ఇ",
        "uu": "ఊ", "u": "ఉ", "ee": "ఏ", "e": "ఎ",
        "oo": "ఓ", "o": "ఒ", "ai": "ఐ", "au": "ఔ",
        "am": "అ\u0c02",  # అం
        "ah": "అ\u0c03",  # అః
    }

    matras = {
        "aa": "ా", "a": "", "ii": "ీ", "i": "ి",
        "uu": "ూ", "u": "ు", "ee": "ే", "e": "ె",
        "oo": "ో", "o": "ొ", "ai": "ై", "au": "ౌ",
        "am": "ం", "ah": "ః",
    }

    rules = {
        "consonants": consonants,
        "vowels": vowels,
        "matras": matras,
        # Same cluster table as the style-aware entry points; shared instead
        # of duplicated so the two cannot drift apart.
        "clusters": get_clusters("modern"),
        "strip_final_virama": strip_final_virama,
    }

    return eng_to_telugu_base(text, rules)
556
+
557
+
558
+ # ============================================================================
559
+ # PART 2: TELUGU → ENGLISH TRANSLITERATION
560
+ # ============================================================================
561
+
562
def telugu_to_eng(text: str) -> str:
    """Convert Telugu script to its roman transliteration.

    Every consonant letter is followed by exactly one of:
    a virama (no vowel is written), a vowel sign (that vowel is written),
    or anything else (the inherent 'a' is written). Characters outside the
    tables are copied through unchanged.

    Example:
        ఎవరు → evaru
        అమ్మ → amma
        రామ → raama
        కృష్ణ → krishnna

    Args:
        text: Telugu text.

    Returns:
        The romanized text, or "" for empty / whitespace-only input.

    Raises:
        ValueError: If text is None or longer than 10000 characters.
        TypeError: If text is not a str.
    """
    # Input validation
    if text is None:
        raise ValueError("Input text cannot be None")
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text).__name__}")
    if not text or not text.strip():
        return ""
    if len(text) > 10000:
        raise ValueError("Input text too long (max 10000 characters)")

    virama = "\u0c4d"  # ్

    # Reverse mapping: Telugu → English
    reverse_vowels = {
        "అ": "a", "ఆ": "aa", "ఇ": "i", "ఈ": "ii",
        "ఉ": "u", "ఊ": "uu", "ఋ": "ri", "ౠ": "rii",
        "ఎ": "e", "ఏ": "ee", "ఐ": "ai",
        "ఒ": "o", "ఓ": "oo", "ఔ": "au",
        "ం": "m", "ః": "h",
    }

    reverse_consonants = {
        "క": "k", "ఖ": "kh", "గ": "g", "ఘ": "gh", "ఙ": "ng",
        "చ": "ch", "ఛ": "chh", "జ": "j", "ఝ": "jh", "ఞ": "ny",
        "ట": "tt", "ఠ": "tth", "డ": "dd", "ఢ": "ddh", "ణ": "nn",
        "త": "t", "థ": "th", "ద": "d", "ధ": "dh", "న": "n",
        "ప": "p", "ఫ": "ph", "బ": "b", "భ": "bh", "మ": "m",
        "య": "y", "ర": "r", "ల": "l", "ళ": "ll", "వ": "v",
        "శ": "sh", "ష": "sh", "స": "s", "హ": "h",
    }

    reverse_matras = {
        "ా": "aa", "ి": "i", "ీ": "ii", "ు": "u", "ూ": "uu",
        "ృ": "ri", "ౄ": "rii", "ె": "e", "ే": "ee",
        "ై": "ai", "ొ": "o", "ో": "oo", "ౌ": "au",
    }

    result = []
    length = len(text)
    i = 0

    while i < length:
        char = text[i]

        if char in reverse_consonants:
            result.append(reverse_consonants[char])
            following = text[i + 1] if i + 1 < length else ""

            if following == virama:
                # Virama suppresses the vowel: consume it, write nothing.
                i += 2
            elif following in reverse_matras:
                # A vowel sign replaces the inherent vowel.
                result.append(reverse_matras[following])
                i += 2
            else:
                # Bare consonant letter carries the inherent 'a'.
                result.append("a")
                i += 1
            continue

        if char in reverse_vowels:
            # Independent vowel, anusvara or visarga.
            result.append(reverse_vowels[char])
            i += 1
            continue

        if char == virama:
            # Stray virama not attached to a consonant: drop it.
            i += 1
            continue

        # Unknown character (space, punctuation, etc.)
        result.append(char)
        i += 1

    return "".join(result)
654
+
655
+
656
+ # ============================================================================
657
+ # PART 3: SEMANTIC WORD MAPPING (English ↔ Telugu)
658
+ # ============================================================================
659
+
660
def get_semantic_dictionary():
    """Return the built-in English -> Telugu word list.

    Format: {english: [telugu1, telugu2, ...], ...}

    Callers in this module return the FIRST entry of a list as "the"
    translation, so the most direct meaning must come first.

    Returns:
        A new dict on every call.
    """
    return {
        # Questions
        "who": ["ఎవరు", "ఎవరో"],
        "what": ["ఏమి", "ఏమిటి", "ఎం"],
        "when": ["ఎప్పుడు", "ఎప్పుడో"],
        "where": ["ఎక్కడ", "ఎక్కడో"],
        "why": ["ఎందుకు", "ఎందుకో"],
        "how": ["ఎలా", "ఎలాగ"],
        "which": ["ఏది", "ఏ"],

        # Common words
        "yes": ["అవును", "అవునండి", "ఔను"],
        "no": ["కాదు", "లేదు"],
        "hello": ["హలో", "నమస్కారం", "వందనం"],
        "thank": ["ధన్యవాదాలు", "కృతజ్ఞతలు"],
        "please": ["దయచేసి", "చేయండి"],
        "sorry": ["క్షమించండి", "సారీ"],

        # Names (common)
        "rama": ["రామ", "రాముడు"],
        "raama": ["రామ"],  # Alternative spelling
        "krishna": ["కృష్ణ", "కృష్ణుడు"],
        "sita": ["సీత"],
        "lakshmi": ["లక్ష్మి"],
        "venkatesh": ["వెంకటేశ్", "వెంకటేశ్వర"],
        "narayana": ["నారాయణ"],

        # Family
        "mother": ["అమ్మ", "తల్లి"],
        "father": ["నాన్న", "తండ్రి"],
        "brother": ["అన్న", "తమ్ముడు"],
        "sister": ["అక్క", "చెల్లి"],
        # కొడుకు is "son"; మగవాడు ("male person") is kept only as a loose match.
        "son": ["కొడుకు", "మగవాడు"],
        # కూతురు is "daughter"; అమ్మాయి ("girl") is kept only as a loose match.
        "daughter": ["కూతురు", "అమ్మాయి"],
        "uncle": ["చిన్నాన్న", "పెదనాన్న"],
        # పెద్దమ్మ (mother's elder sister) replaces a word that means
        # father's elder brother, i.e. an uncle.
        "aunt": ["పిన్ని", "పెద్దమ్మ"],

        # Numbers
        "one": ["ఒకటి"],
        "two": ["రెండు"],
        "three": ["మూడు"],
        "four": ["నాలుగు"],
        "five": ["ఐదు"],
        "six": ["ఆరు"],
        "seven": ["ఏడు"],
        "eight": ["ఎనిమిది"],
        "nine": ["తొమ్మిది"],
        "ten": ["పది"],

        # Colors
        "red": ["ఎర్ర"],
        "blue": ["నీలం"],
        "green": ["పచ్చ"],
        "yellow": ["పసుపు"],
        "white": ["తెలుపు"],
        "black": ["నలుపు"],

        # Days
        "monday": ["సోమవారం"],
        "tuesday": ["మంగళవారం"],
        "wednesday": ["బుధవారం"],
        "thursday": ["గురువారం"],
        "friday": ["శుక్రవారం"],
        "saturday": ["శనివారం"],
        "sunday": ["ఆదివారం"],
    }
731
+
732
+
733
def get_reverse_semantic_dictionary():
    """Invert the semantic dictionary into Telugu -> [English, ...].

    A Telugu word listed under several English entries collects all of
    them, in the order the forward dictionary yields them.
    """
    inverted = {}
    for english_word, telugu_words in get_semantic_dictionary().items():
        for telugu_word in telugu_words:
            inverted.setdefault(telugu_word, []).append(english_word)
    return inverted
745
+
746
+
747
def normalize_for_matching(text: str) -> str:
    """Return text lower-cased and with surrounding whitespace removed.

    This is the key form used for semantic dictionary lookups.
    """
    lowered = text.lower()
    return lowered.strip()
750
+
751
+
752
def semantic_match(text: str) -> dict:
    """Look up dictionary matches for text in either language.

    The input counts as Telugu when any character lies in the Telugu block
    (U+0C00–U+0C7F); otherwise it is handled as English.

    Returns:
        {
            'input': the text exactly as passed in,
            'detected_language': 'english' or 'telugu',
            'matches': dictionary matches for the normalized text; for
                English input with no match, a one-item list holding the
                transliteration,
            'transliteration': the normalized text transliterated into the
                other script
        }
    """
    lookup_key = normalize_for_matching(text)

    contains_telugu = False
    for ch in text:
        if '\u0C00' <= ch <= '\u0C7F':
            contains_telugu = True
            break

    if not contains_telugu:
        # English → Telugu
        found = get_semantic_dictionary().get(lookup_key, [])
        romanized_to_telugu = eng_to_telugu(lookup_key)
        return {
            'input': text,
            'detected_language': 'english',
            'matches': found or [romanized_to_telugu],
            'transliteration': romanized_to_telugu,
        }

    # Telugu → English
    found = get_reverse_semantic_dictionary().get(lookup_key, [])
    return {
        'input': text,
        'detected_language': 'telugu',
        'matches': found,
        'transliteration': telugu_to_eng(lookup_key),
    }
793
+
794
+
795
def bidirectional_search(query: str) -> list:
    """Search the dictionary with an English or Telugu query.

    The result is a list of (label, word) pairs without duplicates, in
    first-seen order:

    * English query: ("English", query), then for every match a
      ("Telugu", match) pair followed by ("Transliteration", romanized match).
    * Telugu query: ("Telugu", query), ("Transliteration", romanized query),
      then an ("English", match) pair for every match.

    Example:
        bidirectional_search("who") starts with
            ("English", "who"), ("Telugu", "ఎవరు"), ...
    """
    info = semantic_match(query)
    pairs = []

    if info['detected_language'] != 'english':
        # Input was Telugu
        pairs.append(("Telugu", info['input']))
        pairs.append(("Transliteration", info['transliteration']))
        pairs.extend(("English", word) for word in info['matches'])
    else:
        # Input was English
        pairs.append(("English", info['input']))
        for word in info['matches']:
            pairs.append(("Telugu", word))
            pairs.append(("Transliteration", telugu_to_eng(word)))

    # dict.fromkeys keeps the first occurrence of each pair, in order.
    return list(dict.fromkeys(pairs))
833
+
834
+
835
+ # ============================================================================
836
+ # PART 4: SENTENCE HANDLING
837
+ # ============================================================================
838
+
839
def eng_to_telugu_sentence(sentence: str, style: str = "modern") -> str:
    """Transliterate a sentence word by word.

    The sentence is split into tokens that together cover every character:
    runs of Telugu characters, runs of other letters, runs of digits, runs
    of whitespace, and single leftover characters (punctuation, symbols,
    underscores). Letter tokens are looked up in the semantic dictionary and
    otherwise transliterated with the given style; every other token is
    copied through unchanged, so nothing from the input is dropped.

    Args:
        sentence: English sentence to transliterate.
        style: Alphabet style ("modern", "classical", or "hybrid").

    Returns:
        Telugu sentence with spaces, digits and punctuation preserved.

    Example:
        eng_to_telugu_sentence("who is rama")  # starts with "ఎవరు "
    """
    import re

    forward_dict = get_semantic_dictionary()

    # Alternatives, in priority order:
    #   1. Telugu block run
    #   2. run of letters outside the Telugu block ([^\W\d_] = letters only)
    #   3. run of digits
    #   4. run of whitespace
    #   5. any single remaining character
    # The final catch-all guarantees that every character ends up in a token.
    tokens = re.findall(
        r'[\u0C00-\u0C7F]+|(?:(?![\u0C00-\u0C7F])[^\W\d_])+|\d+|\s+|.',
        sentence,
        flags=re.UNICODE,
    )
    result = []

    for token in tokens:
        if any('\u0C00' <= c <= '\u0C7F' for c in token):
            # Already Telugu
            result.append(token)
        elif token.isalpha():
            # Word: dictionary first, then transliteration.
            norm = normalize_for_matching(token)
            if norm in forward_dict:
                result.append(forward_dict[norm][0])
            else:
                result.append(eng_to_telugu_with_style(token, style))
        else:
            # Digits, whitespace, punctuation or other symbol
            result.append(token)

    return "".join(result)
882
+
883
+
884
+ # ============================================================================
885
+ # PART 5: WORD VARIATIONS
886
+ # ============================================================================
887
+
888
def generate_word_variations(word: str) -> list:
    """Generate single-edit spelling variations of a Telugu word.

    Each character that has an entry in the substitution table is replaced,
    one position at a time, by each of its alternatives. Only one
    substitution is applied per variation; combinations are not generated.

    Args:
        word: The Telugu word to generate variations for.

    Returns:
        A sorted list of distinct variations, always including word itself.
    """
    # Character -> alternative spellings (mostly consonant doubling).
    rules = {
        'ా': ['', 'ె'],
        'ట': ['ట్ట', 'త', 'త్త'],
        'ర': ['ర్'],
        'డ': ['డ్డ'],
        'క': ['క్క'],
        'ప': ['ప్ప'],
        'త': ['త్త'],
        'చ': ['చ్చ'],
        'ల': ['ల్ల'],
        'మ': ['మ్మ'],
        'వ': ['వ్వ'],
        'గ': ['గ్గ'],
        'బ': ['బ్బ'],
        'స': ['స్స'],
    }

    variations = {word}
    for i, char in enumerate(word):
        for replacement in rules.get(char, ()):
            variations.add(word[:i] + replacement + word[i + 1:])

    return sorted(variations)
929
+
930
+
931
+ # ============================================================================
932
+ # MAIN - QUICK TEST
933
+ # ============================================================================
934
+
935
if __name__ == "__main__":
    # Smoke test: prints sample conversions; nothing is asserted.
    divider = "=" * 80

    print(divider)
    print("TELUGU LIBRARY v2.0 - QUICK TEST")
    print(divider)

    print("\n1. Basic Transliteration:")
    for word in ("rama", "krishna", "bhagvaan", "who", "ఎవరు", "hello", "హలో"):
        word_is_telugu = any('\u0C00' <= ch <= '\u0C7F' for ch in word)
        if not word_is_telugu:
            # English input
            print(f" English: {word:15} → Telugu: {eng_to_telugu(word)}")
        else:
            # Telugu input
            print(f" Telugu: {word:15} → English: {telugu_to_eng(word)}")

    print("\n2. Semantic Matching:")
    for word in ("who", "ఎవరు", "mother", "అమ్మ"):
        match_info = semantic_match(word)
        print(f" Input: {word:15}")
        print(f" Language: {match_info['detected_language']}")
        print(f" Matches: {match_info['matches']}")

    print("\n3. Sentence Transliteration:")
    for phrase in ("hello world", "who is rama", "thank you"):
        print(f" English: {phrase:20} → Telugu: {eng_to_telugu_sentence(phrase)}")

    print("\n" + divider)
    print("Test complete!")
    print(divider)