telugu-language-tools 4.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telugu_language_tools-4.0.2.dist-info/METADATA +956 -0
- telugu_language_tools-4.0.2.dist-info/RECORD +14 -0
- telugu_language_tools-4.0.2.dist-info/WHEEL +5 -0
- telugu_language_tools-4.0.2.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-4.0.2.dist-info/top_level.txt +1 -0
- telugu_lib/__init__.py +197 -0
- telugu_lib/advanced.py +717 -0
- telugu_lib/cluster_generator.py +399 -0
- telugu_lib/context_rules.py +568 -0
- telugu_lib/enhanced_dictionary.py +516 -0
- telugu_lib/iso15919_mappings.py +430 -0
- telugu_lib/sentence_tools.py +214 -0
- telugu_lib/text_tools.py +108 -0
- telugu_lib/transliterate.py +972 -0
|
@@ -0,0 +1,972 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Telugu Library v2.0 - Comprehensive Transliteration Engine
|
|
3
|
+
===========================================================
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- Old vs New Telugu Alphabet Support (Classical, Modern, Hybrid)
|
|
7
|
+
- Bidirectional Transliteration (English ↔ Telugu)
|
|
8
|
+
- Semantic Word Mapping (English ↔ Telugu meanings)
|
|
9
|
+
- Universal Search (works for both languages)
|
|
10
|
+
|
|
11
|
+
Examples:
|
|
12
|
+
eng_to_telugu("krishna") → కృష్ణ
|
|
13
|
+
telugu_to_eng("కృష్ణ") → krishna
|
|
14
|
+
semantic_match("who")["matches"] → ["ఎవరు", "ఎవరో"]
|
|
15
|
+
eng_to_telugu_with_style("rama", "modern") → రామ
|
|
16
|
+
eng_to_telugu_with_style("rama", "classical") → రామ
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
# ============================================================================
|
|
20
|
+
# PART 1: ENGLISH → TELUGU TRANSLITERATION
|
|
21
|
+
# ============================================================================
|
|
22
|
+
|
|
23
|
+
def normalize_roman_input(text: str) -> str:
    """Replace supported diacritic letters with plain ASCII spellings.

    Long vowels become doubled vowels, dotted consonants lose their dot,
    both sibilants become ``sh`` and vocalic r becomes ``ri``. Characters
    that are not in the table are left untouched.

    Args:
        text: Romanized input, possibly containing diacritics.

    Returns:
        The text with every listed diacritic letter substituted.
    """
    # (diacritic letter, ASCII replacement) pairs, applied in this order.
    diacritic_pairs = (
        ('ā', 'aa'), ('ē', 'ee'), ('ī', 'ii'), ('ō', 'oo'), ('ū', 'uu'),
        ('ṁ', 'm'), ('ṇ', 'n'), ('ḍ', 'd'), ('ṭ', 't'), ('ś', 'sh'),
        ('ṣ', 'sh'), ('ṛ', 'ri'),
    )
    normalized = text
    for marked, plain in diacritic_pairs:
        normalized = normalized.replace(marked, plain)
    return normalized
|
|
33
|
+
|
|
34
|
+
def eng_to_telugu_base(text: str, rules: dict) -> str:
    """Core transliteration engine driven by a rule table (no dictionary lookup).

    The input is lower-cased, passed through ``normalize_roman_input`` and
    stripped, then scanned left to right. At each position the first rule
    that applies wins, in this order: vocalic-r detection, clusters
    (longest key first, 5 down to 2 characters), 2-character vowels,
    2-character consonants, single vowels, single consonants, and finally
    pass-through of unknown characters.

    Args:
        text: Romanized text. ``None`` is treated as an empty string.
        rules: Mapping with the optional keys ``"consonants"``, ``"vowels"``,
            ``"matras"`` (each ``str -> str``), ``"clusters"``
            (``str -> list of consonant keys``) and ``"strip_final_virama"``
            (bool, defaults to ``True``).

    Returns:
        The transliterated string.
    """
    # Guard against None *before* normalising, and lower-case first so that
    # upper-case diacritics (e.g. "Ā") are also covered by the lower-case-only
    # replacement table in normalize_roman_input.
    text = normalize_roman_input((text or "").lower()).strip()

    consonants = rules.get("consonants", {})
    vowels = rules.get("vowels", {})
    matras = rules.get("matras", {})
    clusters = rules.get("clusters", {})
    strip_final_virama = rules.get("strip_final_virama", True)

    virama = "్"
    vocalic_r_sign = "ృ"

    result = []
    i = 0
    n = len(text)
    prev_cons = False  # True while the last emitted unit is a bare consonant

    def emit_consonant(key: str, join_prev=False):
        """Append the glyph for *key*, preceded by a virama when joining."""
        nonlocal prev_cons
        if join_prev and prev_cons:
            result.append(virama)
        # Unknown keys are emitted verbatim.
        result.append(consonants.get(key, key))
        prev_cons = True

    while i < n:
        chunk2 = text[i:i + 2]
        single = text[i]

        # Vocalic r: an 'r' right after a consonant, followed (after any run
        # of vowels) by another consonant, is written as the ృ sign.
        if prev_cons and single == 'r':
            lookahead = i + 1
            while lookahead < n and text[lookahead] in 'aeiou':
                lookahead += 1
            if lookahead < n and text[lookahead] in 'bcdfghjklmnpqrstvwxyz':
                result.append(vocalic_r_sign)
                # prev_cons deliberately stays True here.
                i += 1
                continue

        # Clusters, longest match first. Slices near the end of the text are
        # shorter than the nominal size; the index still advances by the
        # nominal size, which simply ends the loop.
        cluster_len = next(
            (size for size in (5, 4, 3, 2) if text[i:i + size] in clusters),
            0,
        )
        if cluster_len:
            if prev_cons:
                result.append(virama)
            for idx, ck in enumerate(clusters[text[i:i + cluster_len]]):
                emit_consonant(ck, join_prev=(idx > 0))
            i += cluster_len
            continue

        # Two-character vowel: matra after a consonant, full letter otherwise.
        if chunk2 in vowels:
            if prev_cons:
                result.append(matras.get(chunk2, ""))
            else:
                result.append(vowels[chunk2])
            prev_cons = False
            i += 2
            continue

        # Two-character consonant (aspirates, "sh", ...).
        if chunk2 in consonants:
            if prev_cons:
                result.append(virama)
            emit_consonant(chunk2)
            i += 2
            continue

        # Single vowel.
        if single in vowels:
            if single == "a" and prev_cons:
                # 'a' after a consonant is the inherent vowel: emit nothing.
                prev_cons = False
                i += 1
                continue

            # A vowel directly after ృ is written as a full letter even though
            # prev_cons is still True, and it does not reset prev_cons.
            is_after_vocalic_r = (result and result[-1] == vocalic_r_sign)

            if prev_cons and not is_after_vocalic_r:
                result.append(matras.get(single, ""))
                prev_cons = False
            else:
                result.append(vowels[single])
                if not is_after_vocalic_r:
                    prev_cons = False
            i += 1
            continue

        # Single consonant.
        if single in consonants:
            if prev_cons:
                result.append(virama)
            emit_consonant(single)
            i += 1
            continue

        # Unknown character (space, digit, punctuation, ...): pass through.
        result.append(single)
        prev_cons = False
        i += 1

    # NOTE(review): every virama appended above is immediately followed by a
    # consonant unless a cluster maps to an empty list, so this strip looks
    # like it rarely (if ever) fires - confirm the intended final-virama
    # behaviour with the callers that pass strip_final_virama=False.
    if strip_final_virama and result and result[-1] == virama:
        result.pop()

    return "".join(result)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def get_base_consonants(style="modern"):
    """Return the roman-key -> Telugu consonant table for a style.

    Args:
        style: ``"classical"`` (old alphabet, includes archaic letters),
            ``"hybrid"`` (modern table but with the archaic ఱ for ``rr``,
            matching the hybrid option of ``eng_to_telugu_old_new_options``)
            or ``"modern"``. Any other value is treated as ``"modern"``.

    Returns:
        A new dict on every call.
    """
    # Consonants shared by every style.
    common = {
        "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
        "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
        "t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
        "tt": "ట", "tth": "ఠ", "dd": "డ", "ddh": "ఢ", "nn": "ణ",
        "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
        "y": "య", "l": "ల", "v": "వ", "w": "వ",
        "sh": "ష", "shh": "శ", "s": "స", "h": "హ",
    }

    if style == "classical":
        # OLD alphabet includes archaic letters.
        return {
            **common,
            "r": "ర",
            "rr": "ఱ",    # Retroflex R (archaic)
            "ll": "ళ",    # Retroflex L (still used)
            "lll": "ఴ",   # Voiced retroflex fricative (obsolete)
            "nga": "ఙ",   # Velar nasal (archaic)
            "nya": "ఞ",   # Palatal nasal (archaic)
            "nna": "ణ",   # Retroflex nasal
        }

    # NEW alphabet (modern/reformed); also the base for "hybrid".
    table = {
        **common,
        "r": "ర",      # Single R for both
        "rr": "ర్ర",    # Double R as conjunct
        "ll": "ళ",     # Retroflex L (retained)
        "nga": "న",    # Merged with dental N
        "nya": "న",    # Merged with dental N
        "nna": "ణ",    # Retroflex nasal (retained)
    }
    if style == "hybrid":
        # Previously "hybrid" silently fell through to the modern table even
        # though callers forward it here; keep the archaic retroflex R.
        table["rr"] = "ఱ"
    return table
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def get_base_vowels(style="modern"):
    """Return the roman-key -> independent Telugu vowel letter table.

    Args:
        style: ``"classical"`` adds the archaic vocalic-l letters; any other
            value yields the modern table.

    Returns:
        A new dict on every call.
    """
    table = {
        "aa": "ఆ", "a": "అ",
        "ii": "ఈ", "i": "ఇ",
        "uu": "ఊ", "u": "ఉ",
        "ai": "ఐ", "au": "ఔ",
        "am": "ం", "ah": "ః",
        "ri": "ఋ", "rii": "ౠ",
        # Short/long e and o are identical in both styles.
        "e": "ఎ",    # Short e
        "ee": "ఏ",   # Long ē
        "o": "ఒ",    # Short o
        "oo": "ఓ",   # Long ō
    }

    if style == "classical":
        # Independent LETTERS (not the combining vowel signs, which belong in
        # the matra table): U+0C0C and U+0C61.
        table["li"] = "\u0C0C"    # ఌ  Vocalic l̥ (archaic)
        table["lii"] = "\u0C61"   # ౡ  Vocalic l̥̄ (archaic)

    return table
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def get_base_matras(style="modern"):
    """Return the roman-key -> Telugu vowel sign (matra) table.

    Args:
        style: ``"classical"`` adds the archaic vocalic-l signs; any other
            value yields the shared table unchanged.
    """
    signs = {
        "aa": "ా", "a": "",
        "ii": "ీ", "i": "ి",
        "uu": "ూ", "u": "ు",
        "ee": "ే", "e": "ె",
        "oo": "ో", "o": "ొ",
        "ai": "ై", "au": "ౌ",
        "am": "ం", "ah": "ః",
        "ri": "ృ", "rii": "ౄ",
    }

    if style != "classical":
        return signs

    archaic_signs = {
        "li": "ౢ",    # Vocalic l̥ matra (archaic)
        "lii": "ౣ",   # Vocalic l̥̄ matra (archaic)
    }
    return {**signs, **archaic_signs}
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def get_clusters(style="modern"):
    """Return roman cluster spellings mapped to their consonant-key sequence.

    The ``style`` argument is accepted but not consulted: the same table is
    returned for every style.
    """
    # Clusters whose spelling includes the trailing inherent 'a'.
    table = {
        "ksha": ["k", "sh"],
        "jna": ["j", "n"],
        "shna": ["sh", "n"],
        "shra": ["sh", "r"],
        "bhra": ["bh", "r"],
        "gva": ["g", "v"],
    }
    # Plain "<consonant> + r" conjuncts: kr, tr, dr, gr, pr, br, vr, nr, sr.
    for lead in "ktdgpbvns":
        table[lead + "r"] = [lead, "r"]
    return table
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def eng_to_telugu_with_style(text: str, style: str = "modern") -> str:
    """Convert English/roman text to Telugu using the requested alphabet style.

    A known word (per ``get_semantic_dictionary``) is answered with its first
    dictionary entry; anything else goes through ``eng_to_telugu_base``.

    Args:
        text: English text.
        style: "modern" (default), "classical", or "hybrid". Any other value
            selects the modern letter tables.

    Returns:
        Telugu text, or "" for empty/whitespace-only input.

    Raises:
        ValueError: If ``text`` is None or longer than 10000 characters.
        TypeError: If ``text`` is not a str.
    """
    # --- input validation -------------------------------------------------
    if text is None:
        raise ValueError("Input text cannot be None")
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text).__name__}")
    if not text.strip():
        return ""
    if len(text) > 10000:
        raise ValueError("Input text too long (max 10000 characters)")

    # --- 1. dictionary lookup ---------------------------------------------
    lookup_key = normalize_for_matching(text)
    known_words = get_semantic_dictionary()
    if lookup_key in known_words:
        return known_words[lookup_key][0]

    # --- 2. pure transliteration ------------------------------------------
    recognised = ("modern", "classical", "hybrid")
    table_style = style if style in recognised else "modern"
    return eng_to_telugu_base(text, {
        "consonants": get_base_consonants(table_style),
        "vowels": get_base_vowels(table_style),
        "matras": get_base_matras(table_style),
        "clusters": get_clusters("modern"),
        # Only the literal "modern"/"hybrid" styles strip a final virama.
        "strip_final_virama": style in ("modern", "hybrid"),
    })
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def eng_to_telugu_old_new_options(text: str) -> list:
    """Transliterate ``text`` under the modern, classical and hybrid rule sets.

    Args:
        text: English text to transliterate.

    Returns:
        List of ``(telugu_text, style_description)`` tuples in the order
        modern, classical, hybrid. A later style whose output equals an
        earlier one is omitted.
    """

    def build_rules(style, strip_final_virama, consonant_overrides=None):
        """Assemble a rule table for eng_to_telugu_base."""
        consonants = get_base_consonants(style)
        if consonant_overrides:
            consonants = {**consonants, **consonant_overrides}
        return {
            "consonants": consonants,
            "vowels": get_base_vowels(style),
            "matras": get_base_matras(style),
            "clusters": get_clusters(style),
            "strip_final_virama": strip_final_virama,
        }

    variants = [
        # Style 1: MODERN (New Telugu)
        (build_rules("modern", True), "Modern Telugu (Reformed)"),
        # Style 2: CLASSICAL (Old Telugu) - final virama is not stripped
        (build_rules("classical", False), "Classical Telugu (Pre-reform)"),
        # Style 3: HYBRID - modern tables with two consonants overridden
        (build_rules("modern", True, {"rr": "ఱ", "ll": "ళ"}),
         "Hybrid (Modern with some archaic letters)"),
    ]

    unique_options = []
    already_emitted = set()
    for rule_table, description in variants:
        rendered = eng_to_telugu_base(text, rule_table)
        if rendered in already_emitted:
            continue
        already_emitted.add(rendered)
        unique_options.append((rendered, description))

    return unique_options
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def compare_old_new_alphabets():
    """Print a fixed comparison table of old vs new Telugu letters to stdout.

    Takes no arguments and returns None; the table and the summary lines
    are hard-coded.
    """
    banner = "=" * 80

    header_rows = [
        ("Letter", "Old (Classical)", "New (Modern)", "Status", "Notes"),
        ("-" * 15, "-" * 20, "-" * 20, "-" * 12, "-" * 30),
    ]

    vowel_rows = [
        ("a", "అ", "అ", "Unchanged", "Short vowel"),
        ("ā", "ఆ", "ఆ", "Unchanged", "Long vowel"),
        ("i", "ఇ", "ఇ", "Unchanged", "Short vowel"),
        ("ī", "ఈ", "ఈ", "Unchanged", "Long vowel"),
        ("u", "ఉ", "ఉ", "Unchanged", "Short vowel"),
        ("ū", "ఊ", "ఊ", "Unchanged", "Long vowel"),
        ("r̥", "ఋ", "ఋ", "Unchanged", "Vocalic R"),
        ("l̥", "ౢ", "(obsolete)", "Removed", "Vocalic L - archaic"),
        ("e", "ఎ", "ఎ", "Unchanged", "Short E"),
        ("ē", "ఏ", "ఏ", "Unchanged", "Long E"),
        ("ai", "ఐ", "ఐ", "Unchanged", "Diphthong"),
        ("o", "ఒ", "ఒ", "Unchanged", "Short O"),
        ("ō", "ఓ", "ఓ", "Unchanged", "Long O"),
        ("au", "ఔ", "ఔ", "Unchanged", "Diphthong"),
    ]

    # Blank separator row between the vowel and consonant sections.
    spacer_rows = [("", "", "", "", "")]

    consonant_rows = [
        ("ka", "క", "క", "Unchanged", "Velar"),
        ("ṅa", "ఙ", "(merged→న)", "Rare", "Velar nasal"),
        ("cha", "చ", "చ", "Unchanged", "Palatal"),
        ("ña", "ఞ", "(merged→న)", "Rare", "Palatal nasal"),
        ("ṭa", "ట", "ట", "Unchanged", "Retroflex"),
        ("ṇa", "ణ", "ణ", "Unchanged", "Retroflex nasal"),
        ("ta", "త", "త", "Unchanged", "Dental"),
        ("na", "న", "న", "Unchanged", "Dental nasal"),
        ("pa", "ప", "ప", "Unchanged", "Labial"),
        ("ya", "య", "య", "Unchanged", "Semivowel"),
        ("ra", "ర", "ర", "Unchanged", "Alveolar"),
        ("ṟa", "ఱ", "(rare)", "Archaic", "Retroflex R"),
        ("la", "ల", "ల", "Unchanged", "Dental lateral"),
        ("ḷa", "ళ", "ళ", "Unchanged", "Retroflex lateral"),
        ("ḻa", "ఴ", "(obsolete)", "Removed", "Fricative - Tamil loan"),
        ("va", "వ", "వ", "Unchanged", "Labial"),
        ("śa", "శ", "శ", "Unchanged", "Palatal sibilant"),
        ("ṣa", "ష", "ష", "Unchanged", "Retroflex sibilant"),
        ("sa", "స", "స", "Unchanged", "Dental sibilant"),
        ("ha", "హ", "హ", "Unchanged", "Glottal"),
    ]

    print("\n" + banner)
    print("OLD vs NEW TELUGU ALPHABET COMPARISON")
    print(banner)

    for letter, old, new, status, notes in (
        header_rows + vowel_rows + spacer_rows + consonant_rows
    ):
        print(f"{letter:15} {old:20} {new:20} {status:12} {notes:30}")

    print("\n" + banner)
    print("SUMMARY:")
    print("  • Modern Telugu has 56 letters (16 vowels + 36 consonants + 4 modifiers)")
    print("  • Classical Telugu had ~60+ letters including archaic forms")
    print("  • Letters ఱ (ṟa), ఴ (ḻa), ౘ, ౙ, ౚ are now obsolete or very rare")
    print("  • Nasals ఙ (ṅa) and ఞ (ña) mostly merged into న (na) in modern usage")
    print(banner + "\n")
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
# Main function for backwards compatibility (v0.9)
|
|
471
|
+
def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
    """Convert English/roman text to Telugu (backwards-compatible entry point).

    A known word (per ``get_semantic_dictionary``) is answered with its first
    dictionary entry, e.g. ``eng_to_telugu("krishna")`` returns the first
    entry listed for "krishna". Anything else is transliterated by
    ``eng_to_telugu_base`` with the reduced letter tables defined below and
    the shared cluster table from ``get_clusters``.

    Args:
        text: English text.
        strip_final_virama: Forwarded to the engine as the
            ``"strip_final_virama"`` rule.

    Returns:
        Telugu text ("" for empty input).

    Raises:
        ValueError: If ``text`` is None.
        TypeError: If ``text`` is not a str.
    """
    # Validate like the sibling entry points instead of failing later with an
    # opaque AttributeError inside normalize_for_matching.
    if text is None:
        raise ValueError("Input text cannot be None")
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text).__name__}")

    # 1. Check semantic dictionary first for known words.
    norm_text = normalize_for_matching(text)
    forward_dict = get_semantic_dictionary()

    if norm_text in forward_dict:
        # Found a known word. Return its first meaning.
        return forward_dict[norm_text][0]

    # 2. No match found. Fall back to pure transliteration.

    consonants = {
        "k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
        "ch": "చ", "jh": "ఝ", "j": "జ",
        "t": "త", "th": "థ", "d": "ద", "dh": "ధ",
        "n": "న", "p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
        "y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
        "sh": "ష", "s": "స", "h": "హ",
    }

    vowels = {
        "aa": "ఆ", "a": "అ", "ii": "ఈ", "i": "ఇ",
        "uu": "ఊ", "u": "ఉ", "ee": "ఏ", "e": "ఎ",
        "oo": "ఓ", "o": "ఒ", "ai": "ఐ", "au": "ఔ",
        "am": "ం", "ah": "ః",
    }

    matras = {
        "aa": "ా", "a": "", "ii": "ీ", "i": "ి",
        "uu": "ూ", "u": "ు", "ee": "ే", "e": "ె",
        "oo": "ో", "o": "ొ", "ai": "ై", "au": "ౌ",
        "am": "ం", "ah": "ః",
    }

    rules = {
        "consonants": consonants,
        "vowels": vowels,
        "matras": matras,
        # Same table that used to be duplicated inline here.
        "clusters": get_clusters("modern"),
        "strip_final_virama": strip_final_virama,
    }

    return eng_to_telugu_base(text, rules)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
# ============================================================================
|
|
559
|
+
# PART 2: TELUGU → ENGLISH TRANSLITERATION
|
|
560
|
+
# ============================================================================
|
|
561
|
+
|
|
562
|
+
def telugu_to_eng(text: str) -> str:
    """Convert Telugu script to a romanized (English-letter) spelling.

    Each consonant is written with its inherent ``a`` unless it is followed
    by a virama (no vowel) or by a vowel sign (that vowel instead).
    Independent vowels, anusvara and visarga are mapped directly; any other
    character (space, punctuation, Latin text, ...) is copied unchanged.

    Example:
        ఎవరు → evaru
        కమల → kamala
        రామ → raama

    Args:
        text: Telugu text.

    Returns:
        The romanized text, or "" for empty/whitespace-only input.

    Raises:
        ValueError: If ``text`` is None or longer than 10000 characters.
        TypeError: If ``text`` is not a str.
    """
    # Input validation
    if text is None:
        raise ValueError("Input text cannot be None")
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text).__name__}")
    if not text or not text.strip():
        return ""
    if len(text) > 10000:
        raise ValueError("Input text too long (max 10000 characters)")

    # Reverse mapping: Telugu → English
    reverse_vowels = {
        "అ": "a", "ఆ": "aa", "ఇ": "i", "ఈ": "ii",
        "ఉ": "u", "ఊ": "uu", "ఋ": "ri", "ౠ": "rii",
        "ఎ": "e", "ఏ": "ee", "ఐ": "ai",
        "ఒ": "o", "ఓ": "oo", "ఔ": "au",
        "ం": "m", "ః": "h",
    }

    reverse_consonants = {
        "క": "k", "ఖ": "kh", "గ": "g", "ఘ": "gh", "ఙ": "ng",
        "చ": "ch", "ఛ": "chh", "జ": "j", "ఝ": "jh", "ఞ": "ny",
        "ట": "tt", "ఠ": "tth", "డ": "dd", "ఢ": "ddh", "ణ": "nn",
        "త": "t", "థ": "th", "ద": "d", "ధ": "dh", "న": "n",
        "ప": "p", "ఫ": "ph", "బ": "b", "భ": "bh", "మ": "m",
        "య": "y", "ర": "r", "ల": "l", "ళ": "ll", "వ": "v",
        "శ": "sh", "ష": "sh", "స": "s", "హ": "h",
    }

    reverse_matras = {
        "ా": "aa", "ి": "i", "ీ": "ii", "ు": "u", "ూ": "uu",
        "ృ": "ri", "ౄ": "rii", "ె": "e", "ే": "ee",
        "ై": "ai", "ొ": "o", "ో": "oo", "ౌ": "au",
    }

    virama = "్"

    result = []
    i = 0
    n = len(text)

    while i < n:
        char = text[i]

        if char in reverse_consonants:
            result.append(reverse_consonants[char])
            i += 1

            # The character AFTER the consonant decides its vowel. The old
            # code decided from the character before it, which dropped the
            # inherent 'a' after any consonant and let a matra overwrite the
            # consonant itself.
            following = text[i] if i < n else ""
            if following == virama:
                i += 1                      # vowel suppressed
            elif following in reverse_matras:
                result.append(reverse_matras[following])
                i += 1                      # explicit vowel sign
            else:
                result.append("a")          # inherent vowel
            continue

        if char in reverse_vowels:
            result.append(reverse_vowels[char])
            i += 1
            continue

        if char == virama:
            # Stray virama not attached to a consonant: nothing to emit.
            i += 1
            continue

        # Unknown character (space, punctuation, etc.)
        result.append(char)
        i += 1

    return "".join(result)
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
# ============================================================================
|
|
657
|
+
# PART 3: SEMANTIC WORD MAPPING (English ↔ Telugu)
|
|
658
|
+
# ============================================================================
|
|
659
|
+
|
|
660
|
+
def get_semantic_dictionary():
    """Build the English → Telugu word table.

    Returns:
        A new dict ``{english_word: [telugu1, telugu2, ...]}`` on every call.
        Callers that want a single translation take the first list entry.
    """
    question_words = {
        "who": ["ఎవరు", "ఎవరో"],
        "what": ["ఏమి", "ఏమిటి", "ఎం"],
        "when": ["ఎప్పుడు", "ఎప్పుడో"],
        "where": ["ఎక్కడ", "ఎక్కడో"],
        "why": ["ఎందుకు", "ఎందుకో"],
        "how": ["ఎలా", "ఎలాగ"],
        "which": ["ఏది", "ఏ"],
    }

    common_words = {
        "yes": ["అవును", "అవునండి", "ఔను"],
        "no": ["కాదు", "లేదు"],
        "hello": ["హలో", "నమస్కారం", "వందనం"],
        "thank": ["ధన్యవాదాలు", "కృతజ్ఞతలు"],
        "please": ["దయచేసి", "చేయండి"],
        "sorry": ["క్షమించండి", "సారీ"],
    }

    names = {
        "rama": ["రామ", "రాముడు"],
        "raama": ["రామ"],  # Alternative spelling
        "krishna": ["కృష్ణ", "కృష్ణుడు"],
        "sita": ["సీత"],
        "lakshmi": ["లక్ష్మి"],
        "venkatesh": ["వెంకటేశ్", "వెంకటేశ్వర"],
        "narayana": ["నారాయణ"],
    }

    family = {
        "mother": ["అమ్మ", "తల్లి"],
        "father": ["నాన్న", "తండ్రి"],
        "brother": ["అన్న", "తమ్ముడు"],
        "sister": ["అక్క", "చెల్లి"],
        "son": ["మగవాడు", "కొడుకు"],
        "daughter": ["అమ్మాయి", "కూతురు"],
        "uncle": ["చిన్నాన్న", "పెదనాన్న"],
        # NOTE(review): the second "aunt" entry ends in నాన్న (the first
        # "father" entry) and closely resembles the second "uncle" entry -
        # verify this translation with a Telugu speaker.
        "aunt": ["పిన్ని", "పెద్దనాన్న"],
    }

    numbers = {
        "one": ["ఒకటి"],
        "two": ["రెండు"],
        "three": ["మూడు"],
        "four": ["నాలుగు"],
        "five": ["ఐదు"],
        "six": ["ఆరు"],
        "seven": ["ఏడు"],
        "eight": ["ఎనిమిది"],
        "nine": ["తొమ్మిది"],
        "ten": ["పది"],
    }

    colors = {
        "red": ["ఎర్ర"],
        "blue": ["నీలం"],
        "green": ["పచ్చ"],
        "yellow": ["పసుపు"],
        "white": ["తెలుపు"],
        "black": ["నలుపు"],
    }

    days = {
        "monday": ["సోమవారం"],
        "tuesday": ["మంగళవారం"],
        "wednesday": ["బుధవారం"],
        "thursday": ["గురువారం"],
        "friday": ["శుక్రవారం"],
        "saturday": ["శనివారం"],
        "sunday": ["ఆదివారం"],
    }

    # Merge in the original section order; no key appears in two groups.
    return {
        **question_words,
        **common_words,
        **names,
        **family,
        **numbers,
        **colors,
        **days,
    }
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def get_reverse_semantic_dictionary():
    """Invert the semantic dictionary into ``{telugu_word: [english, ...]}``.

    English words are listed in the order they are encountered while
    walking ``get_semantic_dictionary()``.
    """
    telugu_to_english = {}
    for english_word, telugu_words in get_semantic_dictionary().items():
        for telugu_word in telugu_words:
            # Create the bucket on first sight, then record this English word.
            telugu_to_english.setdefault(telugu_word, []).append(english_word)
    return telugu_to_english
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def normalize_for_matching(text: str) -> str:
    """Return ``text`` lower-cased with surrounding whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def semantic_match(text: str) -> dict:
    """Look up ``text`` in the semantic dictionary, in either direction.

    The input counts as Telugu when any of its characters lies in the
    range U+0C00..U+0C7F; otherwise it is handled as English.

    Returns:
        A dict with the keys
        'input' (the original text),
        'detected_language' ('english' or 'telugu'),
        'matches' (dictionary hits; for English input with no hit, a
        one-element list holding the transliteration),
        'transliteration' (result of telugu_to_eng / eng_to_telugu on the
        normalized text).
    """
    text_norm = normalize_for_matching(text)

    if any(0x0C00 <= ord(ch) <= 0x0C7F for ch in text):
        # Telugu → English
        return {
            'input': text,
            'detected_language': 'telugu',
            'matches': get_reverse_semantic_dictionary().get(text_norm, []),
            'transliteration': telugu_to_eng(text_norm),
        }

    # English → Telugu
    dictionary_hits = get_semantic_dictionary().get(text_norm, [])
    transliteration = eng_to_telugu(text_norm)
    return {
        'input': text,
        'detected_language': 'english',
        'matches': dictionary_hits or [transliteration],
        'transliteration': transliteration,
    }
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
def bidirectional_search(query: str) -> list:
    """Search the dictionary with an English or Telugu query.

    Returns:
        An ordered list of unique ``(label, word)`` tuples where label is
        "English", "Telugu" or "Transliteration". For English input the
        query comes first, followed by each Telugu match and its
        romanization; for Telugu input the query and its romanization come
        first, followed by the English matches.
    """
    info = semantic_match(query)

    if info['detected_language'] == 'english':
        # Input was English
        pairs = [("English", info['input'])]
        for telugu_word in info['matches']:
            pairs.append(("Telugu", telugu_word))
            pairs.append(("Transliteration", telugu_to_eng(telugu_word)))
    else:
        # Input was Telugu
        pairs = [
            ("Telugu", info['input']),
            ("Transliteration", info['transliteration']),
        ]
        pairs.extend(("English", english_word) for english_word in info['matches'])

    # dict keys keep first-seen order, which drops later duplicates.
    return list(dict.fromkeys(pairs))
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
# ============================================================================
|
|
836
|
+
# PART 4: SENTENCE HANDLING
|
|
837
|
+
# ============================================================================
|
|
838
|
+
|
|
839
|
+
def eng_to_telugu_sentence(sentence: str, style: str = "modern") -> str:
    """
    Transliterate a complete sentence (multiple words).

    Each run of ASCII letters is treated as an English word: it is looked up
    in the semantic dictionary first and, when no entry exists, passed to
    eng_to_telugu_with_style(). Everything else (whitespace, punctuation,
    digits, text already in Telugu script, any other characters) is copied
    to the output unchanged, so no input character is ever dropped.

    Args:
        sentence: English sentence to transliterate
        style: Alphabet style ("modern", "classical", or "hybrid")

    Returns:
        Telugu sentence with spaces, punctuation and all non-English
        characters preserved

    Example (exact output depends on the dictionary and mapping tables):
        eng_to_telugu_sentence("hello world")      # "హలో వర్ల్ద"
        eng_to_telugu_sentence("who is rama")      # "ఎవరు ఇస్ రామ"
        eng_to_telugu_sentence("Who is Krishna?")  # "ఎవరు ఇస్ కృష్ణ?"
    """
    import re

    forward_dict = get_semantic_dictionary()

    # Two complementary alternatives cover every character of the input:
    # an ASCII word (captured in the "word" group) or a run of anything
    # else. The previous pattern ended in [^\w\s], which left digits,
    # underscores and non-Telugu Unicode letters unmatched, so findall()
    # silently removed them from the sentence.
    token_pattern = re.compile(r'(?P<word>[a-zA-Z]+)|[^a-zA-Z]+')

    pieces = []
    for token_match in token_pattern.finditer(sentence):
        english_word = token_match.group('word')
        if english_word is None:
            # Space, punctuation, digits, Telugu text, ...: keep verbatim.
            pieces.append(token_match.group())
            continue

        # Prefer a semantic dictionary hit; an empty entry is treated like
        # a miss instead of raising IndexError on [0].
        norm = normalize_for_matching(english_word)
        if norm in forward_dict and forward_dict[norm]:
            pieces.append(forward_dict[norm][0])
        else:
            pieces.append(eng_to_telugu_with_style(english_word, style))

    return "".join(pieces)
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
# ============================================================================
|
|
885
|
+
# PART 5: WORD VARIATIONS
|
|
886
|
+
# ============================================================================
|
|
887
|
+
|
|
888
|
+
def generate_word_variations(word: str) -> list:
    """
    Generate single-edit spelling variations for a given Telugu word.

    Every character of the word that has an entry in the substitution table
    is replaced, one position at a time, by each of its alternatives. Only
    one substitution is applied per variation; combinations of several
    substitutions are not generated.

    Args:
        word: The Telugu word to generate variations for.

    Returns:
        A sorted list of distinct spellings. The original word is always
        included, so the list is never empty.
    """
    # Character -> alternative spellings. Values as written: the long-vowel
    # sign may be dropped or swapped for another sign, ట may become its
    # doubled form or త / త్త, ర may take a trailing virama, and the
    # remaining consonants map to their doubled form.
    rules = {
        'ా': ['', 'ె'],
        'ట': ['ట్ట', 'త', 'త్త'],
        'ర': ['ర్'],
        'డ': ['డ్డ'],
        'క': ['క్క'],
        'ప': ['ప్ప'],
        'త': ['త్త'],
        'చ': ['చ్చ'],
        'ల': ['ల్ల'],
        'మ': ['మ్మ'],
        'వ': ['వ్వ'],
        'గ': ['గ్గ'],
        'బ': ['బ్బ'],
        'స': ['స్స'],
    }

    # A set removes duplicates that arise when different substitutions
    # happen to yield the same spelling.
    variations = {word}
    for i, char in enumerate(word):
        for replacement in rules.get(char, ()):
            variations.add(word[:i] + replacement + word[i + 1:])

    return sorted(variations)
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
# ============================================================================
|
|
932
|
+
# MAIN - QUICK TEST
|
|
933
|
+
# ============================================================================
|
|
934
|
+
|
|
935
|
+
if __name__ == "__main__":
    # Smoke-test demo: prints sample conversions when the module is run
    # directly; nothing here executes on import.
    banner = "=" * 80

    print(banner)
    print("TELUGU LIBRARY v2.0 - QUICK TEST")
    print(banner)

    print("\n1. Basic Transliteration:")
    for word in ["rama", "krishna", "bhagvaan", "who", "ఎవరు", "hello", "హలో"]:
        # Any code point in the Telugu block marks the sample as Telugu.
        contains_telugu = any('\u0C00' <= ch <= '\u0C7F' for ch in word)
        if contains_telugu:
            romanized = telugu_to_eng(word)
            print(f" Telugu: {word:15} → English: {romanized}")
        else:
            converted = eng_to_telugu(word)
            print(f" English: {word:15} → Telugu: {converted}")

    print("\n2. Semantic Matching:")
    for word in ["who", "ఎవరు", "mother", "అమ్మ"]:
        info = semantic_match(word)
        print(f" Input: {word:15}")
        print(f" Language: {info['detected_language']}")
        print(f" Matches: {info['matches']}")

    print("\n3. Sentence Transliteration:")
    for text in ["hello world", "who is rama", "thank you"]:
        converted = eng_to_telugu_sentence(text)
        print(f" English: {text:20} → Telugu: {converted}")

    print("\n" + banner)
    print("Test complete!")
    print(banner)
|