telugu-language-tools 5.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of telugu-language-tools might be problematic. Click here for more details.
- telugu_engine/__init__.py +190 -0
- telugu_engine/cli.py +111 -0
- telugu_engine/enhanced_tense.py +854 -0
- telugu_engine/grammar.py +474 -0
- telugu_engine/phonetic_matrix.py +82 -0
- telugu_engine/tense_engine.py +391 -0
- telugu_engine/transliterator.py +676 -0
- telugu_engine/v3_validator.py +413 -0
- telugu_language_tools-5.0.1.dist-info/METADATA +398 -0
- telugu_language_tools-5.0.1.dist-info/RECORD +13 -0
- telugu_language_tools-5.0.1.dist-info/WHEEL +5 -0
- telugu_language_tools-5.0.1.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-5.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,676 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transliterator v3.0 - Complete Template
|
|
3
|
+
========================================
|
|
4
|
+
|
|
5
|
+
This is a TEMPLATE showing what the rewritten transliterator.py should look like.
|
|
6
|
+
Copy this structure and implement the functions.
|
|
7
|
+
|
|
8
|
+
Key Features:
|
|
9
|
+
- v3.0 modern script (no archaic letters)
|
|
10
|
+
- Modern pronouns (నేను, వాళ్ళు)
|
|
11
|
+
- Long vowel support (aa → ఆ)
|
|
12
|
+
- Nasal cluster rules (nd → ండ)
|
|
13
|
+
- 100+ consonant clusters
|
|
14
|
+
- Clean, tested code
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ============================================================================
|
|
21
|
+
# SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
|
|
22
|
+
# ============================================================================
|
|
23
|
+
|
|
24
|
+
# Short vowels
|
|
25
|
+
# Short (hrasva) vowels — standalone (independent) forms, used word-initially.
VOWELS = {
    'a': 'అ',   # a (short)
    'i': 'ఇ',   # i (short)
    'u': 'ఉ',   # u (short)
    'e': 'ఎ',   # e (short)
    'o': 'ఒ',   # o (short)
}

# Long (deergha) vowels — standalone forms (v3.0 critical).
LONG_VOWELS = {
    'aa': 'ఆ',  # Long ā (CRITICAL FIX: was broken)
    'ii': 'ఈ',  # Long ī
    'uu': 'ఊ',  # Long ū
    'ee': 'ఏ',  # Long ē
    'oo': 'ఓ',  # Long ō (CRITICAL FIX: was 'ఊ')
}

# Diphthongs plus the two post-vowel signs (anusvara / visarga).
DIPHTHONGS = {
    'ai': 'ఐ',  # ai
    'au': 'ఔ',  # au
    'am': 'ం',  # anusvara (nasalization)
    'ah': 'ః',  # visarga
}

# Every standalone vowel form in one lookup table.
# NOTE: later entries win on key collision, but the three dicts share no keys.
ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}

# Dependent vowel signs (matras) — the form a vowel takes after a consonant.
# NOTE(review): this table is defined but not referenced by any function in
# this file — the mapping passes below append the bare matra/standalone
# characters directly. Presumably kept for external callers; confirm.
VOWEL_MATRAS = {
    'a': '',     # Inherent 'a' (no matra needed)
    'i': 'ి',    # i matra
    'u': 'ు',    # u matra
    'e': 'ె',    # e matra
    'o': 'ొ',    # o matra
    'aa': 'ా',   # Long ā matra (CRITICAL)
    'ii': 'ీ',   # Long ī matra
    'uu': 'ూ',   # Long ū matra
    'ee': 'ే',   # Long ē matra
    'oo': 'ో',   # Long ō matra (CRITICAL)
    'ai': 'ై',   # ai matra
    'au': 'ౌ',   # au matra
}

# Modern consonants (v3.0 standard).
# Deliberately excludes archaic letters: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ.
# Keys mix cases: capitals (T, D, N, S) mark retroflex/ṣa variants, so the
# table is case-sensitive.
CONSONANTS = {
    # Velars
    'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',

    # Palatals
    'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',

    # Dentals
    't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',

    # Retroflex (marked with capitals or double letters)
    'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
    'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
    'nn': 'న్న', 'N': 'ణ',  # Modern: use న్న not ణ్ణ ('nn' maps to a conjunct, not a single letter)

    # Labials
    'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',

    # Sonorants ('v' and 'w' intentionally share వ)
    'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',

    # Sibilants
    'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',

    # Special conjuncts
    'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
}

# Unaspirated/aspirated consonant pairs (v3.0 required).
# NOTE(review): not consulted by any function in this file; presumably used
# by the validator/grammar modules — confirm.
ASPIRATION_PAIRS = {
    ('k', 'kh'), ('g', 'gh'),
    ('ch', 'chh'), ('j', 'jh'),
    ('t', 'th'), ('d', 'dh'),
    ('p', 'ph'), ('b', 'bh'),
}

# Dental/retroflex consonant pairs (v3.0 required).
# NOTE(review): also unused within this file — see ASPIRATION_PAIRS.
RETROFLEX_PAIRS = {
    ('t', 'tt'), ('t', 'T'),
    ('d', 'dd'), ('d', 'D'),
    ('n', 'N'), ('n', 'nn'),
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# ============================================================================
|
|
116
|
+
# SECTION 2: MODERN PRONOUNS (v3.0 Critical)
|
|
117
|
+
# ============================================================================
|
|
118
|
+
|
|
119
|
+
# Whole-word pronoun lookup. eng_to_telugu() checks this table FIRST, before
# any letter-level conversion, so these spellings short-circuit the pipeline.
MODERN_PRONOUNS = {
    # First person (v3.0 modern)
    'nenu': 'నేను',      # I (modern)
    'memu': 'మేము',      # We (exclusive)
    'manamu': 'మనము',    # We (inclusive)

    # Second person
    'nivu': 'నీవు',      # You (informal)
    'miru': 'మీరు',      # You (formal/plural)

    # Third person
    'vallu': 'వాళ్ళు',   # They (modern, human)
    'vadu': 'వాడు',      # He
    'adi': 'అది',        # It
}

# Archaic pronouns the v3.0 spec prohibits. validate_v3_compliance() rejects
# any output containing one of these Telugu forms; the roman keys are kept
# for reference only.
ARCHAIC_PRONOUNS = {
    'enu': 'ఏను',        # Old 1st person - DON'T USE
    'ivu': 'ఈవు',        # Old 2nd person - DON'T USE
    'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
    'emu': 'ఏము',        # Old 1st plural - DON'T USE
}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ============================================================================
|
|
145
|
+
# SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
|
|
146
|
+
# ============================================================================
|
|
147
|
+
|
|
148
|
+
# Critical: Nasal + consonant should become ం + consonant (anusvara)
|
|
149
|
+
# NOT న్ + consonant
|
|
150
|
+
# Nasal + consonant sequences rewritten as anusvara (ం) + consonant,
# NOT as న్ + consonant. apply_nasal_clusters() scans this table by key
# length (4 first, then 3); 2-character keys here are NEVER applied from
# this dict — they are duplicated in NASAL_CLUSTERS_2CHAR below, which is
# the table the 2-character pass actually uses.
NASAL_CLUSTERS = {
    # 4-character clusters (only 'nchh' is actually 4 chars; the rest of
    # this row are length 3 and get picked up by the 3-character pass)
    'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
    'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',

    # 3-character clusters (most common)
    'nch': 'ంచ',  # pancha → పంచ (CRITICAL FIX)
    'nk': 'ంక',   # lanka → లంక
    'ng': 'ంగ',   # manga → మంగ
    'nj': 'ంజ',   # manja → మంజ
    'nt': 'ంత',   # kanta → కంత (CRITICAL FIX)
    'nd': 'ండ',   # konda → కొండ (CRITICAL FIX)
    'mp': 'ంప',   # pampa → పంప
    'mb': 'ంబ',   # ambuja → అంబుజ
}

# 2-character nasal clusters — the table the length-2 pass iterates.
NASAL_CLUSTERS_2CHAR = {
    'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
    'mp': 'ంప', 'mb': 'ంబ',
}

# Romanized consonant clusters → Telugu conjuncts.
# NOTE(review): insertion order lists the 2-letter 's'-clusters before the
# 3-letter ones ('st' before 'str'), which matters to any in-order
# replacement pass. Also 'ntr' uses న్ ('న్త్ర') while 'ndr' uses anusvara
# ('ంద్ర') — looks inconsistent with the nasal-cluster rule above; confirm
# which form is intended.
COMMON_CLUSTERS = {
    # r-clusters
    'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
    'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',

    # l-clusters
    'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',

    # s-clusters
    'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',

    # sh-clusters
    'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',

    # Three-character clusters
    'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
    'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
}

# Gemination (doubled consonants) → Telugu geminate conjuncts.
# NOTE: 'tt', 'dd', 'nn' also exist as retroflex keys in CONSONANTS with
# DIFFERENT values; which mapping wins depends on which pass runs first
# (apply_gemination runs before apply_mappings_v3 in apply_all_patterns).
GEMINATION = {
    'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
    'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# ============================================================================
|
|
205
|
+
# SECTION 5: CORE TRANSLITERATION ENGINE
|
|
206
|
+
# ============================================================================
|
|
207
|
+
|
|
208
|
+
def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
    """
    Main transliteration entry point (v3.0 compliant).

    Pipeline: normalize → pronoun lookup → whole-word lexicon lookup →
    letter/cluster pattern pipeline → optional grammar pass → v3.0 check.

    Args:
        text: English (romanized) text to transliterate.
        include_grammar: If True, run the grammar pass (cases, SOV).

    Returns:
        Telugu text (v3.0 compliant).

    Raises:
        ValueError: if the produced output fails v3.0 compliance.

    Examples:
        eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
        eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
        eng_to_telugu("nenu") → "నేను" (modern pronoun)
    """
    # Empty / whitespace-only input is returned untouched.
    if not text or not text.strip():
        return text

    # Lowercase before normalizing; NOTE this also lowercases the capital
    # retroflex markers (T/D/N/S) a caller might have typed directly.
    normalized = normalize_input(text.strip().lower())

    # Modern pronouns win outright — no letter-level conversion.
    if normalized in MODERN_PRONOUNS:
        return MODERN_PRONOUNS[normalized]

    # Whole-word lexicon lookup; an unchanged result means "no match",
    # in which case the generic pattern pipeline runs.
    result = check_common_words(normalized)
    if result == normalized:
        result = apply_all_patterns(normalized)

    # Optional grammar pass (currently a placeholder in apply_grammar).
    if include_grammar:
        result = apply_grammar(result)

    # Reject any output containing archaic letters or pronouns.
    if not validate_v3_compliance(result):
        raise ValueError(f"Output not v3.0 compliant: {result}")

    return result
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def apply_all_patterns(text: str) -> str:
    """
    Run the full pattern pipeline over normalized roman text.

    Order matters: nasal clusters, then consonant clusters, then
    gemination are substituted first (so the final mapping pass sees
    their Telugu output and passes it through), and only then is the
    context-aware letter mapping applied.
    """
    staged = apply_gemination(apply_clusters(apply_nasal_clusters(text)))
    return apply_mappings_v3(staged)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def normalize_input(text: str) -> str:
    """
    Normalize romanized input to plain ASCII.

    Maps IAST-style diacritic characters to their ASCII digraph
    equivalents (ā → aa, ṇ → N, …). Characters without an entry pass
    through unchanged.
    """
    # Single translate() pass; no replacement output contains another
    # mapped key, so this is equivalent to sequential str.replace calls.
    table = str.maketrans({
        'ā': 'aa', 'ī': 'ii', 'ū': 'uu', 'ē': 'ee', 'ō': 'oo',
        'ṛ': 'ri', 'ḷ': 'li', 'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
        'ṇ': 'N', 'ṭ': 'T', 'ḍ': 'D', 'ś': 'sh', 'ṣ': 'S',
    })
    return text.translate(table)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def check_common_words(text: str) -> str:
    """
    Look up whole words that need special-case transliteration.

    Words like "namaaste" and "konda" bypass the letter pipeline
    entirely via this lexicon.

    Args:
        text: Normalized roman text.

    Returns:
        The Telugu form on a match, otherwise *text* unchanged (the
        caller detects "no match" by comparing the result to its input).
    """
    lexicon = {
        'namaaste': 'నమస్తే',
        'konda': 'కొండ',
        'dhanyavaada': 'ధన్యవాదాలు',
        'andhra': 'ఆంధ్ర',
        'telugu': 'తెలుగు',
        'kriya': 'క్రియ',
        'vibhakti': 'విభక్తి',
        'sambandham': 'సంబంధం',
        'raama': 'రామ',
        'krishna': 'కృష్ణ',
        'lakshmi': 'లక్ష్మి',
        'sita': 'సీత',
    }
    return lexicon.get(text, text)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def apply_mappings_v2(text: str) -> str:
    """
    Apply consonant and vowel mappings (intermediate legacy version).

    NOTE(review): not called anywhere in this module — apply_all_patterns
    uses apply_mappings_v3, which duplicates this logic with extra 'o'
    handling. Kept byte-identical; presumably retained for reference.

    Priority order (order is CRITICAL for correct transliteration):
    1. Long vowels (aa, ii, uu, ee, oo)
    2. Diphthongs (ai, au)
    3. Inherent-'a' suppression after a consonant
    4. 'o' matra before a consonant
    5. Single vowels / consonants / passthrough
    """
    result = []
    i = 0

    while i < len(text):
        # Check 2-character long vowels first (highest priority)
        if i + 1 < len(text):
            chunk2 = text[i:i+2]
            if chunk2 in LONG_VOWELS:
                result.append(LONG_VOWELS[chunk2])
                i += 2
                continue
            if chunk2 in DIPHTHONGS:
                result.append(DIPHTHONGS[chunk2])
                i += 2
                continue

        # Check single character
        char = text[i]

        # Skip standalone 'a' when not at start (consonants have inherent 'a')
        # Exception: if at the start of the word, 'a' could be a standalone vowel
        if char == 'a' and i > 0:
            # Check the PREVIOUS OUTPUT token (may be a multi-char conjunct)
            prev_char = result[-1] if result else None
            if prev_char in CONSONANTS.values():
                # Previous was a consonant, so 'a' is the inherent vowel
                i += 1
                continue

        # For 'o' at end of syllable, use matra
        # If 'o' is followed by a consonant (single-char roman key), use matra form
        if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
            # 'o' as matra (ొ) when followed by consonant
            result.append('ొ')
            i += 1
            continue

        # Apply mappings
        if char in ALL_VOWELS:
            result.append(ALL_VOWELS[char])
        elif char in CONSONANTS:
            result.append(CONSONANTS[char])
        else:
            # Unknown character, keep as-is
            result.append(char)

        i += 1

    return ''.join(result)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def apply_mappings_v3(text: str) -> str:
    """
    Map romanized text to Telugu (v3 — context-aware pass).

    Runs AFTER the cluster passes, so any Telugu characters already
    present in the string fall through the final `else` untouched.

    Priority order:
    1. Long vowels (aa, ii, uu, ee, oo)
    2. Diphthongs (ai, au, am, ah)
    3. 'o' followed by a consonant → matra form (ొ)
    4. 'o' at end of word → standalone form (ఒ)
    5. Inherent-'a' suppression after a consonant glyph
    6. Single vowels / consonants / passthrough
    """
    out = []
    # Hoisted once; the original rebuilt CONSONANTS.values() per check.
    consonant_glyphs = set(CONSONANTS.values())
    n = len(text)
    pos = 0

    while pos < n:
        # Two-character lookahead: long vowels, then diphthongs.
        pair = text[pos:pos + 2]
        if len(pair) == 2 and pair in LONG_VOWELS:
            out.append(LONG_VOWELS[pair])
            pos += 2
            continue
        if len(pair) == 2 and pair in DIPHTHONGS:
            out.append(DIPHTHONGS[pair])
            pos += 2
            continue

        ch = text[pos]

        # 'o' is context-sensitive: matra before a consonant, standalone
        # at the end of the word; otherwise falls through to the tables.
        if ch == 'o':
            if pos + 1 < n and text[pos + 1] in CONSONANTS:
                out.append('ొ')
                pos += 1
                continue
            if pos == n - 1:
                out.append('ఒ')
                pos += 1
                continue
        elif ch == 'a' and pos > 0 and out and out[-1] in consonant_glyphs:
            # Inherent 'a' after a consonant glyph: emit nothing.
            pos += 1
            continue

        # Table lookups; anything unrecognized (including Telugu output
        # from earlier passes) is copied through unchanged.
        if ch in ALL_VOWELS:
            out.append(ALL_VOWELS[ch])
        elif ch in CONSONANTS:
            out.append(CONSONANTS[ch])
        else:
            out.append(ch)
        pos += 1

    return ''.join(out)
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def apply_nasal_clusters(text: str) -> str:
    """
    Rewrite nasal+stop sequences as anusvara (ం) + stop (CRITICAL).

    Examples:
        "konda" contains 'nd' → rewritten with ండ (correct),
        NOT న్ + ద (wrong).

    Must run before the generic letter mapping.
    """
    out = text

    # Longest-first: 4-char entries from the main table, then 3-char
    # entries, then the dedicated 2-char table. str.replace is a no-op
    # when the pattern is absent, so no membership pre-check is needed.
    for width in (4, 3):
        for roman, telugu in NASAL_CLUSTERS.items():
            if len(roman) == width:
                out = out.replace(roman, telugu)

    for roman, telugu in NASAL_CLUSTERS_2CHAR.items():
        out = out.replace(roman, telugu)

    return out
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def apply_mappings(text: str) -> str:
    """
    Map romanized text to Telugu (basic variant, no 'o' context handling).

    Priority order (CRITICAL for correct transliteration):
    1. Long vowels (aa, ii, uu, ee, oo)
    2. Diphthongs (ai, au, am, ah)
    3. Inherent-'a' suppression after a consonant glyph
    4. Single vowels / consonants / passthrough
    """
    out = []
    pos = 0
    length = len(text)

    while pos < length:
        # Two-character lookahead for long vowels and diphthongs.
        if pos + 1 < length:
            pair = text[pos:pos + 2]
            if pair in LONG_VOWELS or pair in DIPHTHONGS:
                out.append(LONG_VOWELS.get(pair) or DIPHTHONGS[pair])
                pos += 2
                continue

        ch = text[pos]

        # Inherent 'a': swallowed when the previous output token is a
        # consonant glyph (delegates the check to is_consonant).
        if ch == 'a' and out and is_consonant(out[-1]):
            pos += 1
            continue

        # Vowel table first, then consonants, else pass through as-is.
        if ch in ALL_VOWELS:
            out.append(ALL_VOWELS[ch])
        elif ch in CONSONANTS:
            out.append(CONSONANTS[ch])
        else:
            out.append(ch)
        pos += 1

    return ''.join(out)
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def is_consonant(char: str) -> bool:
    """Return True if *char* is one of the mapped Telugu consonant glyphs.

    Note: 'glyph' here may be a multi-character conjunct (e.g. న్న), since
    CONSONANTS values include conjuncts.
    """
    # Membership test on the dict view — no throwaway set per call.
    return char in CONSONANTS.values()
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def apply_clusters(text: str) -> str:
    """Replace romanized consonant clusters with their Telugu conjuncts.

    Clusters are applied longest-first. This fixes a real ordering bug:
    COMMON_CLUSTERS lists the 2-letter clusters ('st', 'sk', 'sp') BEFORE
    the 3-letter ones ('str', 'skr', 'spr'), so a plain in-order pass
    replaced 'st' inside "str" first and the 3-letter conjuncts
    (స్త్ర, …) could never match.

    Args:
        text: Normalized roman text (after nasal-cluster rewriting).

    Returns:
        Text with cluster substrings replaced by Telugu conjuncts.
    """
    result = text
    # Sort by descending key length; sorted() is stable, so equal-length
    # clusters keep their original dict order.
    ordered = sorted(COMMON_CLUSTERS.items(), key=lambda kv: len(kv[0]), reverse=True)
    for cluster, telugu in ordered:
        result = result.replace(cluster, telugu)
    return result
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def apply_gemination(text: str) -> str:
    """Replace doubled romanized consonants with Telugu geminate conjuncts.

    All keys are two characters, so substitution order within the table
    cannot cause overlap problems.
    """
    out = text
    for doubled, conjunct in GEMINATION.items():
        out = out.replace(doubled, conjunct)
    return out
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def apply_grammar(text: str) -> str:
    """
    Apply grammar transformations — currently a no-op placeholder.

    Intended future work (to be delegated to grammar.py): vibhakti case
    markers and SOV word-order conversion. Until then the input is
    returned unchanged.
    """
    return text
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def validate_v3_compliance(text: str) -> bool:
    """
    Return True if *text* passes the v3.0 compliance checks.

    Two checks, both substring scans:
    - no archaic letters (ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ);
    - no archaic pronoun forms (the Telugu values of ARCHAIC_PRONOUNS).

    A warning is printed for the first violation found.
    """
    # Archaic single letters prohibited by v3.0.
    for letter in ('ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ'):
        if letter in text:
            print(f"WARNING: Found archaic letter {letter} in '{text}'")
            return False

    # Archaic pronoun forms (Telugu strings) prohibited by v3.0.
    for pronoun in ARCHAIC_PRONOUNS.values():
        if pronoun in text:
            print(f"WARNING: Found archaic pronoun {pronoun} in '{text}'")
            return False

    return True
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
# ============================================================================
|
|
610
|
+
# SECTION 6: CONVENIENCE FUNCTIONS
|
|
611
|
+
# ============================================================================
|
|
612
|
+
|
|
613
|
+
def transliterate_word(word: str) -> str:
    """Transliterate a single word.

    Thin convenience wrapper around eng_to_telugu() with grammar disabled.
    """
    return eng_to_telugu(word)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def transliterate_sentence(sentence: str) -> str:
    """Transliterate a whitespace-separated sentence word by word.

    Note: splits on any whitespace and rejoins with single spaces, so the
    input's original spacing is not preserved.
    """
    converted = [eng_to_telugu(token) for token in sentence.split()]
    return ' '.join(converted)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
# ============================================================================
|
|
625
|
+
# SECTION 7: PUBLIC API
|
|
626
|
+
# ============================================================================
|
|
627
|
+
|
|
628
|
+
# Public API surface of this module (star-import and documentation scope).
__all__ = [
    'eng_to_telugu',
    'transliterate_word',
    'transliterate_sentence',
    'MODERN_PRONOUNS',
    'validate_v3_compliance',
]
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
# ============================================================================
|
|
638
|
+
# SECTION 8: EXAMPLE USAGE
|
|
639
|
+
# ============================================================================
|
|
640
|
+
|
|
641
|
+
if __name__ == "__main__":
    # Smoke tests: (roman input, expected Telugu) pairs from CRITICAL_FIXES.md.
    test_cases = [
        ("namaaste", "నమస్తే"),
        ("raama", "రామ"),
        ("konda", "కొండ"),
        ("nenu", "నేను"),
        ("vallu", "వాళ్ళు"),
        ("palakariste", "పలకరిస్తే"),
    ]

    print("\n" + "="*70)
    print(" TRANSLITERATOR v3.0 - TEST CASES")
    print("="*70 + "\n")

    # Run each case and print a pass/fail marker; failures do not abort.
    for english, expected in test_cases:
        result = eng_to_telugu(english)
        status = "✅" if result == expected else "❌"
        print(f"{status} {english:20} → {result:15} (expected: {expected})")

    print("\n" + "="*70 + "\n")

    # Interactive REPL: transliterate lines until 'quit'/'exit'/'q' or Ctrl-C.
    # NOTE(review): EOF (Ctrl-D / piped stdin) raises an unhandled EOFError
    # from input() — confirm whether that should also exit cleanly.
    print("Enter text to transliterate (or 'quit' to exit):")
    while True:
        try:
            text = input("> ").strip()
            if text.lower() in ['quit', 'exit', 'q']:
                break
            if text:
                result = eng_to_telugu(text)
                print(f" → {result}\n")
        except KeyboardInterrupt:
            break

    print("\nTransliteration complete!")
|