telugu-language-tools 5.0.4__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of telugu-language-tools might be problematic. Click here for more details.
- telugu_engine/__init__.py +15 -4
- telugu_engine/grammar.py +178 -325
- telugu_engine/transliterator.py +327 -645
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.1.0.dist-info}/METADATA +52 -13
- telugu_language_tools-5.1.0.dist-info/RECORD +13 -0
- telugu_language_tools-5.0.4.dist-info/RECORD +0 -13
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.1.0.dist-info}/WHEEL +0 -0
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.1.0.dist-info}/top_level.txt +0 -0
telugu_engine/transliterator.py
CHANGED
|
@@ -1,692 +1,374 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
-
|
|
10
|
-
-
|
|
11
|
-
- Long vowel support (aa → ఆ)
|
|
12
|
-
- Nasal cluster rules (nd → ండ)
|
|
13
|
-
- 100+ consonant clusters
|
|
14
|
-
- Clean, tested code
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from typing import Optional
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# ============================================================================
|
|
21
|
-
# SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
|
|
22
|
-
# ============================================================================
|
|
23
|
-
|
|
24
|
-
# Short vowels
|
|
25
|
-
VOWELS = {
|
|
26
|
-
'a': 'అ', # a (short)
|
|
27
|
-
'i': 'ఇ', # i (short)
|
|
28
|
-
'u': 'ఉ', # u (short)
|
|
29
|
-
'e': 'ఎ', # e (short)
|
|
30
|
-
'o': 'ఒ', # o (short)
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
# Long vowels (v3.0 critical)
|
|
34
|
-
LONG_VOWELS = {
|
|
35
|
-
'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
|
|
36
|
-
'ii': 'ఈ', # Long ī
|
|
37
|
-
'uu': 'ఊ', # Long ū
|
|
38
|
-
'ee': 'ఏ', # Long ē
|
|
39
|
-
'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
# Diphthongs
|
|
43
|
-
DIPHTHONGS = {
|
|
44
|
-
'ai': 'ఐ', # ai
|
|
45
|
-
'au': 'ఔ', # au
|
|
46
|
-
'am': 'ం', # anusvara (nasalization)
|
|
47
|
-
'ah': 'ః', # visarga
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
# All vowels combined
|
|
51
|
-
ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
|
|
52
|
-
|
|
53
|
-
# Vowel matras (for after consonants)
|
|
54
|
-
VOWEL_MATRAS = {
|
|
55
|
-
'a': '', # Inherent 'a' (no matra needed)
|
|
56
|
-
'i': 'ి', # i matra
|
|
57
|
-
'u': 'ు', # u matra
|
|
58
|
-
'e': 'ె', # e matra
|
|
59
|
-
'o': 'ొ', # o matra
|
|
60
|
-
'aa': 'ా', # Long ā matra (CRITICAL)
|
|
61
|
-
'ii': 'ీ', # Long ī matra
|
|
62
|
-
'uu': 'ూ', # Long ū matra
|
|
63
|
-
'ee': 'ే', # Long ē matra
|
|
64
|
-
'oo': 'ో', # Long ō matra (CRITICAL)
|
|
65
|
-
'ai': 'ై', # ai matra
|
|
66
|
-
'au': 'ౌ', # au matra
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
# Modern consonants (36 consonants, v3.0 standard)
|
|
70
|
-
# NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
|
|
71
|
-
CONSONANTS = {
|
|
72
|
-
# Velars
|
|
73
|
-
'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
|
|
74
|
-
|
|
75
|
-
# Palatals
|
|
76
|
-
'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
|
|
77
|
-
|
|
78
|
-
# Dentals
|
|
79
|
-
't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
|
|
80
|
-
|
|
81
|
-
# Retroflex (marked with capitals or double letters)
|
|
82
|
-
'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
|
|
83
|
-
'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
|
|
84
|
-
'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
|
|
85
|
-
|
|
86
|
-
# Labials
|
|
87
|
-
'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
|
|
88
|
-
|
|
89
|
-
# Sonorants
|
|
90
|
-
'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
|
|
91
|
-
|
|
92
|
-
# Sibilants
|
|
93
|
-
'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
|
|
94
|
-
|
|
95
|
-
# Special
|
|
96
|
-
'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
# Aspiration pairs (v3.0 required)
|
|
100
|
-
ASPIRATION_PAIRS = {
|
|
101
|
-
('k', 'kh'), ('g', 'gh'),
|
|
102
|
-
('ch', 'chh'), ('j', 'jh'),
|
|
103
|
-
('t', 'th'), ('d', 'dh'),
|
|
104
|
-
('p', 'ph'), ('b', 'bh'),
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
# Retroflex pairs (v3.0 required)
|
|
108
|
-
RETROFLEX_PAIRS = {
|
|
109
|
-
('t', 'tt'), ('t', 'T'),
|
|
110
|
-
('d', 'dd'), ('d', 'D'),
|
|
111
|
-
('n', 'N'), ('n', 'nn'),
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# ============================================================================
|
|
116
|
-
# SECTION 2: MODERN PRONOUNS (v3.0 Critical)
|
|
117
|
-
# ============================================================================
|
|
118
|
-
|
|
119
|
-
MODERN_PRONOUNS = {
|
|
120
|
-
# First person (v3.0 modern)
|
|
121
|
-
'nenu': 'నేను', # I (modern)
|
|
122
|
-
'memu': 'మేము', # We (modern)
|
|
123
|
-
'manamu': 'మనము', # We (inclusive)
|
|
124
|
-
|
|
125
|
-
# Second person
|
|
126
|
-
'nivu': 'నీవు', # You (informal)
|
|
127
|
-
'miru': 'మీరు', # You (formal/plural)
|
|
128
|
-
|
|
129
|
-
# Third person
|
|
130
|
-
'vallu': 'వాళ్ళు', # They (modern, human)
|
|
131
|
-
'vadu': 'వాడు', # He
|
|
132
|
-
'adi': 'అది', # It
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
# Archaic pronouns to AVOID (v3.0 prohibits)
|
|
136
|
-
ARCHAIC_PRONOUNS = {
|
|
137
|
-
'enu': 'ఏను', # Old 1st person - DON'T USE
|
|
138
|
-
'ivu': 'ఈవు', # Old 2nd person - DON'T USE
|
|
139
|
-
'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
|
|
140
|
-
'emu': 'ఏము', # Old 1st plural - DON'T USE
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
# ============================================================================
|
|
145
|
-
# SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
|
|
146
|
-
# ============================================================================
|
|
147
|
-
|
|
148
|
-
# Critical: Nasal + consonant should become ం + consonant (anusvara)
|
|
149
|
-
# NOT న్ + consonant
|
|
150
|
-
NASAL_CLUSTERS = {
|
|
151
|
-
# 4-character clusters
|
|
152
|
-
'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
|
|
153
|
-
'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
|
|
154
|
-
|
|
155
|
-
# 3-character clusters (most common)
|
|
156
|
-
'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
|
|
157
|
-
'nk': 'ంక', # lanka → లంక
|
|
158
|
-
'ng': 'ంగ', # manga → మంగ
|
|
159
|
-
'nj': 'ంజ', # manja → మంజ
|
|
160
|
-
'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
|
|
161
|
-
'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
|
|
162
|
-
'mp': 'ంప', # pampa → పంప
|
|
163
|
-
'mb': 'ంబ', # ambuja → అంబుజ
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
# 2-character nasal clusters
|
|
167
|
-
NASAL_CLUSTERS_2CHAR = {
|
|
168
|
-
'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
|
|
169
|
-
'mp': 'ంప', 'mb': 'ంబ',
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
# ============================================================================
|
|
174
|
-
# SECTION 4: CONSONANT CLUSTERS (100+ clusters)
|
|
175
|
-
# ============================================================================
|
|
176
|
-
|
|
177
|
-
# Common clusters (2-3 characters)
|
|
178
|
-
COMMON_CLUSTERS = {
|
|
179
|
-
# r-clusters
|
|
180
|
-
'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
|
|
181
|
-
'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
|
|
182
|
-
|
|
183
|
-
# l-clusters
|
|
184
|
-
'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
|
|
185
|
-
|
|
186
|
-
# s-clusters
|
|
187
|
-
'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
|
|
188
|
-
|
|
189
|
-
# sh-clusters
|
|
190
|
-
'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
|
|
191
|
-
|
|
192
|
-
# Three-character clusters
|
|
193
|
-
'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
|
|
194
|
-
'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
# Gemination (double consonants)
|
|
198
|
-
GEMINATION = {
|
|
199
|
-
'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
|
|
200
|
-
'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
# ============================================================================
|
|
205
|
-
# SECTION 5: CORE TRANSLITERATION ENGINE
|
|
206
|
-
# ============================================================================
|
|
207
|
-
|
|
208
|
-
def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
|
|
209
|
-
"""
|
|
210
|
-
Main transliteration function (v3.0 compliant).
|
|
211
|
-
|
|
212
|
-
Args:
|
|
213
|
-
text: English text to transliterate
|
|
214
|
-
include_grammar: If True, apply grammar (cases, SOV)
|
|
215
|
-
|
|
216
|
-
Returns:
|
|
217
|
-
Telugu text (v3.0 compliant)
|
|
218
|
-
|
|
219
|
-
Examples:
|
|
220
|
-
eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
|
|
221
|
-
eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
|
|
222
|
-
eng_to_telugu("nenu") → "నేను" (modern pronoun)
|
|
223
|
-
"""
|
|
224
|
-
if not text or not text.strip():
|
|
225
|
-
return text
|
|
226
|
-
|
|
227
|
-
# Step 1: Handle multi-word sentences
|
|
228
|
-
words = text.strip().split()
|
|
229
|
-
if len(words) > 1:
|
|
230
|
-
# Transliterate each word separately
|
|
231
|
-
result_words = []
|
|
232
|
-
for word in words:
|
|
233
|
-
result_words.append(eng_to_telugu(word, include_grammar))
|
|
234
|
-
return ' '.join(result_words)
|
|
235
|
-
|
|
236
|
-
# Single word processing
|
|
237
|
-
text = words[0] if words else text
|
|
238
|
-
|
|
239
|
-
# Step 2: Normalize input
|
|
240
|
-
normalized = normalize_input(text.strip().lower())
|
|
241
|
-
|
|
242
|
-
# Step 3: Check for modern pronouns FIRST
|
|
243
|
-
if normalized in MODERN_PRONOUNS:
|
|
244
|
-
return MODERN_PRONOUNS[normalized]
|
|
245
|
-
|
|
246
|
-
# Step 4: Check for common words with special handling
|
|
247
|
-
result = check_common_words(normalized)
|
|
248
|
-
if result != normalized:
|
|
249
|
-
# Found and processed a common word
|
|
250
|
-
pass
|
|
251
|
-
else:
|
|
252
|
-
# Step 5: Apply ALL patterns before conversion
|
|
253
|
-
# First, identify where nasal clusters and other patterns are
|
|
254
|
-
result = apply_all_patterns(normalized)
|
|
255
|
-
|
|
256
|
-
# Step 6: Apply grammar if requested
|
|
257
|
-
if include_grammar:
|
|
258
|
-
result = apply_grammar(result)
|
|
259
|
-
|
|
260
|
-
# Step 7: Validate v3.0 compliance
|
|
261
|
-
if not validate_v3_compliance(result):
|
|
262
|
-
raise ValueError(f"Output not v3.0 compliant: {result}")
|
|
263
|
-
|
|
264
|
-
return result
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
def apply_all_patterns(text: str) -> str:
|
|
268
|
-
"""
|
|
269
|
-
Apply all patterns to the text before final conversion.
|
|
270
|
-
|
|
271
|
-
This handles the tricky case where we need to know about multiple
|
|
272
|
-
characters ahead to make the right decision.
|
|
273
|
-
"""
|
|
274
|
-
# First pass: mark all special patterns
|
|
275
|
-
result = apply_nasal_clusters(text)
|
|
276
|
-
result = apply_clusters(result)
|
|
277
|
-
result = apply_gemination(result)
|
|
278
|
-
|
|
279
|
-
# Second pass: apply mappings with full context
|
|
280
|
-
result = apply_mappings_v3(result)
|
|
281
|
-
|
|
282
|
-
return result
|
|
2
|
+
Telugu Library v4.0.8 — CORE LOGIC REVISED
|
|
3
|
+
----------------------------------
|
|
4
|
+
Fixes based on forensic analysis:
|
|
5
|
+
- CRITICAL FIX: Removed .lower() to preserve case distinction for retroflex consonants (T, D, N, S).
|
|
6
|
+
- Removed redundant R+vowel shortcut (Rule 1) to stabilize C+V processing.
|
|
7
|
+
- Corrected 'nd' → 'ండ' (retroflex) in nasal_map per lexical convention.
|
|
8
|
+
- Cleaned up base consonants (ksha, jna now handled via clusters).
|
|
9
|
+
- Fixed syntax error in list initialization.
|
|
10
|
+
- Minor test corrections (taadu→తాదు).
|
|
283
11
|
|
|
12
|
+
"""
|
|
284
13
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
14
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
15
|
+
# Normalization
|
|
16
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
288
17
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
- Clean input
|
|
292
|
-
"""
|
|
293
|
-
# Replace common diacritics
|
|
18
|
+
def normalize_roman_input(text: str) -> str:
|
|
19
|
+
"""Normalizes romanized input to ASCII tokens our engine knows."""
|
|
294
20
|
replacements = {
|
|
295
|
-
'ā': 'aa', '
|
|
296
|
-
'
|
|
297
|
-
'ṇ': 'N',
|
|
21
|
+
'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
|
|
22
|
+
'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
|
|
23
|
+
'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
|
|
24
|
+
'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
|
|
298
25
|
}
|
|
299
|
-
|
|
300
|
-
result = text
|
|
301
26
|
for special, basic in replacements.items():
|
|
302
|
-
|
|
27
|
+
text = text.replace(special, basic)
|
|
28
|
+
return text
|
|
303
29
|
|
|
304
|
-
return result
|
|
305
30
|
|
|
31
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
32
|
+
# Core engine
|
|
33
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
306
34
|
|
|
307
|
-
def
|
|
35
|
+
def eng_to_telugu_base(text: str, rules: dict) -> str:
|
|
308
36
|
"""
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
Returns:
|
|
317
|
-
Transliterated text or original if no match
|
|
37
|
+
Core transliteration engine (v4.0.8 REVISED).
|
|
38
|
+
Handles:
|
|
39
|
+
• geminates (kk, ll, tt, pp, mm, …)
|
|
40
|
+
• long vowels in all positions (aa, ee, ii, uu, oo)
|
|
41
|
+
• clusters (dr, tr, pr, …)
|
|
42
|
+
• word-final vowels
|
|
318
43
|
"""
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
'nenu': 'నేను',
|
|
323
|
-
'telugu': 'తెలుగు',
|
|
324
|
-
'konda': 'కొండ',
|
|
325
|
-
'vallu': 'వాళ్ళు',
|
|
326
|
-
'dhanyavaada': 'ధన్యవాదాలు',
|
|
327
|
-
'andhra': 'ఆంధ్ర',
|
|
328
|
-
'kriya': 'క్రియ',
|
|
329
|
-
'vibhakti': 'విభక్తి',
|
|
330
|
-
'sambandham': 'సంబంధం',
|
|
331
|
-
'raama': 'రామ',
|
|
332
|
-
'krishna': 'కృష్ణ',
|
|
333
|
-
'lakshmi': 'లక్ష్మి',
|
|
334
|
-
'sita': 'సీత',
|
|
335
|
-
'vachhu': 'వచ్చు',
|
|
336
|
-
'velli': 'వెళ్ళు',
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
if text in common_words:
|
|
340
|
-
return common_words[text]
|
|
44
|
+
text = normalize_roman_input(text or "")
|
|
45
|
+
# V4.0.8 CRITICAL FIX: Removed .lower() to preserve case distinction (e.g., t vs T, n vs N)
|
|
46
|
+
text = text.strip()
|
|
341
47
|
|
|
342
|
-
|
|
48
|
+
consonants = rules.get("consonants", {})
|
|
49
|
+
vowels = rules.get("vowels", {})
|
|
50
|
+
matras = rules.get("matras", {})
|
|
51
|
+
clusters = rules.get("clusters", {})
|
|
52
|
+
geminates = rules.get("geminates", {})
|
|
53
|
+
strip_final_virama = rules.get("strip_final_virama", True)
|
|
343
54
|
|
|
55
|
+
# Pre-sort consonant keys by length for longest-first matching
|
|
56
|
+
cons_keys = sorted(consonants.keys(), key=len, reverse=True)
|
|
344
57
|
|
|
345
|
-
|
|
346
|
-
"""
|
|
347
|
-
Apply consonant and vowel mappings (improved version).
|
|
348
|
-
|
|
349
|
-
This version handles the flow better with proper consonant-vowel handling.
|
|
350
|
-
|
|
351
|
-
Priority order:
|
|
352
|
-
1. Long vowels (aa, ii, uu, ee, oo)
|
|
353
|
-
2. Diphthongs (ai, au)
|
|
354
|
-
3. Consonants with following vowels
|
|
355
|
-
4. Single consonants
|
|
356
|
-
5. Single vowels
|
|
357
|
-
|
|
358
|
-
This order is CRITICAL for correct transliteration!
|
|
359
|
-
"""
|
|
360
|
-
result = []
|
|
58
|
+
result = [] # SYNTAX FIX: Initialize the result list
|
|
361
59
|
i = 0
|
|
60
|
+
prev_was_consonant = False
|
|
61
|
+
|
|
62
|
+
def attach_matra(matra_key: str):
|
|
63
|
+
"""Attach matra to the last emitted consonant glyph."""
|
|
64
|
+
if not result:
|
|
65
|
+
# No preceding consonant; emit standalone vowel instead
|
|
66
|
+
result.append(vowels.get(matra_key, ""))
|
|
67
|
+
return
|
|
68
|
+
result.append(matras.get(matra_key, ""))
|
|
69
|
+
|
|
70
|
+
def emit_consonant(tok: str, join_prev=False):
|
|
71
|
+
nonlocal prev_was_consonant
|
|
72
|
+
if join_prev:
|
|
73
|
+
result.append("్")
|
|
74
|
+
result.append(consonants[tok])
|
|
75
|
+
prev_was_consonant = True
|
|
362
76
|
|
|
363
77
|
while i < len(text):
|
|
364
|
-
#
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
78
|
+
# Windowed chunks
|
|
79
|
+
chunk5 = text[i:i+5]
|
|
80
|
+
chunk4 = text[i:i+4]
|
|
81
|
+
chunk3 = text[i:i+3]
|
|
82
|
+
chunk2 = text[i:i+2]
|
|
83
|
+
ch = text[i]
|
|
84
|
+
|
|
85
|
+
# NOTE: Original Rule 1 (r + vowel shortcut) has been removed (V4.0.7)
|
|
86
|
+
# C+V sequences are handled via standard consonant+vowel rules below.
|
|
87
|
+
|
|
88
|
+
# 1) Nasal clusters (longest first)
|
|
89
|
+
nasal_map = {
|
|
90
|
+
# 4-char
|
|
91
|
+
"nchh": "ంఛ", "njh": "ంఝ", "nkh": "ంఖ", "ngh": "ంఘ",
|
|
92
|
+
"nth": "ంథ", "ndh": "ంధ", "mph": "ంఫ", "mbh": "ంభ",
|
|
93
|
+
# 3-char
|
|
94
|
+
"nch": "ంచ", "nj": "ంజ", "nT": "ంట", "nD": "ండ",
|
|
95
|
+
# 2-char homorganic
|
|
96
|
+
"nk": "ంక", "ng": "ంగ", "nt": "ంత",
|
|
97
|
+
"nd": "ండ", # V4.0.7: Corrected 'nd' to retroflex 'ండ' per lexical convention (e.g., 'konda')
|
|
98
|
+
"mp": "ంప", "mb": "ంబ",
|
|
99
|
+
# non-homorganic (explicit)
|
|
100
|
+
"ms": "మ్స", "mr": "మ్ర", "ml": "మ్ల", "mv": "మ్వ",
|
|
101
|
+
"ns": "న్స", "ny": "న్య",
|
|
102
|
+
}
|
|
103
|
+
matched = False
|
|
104
|
+
for L in (4, 3, 2):
|
|
105
|
+
if i + L <= len(text):
|
|
106
|
+
sub = text[i:i+L]
|
|
107
|
+
if sub in nasal_map:
|
|
108
|
+
# treat as a pre-formed syllabic piece
|
|
109
|
+
result.append(nasal_map[sub])
|
|
110
|
+
i += L
|
|
111
|
+
prev_was_consonant = True
|
|
112
|
+
matched = True
|
|
113
|
+
break
|
|
114
|
+
if matched:
|
|
395
115
|
continue
|
|
396
116
|
|
|
397
|
-
#
|
|
398
|
-
if
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
def apply_mappings_v3(text: str) -> str:
|
|
412
|
-
"""
|
|
413
|
-
Apply consonant and vowel mappings (v3 - with full context awareness).
|
|
414
|
-
|
|
415
|
-
This version works on text that has already been processed for patterns
|
|
416
|
-
like nasal clusters, so it has full context of what needs special handling.
|
|
417
|
-
|
|
418
|
-
Priority order:
|
|
419
|
-
1. Long vowels (aa, ii, uu, ee, oo)
|
|
420
|
-
2. Diphthongs (ai, au)
|
|
421
|
-
3. 'o' followed by consonant (use matra)
|
|
422
|
-
4. 'o' at end of word (use standalone)
|
|
423
|
-
5. Consonants
|
|
424
|
-
6. Single vowels
|
|
425
|
-
"""
|
|
426
|
-
result = []
|
|
427
|
-
i = 0
|
|
117
|
+
# 2) Geminate detection (kk, ll, …)
|
|
118
|
+
if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in consonants:
|
|
119
|
+
if chunk2 in geminates:
|
|
120
|
+
# explicit mapping like "ల్ల"
|
|
121
|
+
result.append(geminates[chunk2])
|
|
122
|
+
else:
|
|
123
|
+
# fallback: C + virama + C
|
|
124
|
+
base = consonants[chunk2[0]]
|
|
125
|
+
result.append(base + "్" + base)
|
|
126
|
+
prev_was_consonant = True
|
|
127
|
+
i += 2
|
|
128
|
+
continue
|
|
428
129
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
i +=
|
|
440
|
-
|
|
130
|
+
# 3) Regular clusters (5→4→3→2 letters)
|
|
131
|
+
for L in (5, 4, 3, 2):
|
|
132
|
+
sub = text[i:i+L]
|
|
133
|
+
if sub in clusters:
|
|
134
|
+
if prev_was_consonant:
|
|
135
|
+
result.append("్")
|
|
136
|
+
# expand tokens inside cluster, joining with virama
|
|
137
|
+
toks = clusters[sub]
|
|
138
|
+
for idx, tk in enumerate(toks):
|
|
139
|
+
emit_consonant(tk, join_prev=(idx > 0))
|
|
140
|
+
i += L
|
|
141
|
+
matched = True
|
|
142
|
+
break
|
|
143
|
+
if matched:
|
|
144
|
+
continue
|
|
441
145
|
|
|
442
|
-
#
|
|
443
|
-
|
|
146
|
+
# 4) Two-letter vowels (aa, ee, ii, uu, oo), diphthongs (ai, au)
|
|
147
|
+
if chunk2 in vowels:
|
|
148
|
+
if prev_was_consonant:
|
|
149
|
+
attach_matra(chunk2)
|
|
150
|
+
prev_was_consonant = False
|
|
151
|
+
else:
|
|
152
|
+
result.append(vowels[chunk2])
|
|
153
|
+
i += 2
|
|
154
|
+
continue
|
|
444
155
|
|
|
445
|
-
#
|
|
446
|
-
if
|
|
447
|
-
if
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
elif i == len(text) - 1:
|
|
453
|
-
# 'o' at end of word, use standalone
|
|
454
|
-
result.append('ఒ')
|
|
455
|
-
i += 1
|
|
456
|
-
continue
|
|
156
|
+
# 5) Two-letter consonants (longest-first will also catch 'kh','ch','bh', etc.)
|
|
157
|
+
if chunk2 in consonants:
|
|
158
|
+
if prev_was_consonant:
|
|
159
|
+
result.append("్")
|
|
160
|
+
emit_consonant(chunk2)
|
|
161
|
+
i += 2
|
|
162
|
+
continue
|
|
457
163
|
|
|
458
|
-
#
|
|
459
|
-
if
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
164
|
+
# 6) Single-letter vowels
|
|
165
|
+
if ch in vowels:
|
|
166
|
+
if ch == 'a' and prev_was_consonant:
|
|
167
|
+
# inherent 'a' → no matra
|
|
168
|
+
prev_was_consonant = False
|
|
463
169
|
i += 1
|
|
464
170
|
continue
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
result.append(CONSONANTS[char])
|
|
471
|
-
else:
|
|
472
|
-
# Telugu characters (from nasal clusters, etc.) or unknown
|
|
473
|
-
result.append(char)
|
|
474
|
-
|
|
475
|
-
i += 1
|
|
476
|
-
|
|
477
|
-
return ''.join(result)
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
def apply_nasal_clusters(text: str) -> str:
|
|
481
|
-
"""
|
|
482
|
-
Apply nasal cluster rules (CRITICAL).
|
|
483
|
-
|
|
484
|
-
Convert: n + consonant → ం + consonant
|
|
485
|
-
Examples:
|
|
486
|
-
"konda" → "కొండ" → "కొండ" (correct)
|
|
487
|
-
NOT: "konda" → "కొన్ద" (wrong)
|
|
488
|
-
|
|
489
|
-
This MUST be done before other mappings!
|
|
490
|
-
"""
|
|
491
|
-
result = text
|
|
492
|
-
|
|
493
|
-
# Check 4-character clusters first (longest match)
|
|
494
|
-
for cluster, telugu in NASAL_CLUSTERS.items():
|
|
495
|
-
if len(cluster) == 4 and cluster in result:
|
|
496
|
-
result = result.replace(cluster, telugu)
|
|
497
|
-
|
|
498
|
-
# Then 3-character clusters
|
|
499
|
-
for cluster, telugu in NASAL_CLUSTERS.items():
|
|
500
|
-
if len(cluster) == 3 and cluster in result:
|
|
501
|
-
result = result.replace(cluster, telugu)
|
|
502
|
-
|
|
503
|
-
# Then 2-character clusters
|
|
504
|
-
for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
|
|
505
|
-
if len(cluster) == 2 and cluster in result:
|
|
506
|
-
result = result.replace(cluster, telugu)
|
|
507
|
-
|
|
508
|
-
return result
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
def apply_mappings(text: str) -> str:
|
|
512
|
-
"""
|
|
513
|
-
Apply consonant and vowel mappings.
|
|
514
|
-
|
|
515
|
-
Priority order:
|
|
516
|
-
1. Long vowels (aa, ii, uu, ee, oo)
|
|
517
|
-
2. Diphthongs (ai, au)
|
|
518
|
-
3. Consonants
|
|
519
|
-
4. Single vowels
|
|
520
|
-
|
|
521
|
-
This order is CRITICAL for correct transliteration!
|
|
522
|
-
"""
|
|
523
|
-
result = []
|
|
524
|
-
i = 0
|
|
525
|
-
|
|
526
|
-
while i < len(text):
|
|
527
|
-
# Check 2-character long vowels first
|
|
528
|
-
if i + 1 < len(text):
|
|
529
|
-
chunk2 = text[i:i+2]
|
|
530
|
-
if chunk2 in LONG_VOWELS:
|
|
531
|
-
result.append(LONG_VOWELS[chunk2])
|
|
532
|
-
i += 2
|
|
533
|
-
continue
|
|
534
|
-
if chunk2 in DIPHTHONGS:
|
|
535
|
-
result.append(DIPHTHONGS[chunk2])
|
|
536
|
-
i += 2
|
|
537
|
-
continue
|
|
538
|
-
|
|
539
|
-
# Check single character
|
|
540
|
-
char = text[i]
|
|
541
|
-
|
|
542
|
-
# Skip standalone 'a' (consonants have inherent 'a')
|
|
543
|
-
if char == 'a' and result and is_consonant(result[-1]):
|
|
171
|
+
if prev_was_consonant:
|
|
172
|
+
attach_matra(ch)
|
|
173
|
+
prev_was_consonant = False
|
|
174
|
+
else:
|
|
175
|
+
result.append(vowels[ch])
|
|
544
176
|
i += 1
|
|
545
177
|
continue
|
|
546
178
|
|
|
547
|
-
#
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
179
|
+
# 7) Single-letter consonants (match longest among keys)
|
|
180
|
+
matched_cons = None
|
|
181
|
+
for k in cons_keys:
|
|
182
|
+
# Note: Case sensitivity is maintained here thanks to V4.0.8 fix.
|
|
183
|
+
if text.startswith(k, i):
|
|
184
|
+
matched_cons = k
|
|
185
|
+
break
|
|
186
|
+
if matched_cons:
|
|
187
|
+
if prev_was_consonant:
|
|
188
|
+
result.append("్")
|
|
189
|
+
emit_consonant(matched_cons)
|
|
190
|
+
i += len(matched_cons)
|
|
191
|
+
continue
|
|
555
192
|
|
|
193
|
+
# 8) Anything else (spaces/punct/digits)
|
|
194
|
+
result.append(ch)
|
|
195
|
+
prev_was_consonant = False
|
|
556
196
|
i += 1
|
|
557
197
|
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
def is_consonant(char: str) -> bool:
|
|
562
|
-
"""Check if character is a consonant."""
|
|
563
|
-
# This is a simplified check
|
|
564
|
-
# In practice, check against CONSONANTS dict
|
|
565
|
-
consonants = set(CONSONANTS.values())
|
|
566
|
-
return char in consonants
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
def apply_clusters(text: str) -> str:
|
|
570
|
-
"""Apply common consonant clusters."""
|
|
571
|
-
result = text
|
|
198
|
+
# Final virama cleanup
|
|
199
|
+
if strip_final_virama and result and result[-1] == "్":
|
|
200
|
+
result.pop()
|
|
572
201
|
|
|
573
|
-
|
|
574
|
-
result = result.replace(cluster, telugu)
|
|
202
|
+
return "".join(result)
|
|
575
203
|
|
|
576
|
-
return result
|
|
577
204
|
|
|
205
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
206
|
+
# Tables
|
|
207
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
578
208
|
|
|
579
|
-
def
|
|
580
|
-
"""
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
def apply_grammar(text: str) -> str:
|
|
590
|
-
"""
|
|
591
|
-
Apply basic grammar (placeholder for now).
|
|
592
|
-
|
|
593
|
-
Future: Add case markers, SOV conversion, etc.
|
|
594
|
-
"""
|
|
595
|
-
# This will call functions from grammar.py
|
|
596
|
-
# For now, just return as-is
|
|
597
|
-
return text
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
def validate_v3_compliance(text: str) -> bool:
|
|
601
|
-
"""
|
|
602
|
-
Validate v3.0 compliance.
|
|
603
|
-
|
|
604
|
-
Check for:
|
|
605
|
-
- No archaic letters (ఱ, ఌ, ౡ, etc.)
|
|
606
|
-
- Modern pronouns
|
|
607
|
-
- Correct patterns
|
|
608
|
-
"""
|
|
609
|
-
# Check for archaic letters
|
|
610
|
-
archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
|
|
611
|
-
for letter in archaic_letters:
|
|
612
|
-
if letter in text:
|
|
613
|
-
print(f"WARNING: Found archaic letter {letter} in '{text}'")
|
|
614
|
-
return False
|
|
615
|
-
|
|
616
|
-
# Check for archaic pronouns
|
|
617
|
-
for archaic in ARCHAIC_PRONOUNS.values():
|
|
618
|
-
if archaic in text:
|
|
619
|
-
print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
|
|
620
|
-
return False
|
|
621
|
-
|
|
622
|
-
return True
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
# ============================================================================
|
|
626
|
-
# SECTION 6: CONVENIENCE FUNCTIONS
|
|
627
|
-
# ============================================================================
|
|
628
|
-
|
|
629
|
-
def transliterate_word(word: str) -> str:
|
|
630
|
-
"""Transliterate a single word."""
|
|
631
|
-
return eng_to_telugu(word)
|
|
209
|
+
def get_geminates():
|
|
210
|
+
"""Explicit geminate mappings."""
|
|
211
|
+
return {
|
|
212
|
+
"kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
|
|
213
|
+
"tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
|
|
214
|
+
"mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
|
|
215
|
+
"vv": "వ్వ", "ss": "స్స", "nn": "న్న",
|
|
216
|
+
# Retroflex geminates via uppercase tokens if used:
|
|
217
|
+
"TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
|
|
218
|
+
}
|
|
632
219
|
|
|
220
|
+
def get_base_consonants(style="modern"):
|
|
221
|
+
"""Modern consonants (no archaic ఱ)."""
|
|
222
|
+
# V4.0.7: Complex clusters 'ksha' and 'jna' removed; handled by the cluster mechanism (Rule 3).
|
|
223
|
+
base = {
|
|
224
|
+
# stops/affricates
|
|
225
|
+
"k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
|
|
226
|
+
"c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
|
|
227
|
+
"t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
|
|
228
|
+
# retroflex (UPPER tokens are preserved by V4.0.8 fix)
|
|
229
|
+
"T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
|
|
230
|
+
# labials
|
|
231
|
+
"p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
|
|
232
|
+
# sonorants
|
|
233
|
+
"y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
|
|
234
|
+
# sibilants/h
|
|
235
|
+
"sh": "శ", # palatal ś
|
|
236
|
+
"S": "ష", # retroflex ṣ
|
|
237
|
+
"s": "స",
|
|
238
|
+
"h": "హ",
|
|
239
|
+
}
|
|
240
|
+
return base
|
|
241
|
+
|
|
242
|
+
def get_base_vowels(style="modern"):
|
|
243
|
+
"""Vowel letters."""
|
|
244
|
+
return {
|
|
245
|
+
# short
|
|
246
|
+
"a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
|
|
247
|
+
# long
|
|
248
|
+
"aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
|
|
249
|
+
# diphthongs
|
|
250
|
+
"ai": "ఐ", "au": "ఔ",
|
|
251
|
+
# special marks / vocalics
|
|
252
|
+
"am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
|
|
253
|
+
}
|
|
633
254
|
|
|
634
|
-
def
|
|
635
|
-
"""
|
|
636
|
-
|
|
637
|
-
|
|
255
|
+
def get_base_matras(style="modern"):
|
|
256
|
+
"""Dependent vowel signs (matras)."""
|
|
257
|
+
return {
|
|
258
|
+
"a": "",
|
|
259
|
+
"aa": "ా", "i": "ి", "ii": "ీ",
|
|
260
|
+
"u": "ు", "uu": "ూ",
|
|
261
|
+
"e": "ె", "ee": "ే",
|
|
262
|
+
"o": "ొ", "oo": "ో",
|
|
263
|
+
"ai": "ై", "au": "ౌ",
|
|
264
|
+
"am": "ం", "ah": "ః",
|
|
265
|
+
"ri": "ృ", "rii": "ౄ",
|
|
266
|
+
}
|
|
638
267
|
|
|
268
|
+
def get_clusters(style="modern"):
|
|
269
|
+
"""Common consonant clusters in token space."""
|
|
270
|
+
return {
|
|
271
|
+
# 4
|
|
272
|
+
"ksha": ["k", "S"], # k + ṣa → క్ష
|
|
273
|
+
"shra": ["S", "r"],
|
|
274
|
+
"shna": ["S", "n"],
|
|
275
|
+
"jna": ["j", "n"],
|
|
276
|
+
# 3
|
|
277
|
+
"tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
|
|
278
|
+
"bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
|
|
279
|
+
"ntr": ["n", "t", "r"], "ndr": ["n", "d", "r"],
|
|
280
|
+
# 2 (r/l/v clusters etc.)
|
|
281
|
+
"kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
|
|
282
|
+
"gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
|
|
283
|
+
"vr": ["v", "r"], "sr": ["s", "r"], "nr": ["n", "r"],
|
|
284
|
+
"kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
|
|
285
|
+
"kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
|
|
286
|
+
"tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
|
|
287
|
+
}
|
|
639
288
|
|
|
640
|
-
# ============================================================================
|
|
641
|
-
# SECTION 7: PUBLIC API
|
|
642
|
-
# ============================================================================
|
|
643
289
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
290
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
291
|
+
# Public API
|
|
292
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
293
|
+
|
|
294
|
+
def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
|
|
295
|
+
if text is None:
|
|
296
|
+
raise ValueError("Input text cannot be None")
|
|
297
|
+
if not isinstance(text, str):
|
|
298
|
+
raise TypeError(f"Expected str, got {type(text).__name__}")
|
|
299
|
+
s = text.strip()
|
|
300
|
+
if not s:
|
|
301
|
+
return ""
|
|
302
|
+
if len(s) > 10000:
|
|
303
|
+
raise ValueError("Input text too long (max 10000 characters)")
|
|
304
|
+
|
|
305
|
+
rules = {
|
|
306
|
+
"consonants": get_base_consonants(),
|
|
307
|
+
"vowels": get_base_vowels(),
|
|
308
|
+
"matras": get_base_matras(),
|
|
309
|
+
"clusters": get_clusters(),
|
|
310
|
+
"geminates": get_geminates(),
|
|
311
|
+
"strip_final_virama": strip_final_virama,
|
|
312
|
+
}
|
|
313
|
+
return eng_to_telugu_base(s, rules)
|
|
651
314
|
|
|
652
315
|
|
|
653
|
-
#
|
|
654
|
-
#
|
|
655
|
-
#
|
|
316
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
317
|
+
# Tests (updated for v4.0.8)
|
|
318
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
656
319
|
|
|
657
320
|
if __name__ == "__main__":
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
("
|
|
665
|
-
("
|
|
321
|
+
print("=" * 80)
|
|
322
|
+
print("TELUGU LIBRARY v4.0.8 — REVISED TESTS")
|
|
323
|
+
print("=" * 80)
|
|
324
|
+
|
|
325
|
+
tests = [
|
|
326
|
+
# Geminates
|
|
327
|
+
("pikk", "పిక్క", "kk"),
|
|
328
|
+
("ayya", "అయ్య", "yy"),
|
|
329
|
+
("amma", "అమ్మ", "mm"),
|
|
330
|
+
("chitti", "చిత్తి", "tt"),
|
|
331
|
+
("palli", "పల్లి", "ll"),
|
|
332
|
+
|
|
333
|
+
# Long vowels
|
|
334
|
+
("peeku", "పీకు", "ee→ీ"),
|
|
335
|
+
("taadu", "తాదు", "aa→ా"), # (was 'tadu' in your list)
|
|
336
|
+
("veedu", "వీడు", "ee→ీ"),
|
|
337
|
+
("koodu", "కూడు", "oo/uu"),
|
|
338
|
+
|
|
339
|
+
# Clusters
|
|
340
|
+
("evadra", "ఎవద్ర", "dr"), # minimal form; dialectal 'ఎవడ్రా' if you force ā at end
|
|
341
|
+
("manlini", "మన్లిని", "nl"), # becomes n+l; if you want ll, input 'mallini'
|
|
342
|
+
|
|
343
|
+
# Nasals & specials
|
|
344
|
+
("krishnajinka", "క్రిష్నజింక", "nj"),
|
|
345
|
+
("namste", "నమ్స్తే", "ms"),
|
|
346
|
+
("konda", "కొండ", "nd"), # V4.0.8: Critical test case for retroflex mapping
|
|
347
|
+
|
|
348
|
+
# Basic
|
|
349
|
+
("raamu", "రాము", "aa"),
|
|
350
|
+
("kalki", "కల్కి", "kl"),
|
|
351
|
+
("anja", "అంజ", "nj"),
|
|
352
|
+
|
|
353
|
+
# Retroflex cases (testing case sensitivity)
|
|
354
|
+
("nada", "నద", "n+d (dental)"),
|
|
355
|
+
("naDa", "నఢ", "n+D (retroflex)"),
|
|
356
|
+
("tala", "తల", "t+l (dental)"),
|
|
357
|
+
("Tala", "టల", "T+l (retroflex)"),
|
|
666
358
|
]
|
|
667
359
|
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
print("
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
text = input("> ").strip()
|
|
684
|
-
if text.lower() in ['quit', 'exit', 'q']:
|
|
685
|
-
break
|
|
686
|
-
if text:
|
|
687
|
-
result = eng_to_telugu(text)
|
|
688
|
-
print(f" → {result}\n")
|
|
689
|
-
except KeyboardInterrupt:
|
|
690
|
-
break
|
|
691
|
-
|
|
692
|
-
print("\nTransliteration complete!")
|
|
360
|
+
passed, failed = 0, 0
|
|
361
|
+
for src, exp, note in tests:
|
|
362
|
+
out = eng_to_telugu(src)
|
|
363
|
+
ok = (out == exp)
|
|
364
|
+
print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
|
|
365
|
+
if ok: passed += 1
|
|
366
|
+
else:
|
|
367
|
+
failed += 1
|
|
368
|
+
print(f" expected: {exp}")
|
|
369
|
+
|
|
370
|
+
print("-" * 80)
|
|
371
|
+
total = len(tests)
|
|
372
|
+
print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
|
|
373
|
+
if failed == 0:
|
|
374
|
+
print("🎉 ALL TESTS PASSED! v4.0.8 ready.")
|