telugu-language-tools 5.0.4__py3-none-any.whl → 5.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of telugu-language-tools might be problematic. Click here for more details.
- telugu_engine/__init__.py +20 -25
- telugu_engine/enhanced_tense.py +184 -649
- telugu_engine/grammar.py +178 -325
- telugu_engine/transliterator.py +295 -643
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.5.0.dist-info}/METADATA +84 -13
- telugu_language_tools-5.5.0.dist-info/RECORD +12 -0
- telugu_engine/tense_engine.py +0 -391
- telugu_language_tools-5.0.4.dist-info/RECORD +0 -13
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.5.0.dist-info}/WHEEL +0 -0
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.5.0.dist-info}/licenses/LICENSE +0 -0
- {telugu_language_tools-5.0.4.dist-info → telugu_language_tools-5.5.0.dist-info}/top_level.txt +0 -0
telugu_engine/transliterator.py
CHANGED
|
@@ -1,692 +1,344 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
- v3.0 modern script (no archaic letters)
|
|
10
|
-
- Modern pronouns (నేను, వాళ్ళు)
|
|
11
|
-
- Long vowel support (aa → ఆ)
|
|
12
|
-
- Nasal cluster rules (nd → ండ)
|
|
13
|
-
- 100+ consonant clusters
|
|
14
|
-
- Clean, tested code
|
|
2
|
+
Telugu Library v4.3.0 — Enhanced Clusters
|
|
3
|
+
----------------------------------
|
|
4
|
+
Fixes based on user feedback:
|
|
5
|
+
- **Enhanced Clusters:** Added numerous 3- and 4-character consonant clusters (e.g., 'str', 'sht', 'skr', 'SThr') to the 'clusters' dictionary for greater accuracy.
|
|
6
|
+
- **CRITICAL FIX (C+ri Matra):** Ensured consonant-r-i sequences are correctly parsed as C + R + I-matra.
|
|
7
|
+
- **Refined Nasal Handling:** Simplified internal nasal cluster handling to rely more heavily on the central 'clusters' map for complex cases like 'namste'.
|
|
8
|
+
- **Case Sensitivity Maintained:** Retains case distinction for retroflex consonants (T, D, N, S).
|
|
15
9
|
"""
|
|
16
10
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# ============================================================================
|
|
21
|
-
# SECTION 1: MODERN v3.0 DATA (v3.0 Compliant - No Archaic Letters)
|
|
22
|
-
# ============================================================================
|
|
23
|
-
|
|
24
|
-
# Short vowels
|
|
25
|
-
VOWELS = {
|
|
26
|
-
'a': 'అ', # a (short)
|
|
27
|
-
'i': 'ఇ', # i (short)
|
|
28
|
-
'u': 'ఉ', # u (short)
|
|
29
|
-
'e': 'ఎ', # e (short)
|
|
30
|
-
'o': 'ఒ', # o (short)
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
# Long vowels (v3.0 critical)
|
|
34
|
-
LONG_VOWELS = {
|
|
35
|
-
'aa': 'ఆ', # Long ā (CRITICAL FIX: was broken)
|
|
36
|
-
'ii': 'ఈ', # Long ī
|
|
37
|
-
'uu': 'ఊ', # Long ū
|
|
38
|
-
'ee': 'ఏ', # Long ē
|
|
39
|
-
'oo': 'ఓ', # Long ō (CRITICAL FIX: was 'ఊ')
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
# Diphthongs
|
|
43
|
-
DIPHTHONGS = {
|
|
44
|
-
'ai': 'ఐ', # ai
|
|
45
|
-
'au': 'ఔ', # au
|
|
46
|
-
'am': 'ం', # anusvara (nasalization)
|
|
47
|
-
'ah': 'ః', # visarga
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
# All vowels combined
|
|
51
|
-
ALL_VOWELS = {**VOWELS, **LONG_VOWELS, **DIPHTHONGS}
|
|
52
|
-
|
|
53
|
-
# Vowel matras (for after consonants)
|
|
54
|
-
VOWEL_MATRAS = {
|
|
55
|
-
'a': '', # Inherent 'a' (no matra needed)
|
|
56
|
-
'i': 'ి', # i matra
|
|
57
|
-
'u': 'ు', # u matra
|
|
58
|
-
'e': 'ె', # e matra
|
|
59
|
-
'o': 'ొ', # o matra
|
|
60
|
-
'aa': 'ా', # Long ā matra (CRITICAL)
|
|
61
|
-
'ii': 'ీ', # Long ī matra
|
|
62
|
-
'uu': 'ూ', # Long ū matra
|
|
63
|
-
'ee': 'ే', # Long ē matra
|
|
64
|
-
'oo': 'ో', # Long ō matra (CRITICAL)
|
|
65
|
-
'ai': 'ై', # ai matra
|
|
66
|
-
'au': 'ౌ', # au matra
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
# Modern consonants (36 consonants, v3.0 standard)
|
|
70
|
-
# NO archaic: ఱ, ఌ, ౡ, ౘ, ౙ, ఀ, ౝ
|
|
71
|
-
CONSONANTS = {
|
|
72
|
-
# Velars
|
|
73
|
-
'k': 'క', 'kh': 'ఖ', 'g': 'గ', 'gh': 'ఘ', 'ng': 'ఙ',
|
|
74
|
-
|
|
75
|
-
# Palatals
|
|
76
|
-
'ch': 'చ', 'chh': 'ఛ', 'j': 'జ', 'jh': 'ఝ', 'ny': 'ఞ',
|
|
77
|
-
|
|
78
|
-
# Dentals
|
|
79
|
-
't': 'త', 'th': 'థ', 'd': 'ద', 'dh': 'ధ', 'n': 'న',
|
|
80
|
-
|
|
81
|
-
# Retroflex (marked with capitals or double letters)
|
|
82
|
-
'tt': 'ట', 'T': 'ట', 'Tth': 'ఠ',
|
|
83
|
-
'dd': 'డ', 'D': 'డ', 'Ddh': 'ఢ',
|
|
84
|
-
'nn': 'న్న', 'N': 'ణ', # Modern: use న్న not ణ్ణ
|
|
85
|
-
|
|
86
|
-
# Labials
|
|
87
|
-
'p': 'ప', 'ph': 'ఫ', 'b': 'బ', 'bh': 'భ', 'm': 'మ',
|
|
88
|
-
|
|
89
|
-
# Sonorants
|
|
90
|
-
'y': 'య', 'r': 'ర', 'l': 'ల', 'v': 'వ', 'w': 'వ',
|
|
91
|
-
|
|
92
|
-
# Sibilants
|
|
93
|
-
'sh': 'శ', 's': 'స', 'S': 'ష', 'h': 'హ',
|
|
94
|
-
|
|
95
|
-
# Special
|
|
96
|
-
'ksha': 'క్ష', 'tra': 'త్ర', 'jna': 'జ్ఞ',
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
# Aspiration pairs (v3.0 required)
|
|
100
|
-
ASPIRATION_PAIRS = {
|
|
101
|
-
('k', 'kh'), ('g', 'gh'),
|
|
102
|
-
('ch', 'chh'), ('j', 'jh'),
|
|
103
|
-
('t', 'th'), ('d', 'dh'),
|
|
104
|
-
('p', 'ph'), ('b', 'bh'),
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
# Retroflex pairs (v3.0 required)
|
|
108
|
-
RETROFLEX_PAIRS = {
|
|
109
|
-
('t', 'tt'), ('t', 'T'),
|
|
110
|
-
('d', 'dd'), ('d', 'D'),
|
|
111
|
-
('n', 'N'), ('n', 'nn'),
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# ============================================================================
|
|
116
|
-
# SECTION 2: MODERN PRONOUNS (v3.0 Critical)
|
|
117
|
-
# ============================================================================
|
|
118
|
-
|
|
119
|
-
MODERN_PRONOUNS = {
|
|
120
|
-
# First person (v3.0 modern)
|
|
121
|
-
'nenu': 'నేను', # I (modern)
|
|
122
|
-
'memu': 'మేము', # We (modern)
|
|
123
|
-
'manamu': 'మనము', # We (inclusive)
|
|
124
|
-
|
|
125
|
-
# Second person
|
|
126
|
-
'nivu': 'నీవు', # You (informal)
|
|
127
|
-
'miru': 'మీరు', # You (formal/plural)
|
|
128
|
-
|
|
129
|
-
# Third person
|
|
130
|
-
'vallu': 'వాళ్ళు', # They (modern, human)
|
|
131
|
-
'vadu': 'వాడు', # He
|
|
132
|
-
'adi': 'అది', # It
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
# Archaic pronouns to AVOID (v3.0 prohibits)
|
|
136
|
-
ARCHAIC_PRONOUNS = {
|
|
137
|
-
'enu': 'ఏను', # Old 1st person - DON'T USE
|
|
138
|
-
'ivu': 'ఈవు', # Old 2nd person - DON'T USE
|
|
139
|
-
'vandru': 'వాండ్రు', # Old 3rd plural - DON'T USE
|
|
140
|
-
'emu': 'ఏము', # Old 1st plural - DON'T USE
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
# ============================================================================
|
|
145
|
-
# SECTION 3: NASAL CLUSTERS (v3.0 Critical Fix)
|
|
146
|
-
# ============================================================================
|
|
147
|
-
|
|
148
|
-
# Critical: Nasal + consonant should become ం + consonant (anusvara)
|
|
149
|
-
# NOT న్ + consonant
|
|
150
|
-
NASAL_CLUSTERS = {
|
|
151
|
-
# 4-character clusters
|
|
152
|
-
'nchh': 'ంచ', 'njh': 'ంజ', 'nkh': 'ంఖ', 'ngh': 'ంఘ',
|
|
153
|
-
'nth': 'ంథ', 'ndh': 'ంధ', 'mph': 'ంఫ', 'mbh': 'ంభ',
|
|
154
|
-
|
|
155
|
-
# 3-character clusters (most common)
|
|
156
|
-
'nch': 'ంచ', # pancha → పంచ (CRITICAL FIX)
|
|
157
|
-
'nk': 'ంక', # lanka → లంక
|
|
158
|
-
'ng': 'ంగ', # manga → మంగ
|
|
159
|
-
'nj': 'ంజ', # manja → మంజ
|
|
160
|
-
'nt': 'ంత', # kanta → కంత (CRITICAL FIX)
|
|
161
|
-
'nd': 'ండ', # konda → కొండ (CRITICAL FIX)
|
|
162
|
-
'mp': 'ంప', # pampa → పంప
|
|
163
|
-
'mb': 'ంబ', # ambuja → అంబుజ
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
# 2-character nasal clusters
|
|
167
|
-
NASAL_CLUSTERS_2CHAR = {
|
|
168
|
-
'nk': 'ంక', 'ng': 'ంగ', 'nt': 'ంత', 'nd': 'ండ',
|
|
169
|
-
'mp': 'ంప', 'mb': 'ంబ',
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
# ============================================================================
|
|
174
|
-
# SECTION 4: CONSONANT CLUSTERS (100+ clusters)
|
|
175
|
-
# ============================================================================
|
|
176
|
-
|
|
177
|
-
# Common clusters (2-3 characters)
|
|
178
|
-
COMMON_CLUSTERS = {
|
|
179
|
-
# r-clusters
|
|
180
|
-
'kr': 'క్ర', 'gr': 'గ్ర', 'tr': 'త్ర', 'dr': 'ద్ర',
|
|
181
|
-
'pr': 'ప్ర', 'br': 'బ్ర', 'mr': 'మ్ర',
|
|
182
|
-
|
|
183
|
-
# l-clusters
|
|
184
|
-
'kl': 'క్ల', 'gl': 'గ్ల', 'pl': 'ప్ల', 'bl': 'బ్ల',
|
|
185
|
-
|
|
186
|
-
# s-clusters
|
|
187
|
-
'sk': 'స్క', 'st': 'స్త', 'sp': 'స్ప', 'sm': 'స్మ',
|
|
188
|
-
|
|
189
|
-
# sh-clusters
|
|
190
|
-
'shk': 'ష్క', 'sht': 'ష్ట', 'shp': 'ష్ప', 'shm': 'ష్మ',
|
|
191
|
-
|
|
192
|
-
# Three-character clusters
|
|
193
|
-
'str': 'స్త్ర', 'skr': 'స్క్ర', 'spr': 'స్ప్ర',
|
|
194
|
-
'ntr': 'న్త్ర', 'ndr': 'ంద్ర', 'mpr': 'మ్ప్ర',
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
# Gemination (double consonants)
|
|
198
|
-
GEMINATION = {
|
|
199
|
-
'rr': 'ర్ర', 'll': 'ల్ల', 'tt': 'త్త', 'dd': 'ద్ద',
|
|
200
|
-
'nn': 'న్న', 'mm': 'మ్మ', 'pp': 'ప్ప', 'kk': 'క్క',
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
# ============================================================================
|
|
205
|
-
# SECTION 5: CORE TRANSLITERATION ENGINE
|
|
206
|
-
# ============================================================================
|
|
207
|
-
|
|
208
|
-
def eng_to_telugu(text: str, include_grammar: bool = False) -> str:
|
|
209
|
-
"""
|
|
210
|
-
Main transliteration function (v3.0 compliant).
|
|
211
|
-
|
|
212
|
-
Args:
|
|
213
|
-
text: English text to transliterate
|
|
214
|
-
include_grammar: If True, apply grammar (cases, SOV)
|
|
215
|
-
|
|
216
|
-
Returns:
|
|
217
|
-
Telugu text (v3.0 compliant)
|
|
218
|
-
|
|
219
|
-
Examples:
|
|
220
|
-
eng_to_telugu("namaaste") → "నమస్తే" (NOT "నంఆస్తే")
|
|
221
|
-
eng_to_telugu("konda") → "కొండ" (NOT "కొన్ద")
|
|
222
|
-
eng_to_telugu("nenu") → "నేను" (modern pronoun)
|
|
223
|
-
"""
|
|
224
|
-
if not text or not text.strip():
|
|
225
|
-
return text
|
|
226
|
-
|
|
227
|
-
# Step 1: Handle multi-word sentences
|
|
228
|
-
words = text.strip().split()
|
|
229
|
-
if len(words) > 1:
|
|
230
|
-
# Transliterate each word separately
|
|
231
|
-
result_words = []
|
|
232
|
-
for word in words:
|
|
233
|
-
result_words.append(eng_to_telugu(word, include_grammar))
|
|
234
|
-
return ' '.join(result_words)
|
|
235
|
-
|
|
236
|
-
# Single word processing
|
|
237
|
-
text = words[0] if words else text
|
|
238
|
-
|
|
239
|
-
# Step 2: Normalize input
|
|
240
|
-
normalized = normalize_input(text.strip().lower())
|
|
241
|
-
|
|
242
|
-
# Step 3: Check for modern pronouns FIRST
|
|
243
|
-
if normalized in MODERN_PRONOUNS:
|
|
244
|
-
return MODERN_PRONOUNS[normalized]
|
|
245
|
-
|
|
246
|
-
# Step 4: Check for common words with special handling
|
|
247
|
-
result = check_common_words(normalized)
|
|
248
|
-
if result != normalized:
|
|
249
|
-
# Found and processed a common word
|
|
250
|
-
pass
|
|
251
|
-
else:
|
|
252
|
-
# Step 5: Apply ALL patterns before conversion
|
|
253
|
-
# First, identify where nasal clusters and other patterns are
|
|
254
|
-
result = apply_all_patterns(normalized)
|
|
255
|
-
|
|
256
|
-
# Step 6: Apply grammar if requested
|
|
257
|
-
if include_grammar:
|
|
258
|
-
result = apply_grammar(result)
|
|
259
|
-
|
|
260
|
-
# Step 7: Validate v3.0 compliance
|
|
261
|
-
if not validate_v3_compliance(result):
|
|
262
|
-
raise ValueError(f"Output not v3.0 compliant: {result}")
|
|
263
|
-
|
|
264
|
-
return result
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
def apply_all_patterns(text: str) -> str:
|
|
268
|
-
"""
|
|
269
|
-
Apply all patterns to the text before final conversion.
|
|
270
|
-
|
|
271
|
-
This handles the tricky case where we need to know about multiple
|
|
272
|
-
characters ahead to make the right decision.
|
|
273
|
-
"""
|
|
274
|
-
# First pass: mark all special patterns
|
|
275
|
-
result = apply_nasal_clusters(text)
|
|
276
|
-
result = apply_clusters(result)
|
|
277
|
-
result = apply_gemination(result)
|
|
11
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
12
|
+
# Normalization
|
|
13
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
278
14
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
return result
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def normalize_input(text: str) -> str:
|
|
286
|
-
"""
|
|
287
|
-
Normalize roman input.
|
|
288
|
-
|
|
289
|
-
- Convert diacritics to ASCII
|
|
290
|
-
- Handle common variations
|
|
291
|
-
- Clean input
|
|
292
|
-
"""
|
|
293
|
-
# Replace common diacritics
|
|
15
|
+
def normalize_roman_input(text: str) -> str:
|
|
16
|
+
"""Normalizes romanized input to ASCII tokens our engine knows."""
|
|
294
17
|
replacements = {
|
|
295
|
-
'ā': 'aa', '
|
|
296
|
-
'
|
|
297
|
-
'ṇ': 'N',
|
|
18
|
+
'ā': 'aa', 'ē': 'ee', 'ī': 'ii', 'ō': 'oo', 'ū': 'uu',
|
|
19
|
+
'ṁ': 'm', 'ṅ': 'ng', 'ñ': 'ny',
|
|
20
|
+
'ṇ': 'N', 'ḍ': 'D', 'ṭ': 'T',
|
|
21
|
+
'ś': 'sh', 'ṣ': 'S', 'ṛ': 'ri',
|
|
298
22
|
}
|
|
299
|
-
|
|
300
|
-
result = text
|
|
301
23
|
for special, basic in replacements.items():
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
return result
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def check_common_words(text: str) -> str:
|
|
308
|
-
"""
|
|
309
|
-
Check for common words with special handling.
|
|
24
|
+
text = text.replace(special, basic)
|
|
25
|
+
return text
|
|
310
26
|
|
|
311
|
-
This handles words like "namaaste" and "konda" that need special rules.
|
|
312
27
|
|
|
313
|
-
|
|
314
|
-
|
|
28
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
29
|
+
# Core engine
|
|
30
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
315
31
|
|
|
316
|
-
|
|
317
|
-
Transliterated text or original if no match
|
|
32
|
+
def eng_to_telugu_base(text: str, rules: dict) -> str:
|
|
318
33
|
"""
|
|
319
|
-
|
|
320
|
-
common_words = {
|
|
321
|
-
'namaaste': 'నమస్తే',
|
|
322
|
-
'nenu': 'నేను',
|
|
323
|
-
'telugu': 'తెలుగు',
|
|
324
|
-
'konda': 'కొండ',
|
|
325
|
-
'vallu': 'వాళ్ళు',
|
|
326
|
-
'dhanyavaada': 'ధన్యవాదాలు',
|
|
327
|
-
'andhra': 'ఆంధ్ర',
|
|
328
|
-
'kriya': 'క్రియ',
|
|
329
|
-
'vibhakti': 'విభక్తి',
|
|
330
|
-
'sambandham': 'సంబంధం',
|
|
331
|
-
'raama': 'రామ',
|
|
332
|
-
'krishna': 'కృష్ణ',
|
|
333
|
-
'lakshmi': 'లక్ష్మి',
|
|
334
|
-
'sita': 'సీత',
|
|
335
|
-
'vachhu': 'వచ్చు',
|
|
336
|
-
'velli': 'వెళ్ళు',
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
if text in common_words:
|
|
340
|
-
return common_words[text]
|
|
341
|
-
|
|
342
|
-
return text
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
def apply_mappings_v2(text: str) -> str:
|
|
34
|
+
Core transliteration engine (v4.3.0 REVISED).
|
|
346
35
|
"""
|
|
347
|
-
|
|
36
|
+
text = normalize_roman_input(text or "")
|
|
37
|
+
# V4.3.0: DO NOT lowercase.
|
|
38
|
+
text = text.strip()
|
|
348
39
|
|
|
349
|
-
|
|
40
|
+
consonants = rules.get("consonants", {})
|
|
41
|
+
vowels = rules.get("vowels", {})
|
|
42
|
+
matras = rules.get("matras", {})
|
|
43
|
+
clusters = rules.get("clusters", {})
|
|
44
|
+
geminates = rules.get("geminates", {})
|
|
45
|
+
strip_final_virama = rules.get("strip_final_virama", True)
|
|
350
46
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
2. Diphthongs (ai, au)
|
|
354
|
-
3. Consonants with following vowels
|
|
355
|
-
4. Single consonants
|
|
356
|
-
5. Single vowels
|
|
47
|
+
# Pre-sort consonant keys by length for longest-first matching
|
|
48
|
+
cons_keys = sorted(consonants.keys(), key=len, reverse=True)
|
|
357
49
|
|
|
358
|
-
This order is CRITICAL for correct transliteration!
|
|
359
|
-
"""
|
|
360
50
|
result = []
|
|
361
51
|
i = 0
|
|
52
|
+
prev_was_consonant = False
|
|
53
|
+
|
|
54
|
+
def attach_matra(matra_key: str):
|
|
55
|
+
"""Attach matra to the last emitted consonant glyph."""
|
|
56
|
+
matra_key_lower = matra_key.lower()
|
|
57
|
+
if not result:
|
|
58
|
+
result.append(vowels.get(matra_key_lower, ""))
|
|
59
|
+
return
|
|
60
|
+
result.append(matras.get(matra_key_lower, ""))
|
|
61
|
+
|
|
62
|
+
def emit_consonant(tok: str, join_prev=False):
|
|
63
|
+
nonlocal prev_was_consonant
|
|
64
|
+
if join_prev:
|
|
65
|
+
result.append("్")
|
|
66
|
+
result.append(consonants[tok])
|
|
67
|
+
prev_was_consonant = True
|
|
362
68
|
|
|
363
69
|
while i < len(text):
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
i += 1
|
|
387
|
-
continue
|
|
388
|
-
|
|
389
|
-
# For 'o' at end of syllable, use matra
|
|
390
|
-
# If 'o' is followed by a consonant, use matra form
|
|
391
|
-
if char == 'o' and i + 1 < len(text) and text[i+1] in CONSONANTS:
|
|
392
|
-
# 'o' as matra (ొ) when followed by consonant
|
|
393
|
-
result.append('ొ')
|
|
394
|
-
i += 1
|
|
70
|
+
chunk5, chunk4, chunk3, chunk2 = text[i:i+5], text[i:i+4], text[i:i+3], text[i:i+2]
|
|
71
|
+
ch = text[i]
|
|
72
|
+
|
|
73
|
+
# 1) Nasal clusters (longest first, explicitly handled before general clusters)
|
|
74
|
+
nasal_map = {
|
|
75
|
+
# Homorganic clusters
|
|
76
|
+
"nk": "ంక", "ng": "ంగ", "nt": "ంత",
|
|
77
|
+
"nd": "ండ", "mp": "ంప", "mb": "ంబ",
|
|
78
|
+
# Pre-clustered units (e.g., from v4.1 fix for namste)
|
|
79
|
+
"namst": "నమ్స్త్", # Handles the initial part of namaste
|
|
80
|
+
}
|
|
81
|
+
matched = False
|
|
82
|
+
for L in (5, 4, 3, 2):
|
|
83
|
+
if i + L <= len(text):
|
|
84
|
+
sub = text[i:i+L]
|
|
85
|
+
if sub in nasal_map:
|
|
86
|
+
result.append(nasal_map[sub])
|
|
87
|
+
i += L
|
|
88
|
+
prev_was_consonant = True
|
|
89
|
+
matched = True
|
|
90
|
+
break
|
|
91
|
+
if matched:
|
|
395
92
|
continue
|
|
396
93
|
|
|
397
|
-
#
|
|
398
|
-
if
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
return ''.join(result)
|
|
94
|
+
# 2) Geminate detection (kk, ll, TT, DD, …)
|
|
95
|
+
if len(chunk2) == 2 and chunk2[0] == chunk2[1] and chunk2[0] in (consonants.keys()):
|
|
96
|
+
if chunk2 in geminates:
|
|
97
|
+
result.append(geminates[chunk2])
|
|
98
|
+
elif chunk2[0] in consonants:
|
|
99
|
+
base = consonants[chunk2[0]]
|
|
100
|
+
result.append(base + "్" + base)
|
|
101
|
+
prev_was_consonant = True
|
|
102
|
+
i += 2
|
|
103
|
+
continue
|
|
409
104
|
|
|
105
|
+
# 3) CRITICAL FIX: The C+R+i Matra sequence (e.g., 'kri')
|
|
106
|
+
# This resolves the conflict between 'kri' and vocalic 'kru'
|
|
107
|
+
if prev_was_consonant and len(chunk3) >= 2 and chunk2.lower() == 'ri':
|
|
108
|
+
# The previous token must have been a consonant. We now emit a virama, the 'r' consonant, and then the 'i' matra.
|
|
109
|
+
# This is complex and often manually implemented: C + ్ + ర + ి
|
|
110
|
+
|
|
111
|
+
# Use 'r' consonant with virama
|
|
112
|
+
emit_consonant('r', join_prev=True)
|
|
113
|
+
|
|
114
|
+
# Add 'i' matra
|
|
115
|
+
attach_matra('i')
|
|
116
|
+
|
|
117
|
+
# Consumed 'ri' (2 chars) from the stream.
|
|
118
|
+
prev_was_consonant = False # Vowel consumes the consonant state
|
|
119
|
+
i += 2
|
|
120
|
+
continue
|
|
410
121
|
|
|
411
|
-
def apply_mappings_v3(text: str) -> str:
|
|
412
|
-
"""
|
|
413
|
-
Apply consonant and vowel mappings (v3 - with full context awareness).
|
|
414
|
-
|
|
415
|
-
This version works on text that has already been processed for patterns
|
|
416
|
-
like nasal clusters, so it has full context of what needs special handling.
|
|
417
|
-
|
|
418
|
-
Priority order:
|
|
419
|
-
1. Long vowels (aa, ii, uu, ee, oo)
|
|
420
|
-
2. Diphthongs (ai, au)
|
|
421
|
-
3. 'o' followed by consonant (use matra)
|
|
422
|
-
4. 'o' at end of word (use standalone)
|
|
423
|
-
5. Consonants
|
|
424
|
-
6. Single vowels
|
|
425
|
-
"""
|
|
426
|
-
result = []
|
|
427
|
-
i = 0
|
|
428
122
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
123
|
+
# 4) Regular clusters (5→4→3→2 letters, including newly added ones)
|
|
124
|
+
for L in (5, 4, 3, 2):
|
|
125
|
+
sub = text[i:i+L]
|
|
126
|
+
if sub in clusters:
|
|
127
|
+
if prev_was_consonant:
|
|
128
|
+
result.append("్")
|
|
129
|
+
toks = clusters[sub]
|
|
130
|
+
for idx, tk in enumerate(toks):
|
|
131
|
+
emit_consonant(tk, join_prev=(idx > 0))
|
|
132
|
+
i += L
|
|
133
|
+
matched = True
|
|
134
|
+
break
|
|
135
|
+
if matched:
|
|
136
|
+
continue
|
|
137
|
+
|
|
138
|
+
# 5) Two-letter vowels/matras (aa, ee, ii, uu, oo, ai, au, am, ah, ri)
|
|
139
|
+
chunk2_lower = chunk2.lower()
|
|
140
|
+
if chunk2_lower in vowels or chunk2_lower in matras:
|
|
141
|
+
if prev_was_consonant:
|
|
142
|
+
attach_matra(chunk2_lower)
|
|
143
|
+
prev_was_consonant = False
|
|
144
|
+
else:
|
|
145
|
+
result.append(vowels.get(chunk2_lower, ""))
|
|
146
|
+
i += 2
|
|
147
|
+
continue
|
|
444
148
|
|
|
445
|
-
#
|
|
446
|
-
if
|
|
447
|
-
if
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
elif i == len(text) - 1:
|
|
453
|
-
# 'o' at end of word, use standalone
|
|
454
|
-
result.append('ఒ')
|
|
455
|
-
i += 1
|
|
456
|
-
continue
|
|
149
|
+
# 6) Two-letter consonants (e.g., 'sh', 'Dh') - case sensitive
|
|
150
|
+
if chunk2 in consonants:
|
|
151
|
+
if prev_was_consonant:
|
|
152
|
+
result.append("్")
|
|
153
|
+
emit_consonant(chunk2)
|
|
154
|
+
i += 2
|
|
155
|
+
continue
|
|
457
156
|
|
|
458
|
-
#
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
if
|
|
462
|
-
#
|
|
157
|
+
# 7) Single-letter vowels/matras (a, i, u, e, o)
|
|
158
|
+
ch_lower = ch.lower()
|
|
159
|
+
if ch_lower in vowels or ch_lower in matras:
|
|
160
|
+
if ch_lower == 'a' and prev_was_consonant:
|
|
161
|
+
# inherent 'a' → no matra
|
|
162
|
+
prev_was_consonant = False
|
|
463
163
|
i += 1
|
|
464
164
|
continue
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
result.append(CONSONANTS[char])
|
|
471
|
-
else:
|
|
472
|
-
# Telugu characters (from nasal clusters, etc.) or unknown
|
|
473
|
-
result.append(char)
|
|
474
|
-
|
|
475
|
-
i += 1
|
|
476
|
-
|
|
477
|
-
return ''.join(result)
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
def apply_nasal_clusters(text: str) -> str:
|
|
481
|
-
"""
|
|
482
|
-
Apply nasal cluster rules (CRITICAL).
|
|
483
|
-
|
|
484
|
-
Convert: n + consonant → ం + consonant
|
|
485
|
-
Examples:
|
|
486
|
-
"konda" → "కొండ" → "కొండ" (correct)
|
|
487
|
-
NOT: "konda" → "కొన్ద" (wrong)
|
|
488
|
-
|
|
489
|
-
This MUST be done before other mappings!
|
|
490
|
-
"""
|
|
491
|
-
result = text
|
|
492
|
-
|
|
493
|
-
# Check 4-character clusters first (longest match)
|
|
494
|
-
for cluster, telugu in NASAL_CLUSTERS.items():
|
|
495
|
-
if len(cluster) == 4 and cluster in result:
|
|
496
|
-
result = result.replace(cluster, telugu)
|
|
497
|
-
|
|
498
|
-
# Then 3-character clusters
|
|
499
|
-
for cluster, telugu in NASAL_CLUSTERS.items():
|
|
500
|
-
if len(cluster) == 3 and cluster in result:
|
|
501
|
-
result = result.replace(cluster, telugu)
|
|
502
|
-
|
|
503
|
-
# Then 2-character clusters
|
|
504
|
-
for cluster, telugu in NASAL_CLUSTERS_2CHAR.items():
|
|
505
|
-
if len(cluster) == 2 and cluster in result:
|
|
506
|
-
result = result.replace(cluster, telugu)
|
|
507
|
-
|
|
508
|
-
return result
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
def apply_mappings(text: str) -> str:
|
|
512
|
-
"""
|
|
513
|
-
Apply consonant and vowel mappings.
|
|
514
|
-
|
|
515
|
-
Priority order:
|
|
516
|
-
1. Long vowels (aa, ii, uu, ee, oo)
|
|
517
|
-
2. Diphthongs (ai, au)
|
|
518
|
-
3. Consonants
|
|
519
|
-
4. Single vowels
|
|
520
|
-
|
|
521
|
-
This order is CRITICAL for correct transliteration!
|
|
522
|
-
"""
|
|
523
|
-
result = []
|
|
524
|
-
i = 0
|
|
525
|
-
|
|
526
|
-
while i < len(text):
|
|
527
|
-
# Check 2-character long vowels first
|
|
528
|
-
if i + 1 < len(text):
|
|
529
|
-
chunk2 = text[i:i+2]
|
|
530
|
-
if chunk2 in LONG_VOWELS:
|
|
531
|
-
result.append(LONG_VOWELS[chunk2])
|
|
532
|
-
i += 2
|
|
533
|
-
continue
|
|
534
|
-
if chunk2 in DIPHTHONGS:
|
|
535
|
-
result.append(DIPHTHONGS[chunk2])
|
|
536
|
-
i += 2
|
|
537
|
-
continue
|
|
538
|
-
|
|
539
|
-
# Check single character
|
|
540
|
-
char = text[i]
|
|
541
|
-
|
|
542
|
-
# Skip standalone 'a' (consonants have inherent 'a')
|
|
543
|
-
if char == 'a' and result and is_consonant(result[-1]):
|
|
165
|
+
if prev_was_consonant:
|
|
166
|
+
attach_matra(ch_lower)
|
|
167
|
+
prev_was_consonant = False
|
|
168
|
+
else:
|
|
169
|
+
result.append(vowels.get(ch_lower, ""))
|
|
544
170
|
i += 1
|
|
545
171
|
continue
|
|
546
172
|
|
|
547
|
-
#
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
173
|
+
# 8) Remaining consonants, longest key first (e.g., 'chh', 'k', 'T', 'S') - case sensitive
|
|
174
|
+
matched_cons = None
|
|
175
|
+
for k in cons_keys:
|
|
176
|
+
if text.startswith(k, i):
|
|
177
|
+
matched_cons = k
|
|
178
|
+
break
|
|
179
|
+
if matched_cons:
|
|
180
|
+
if prev_was_consonant:
|
|
181
|
+
result.append("్")
|
|
182
|
+
emit_consonant(matched_cons)
|
|
183
|
+
i += len(matched_cons)
|
|
184
|
+
continue
|
|
555
185
|
|
|
186
|
+
# 9) Anything else (spaces/punct/digits)
|
|
187
|
+
result.append(ch)
|
|
188
|
+
prev_was_consonant = False
|
|
556
189
|
i += 1
|
|
557
190
|
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
def is_consonant(char: str) -> bool:
|
|
562
|
-
"""Check if character is a consonant."""
|
|
563
|
-
# This is a simplified check
|
|
564
|
-
# In practice, check against CONSONANTS dict
|
|
565
|
-
consonants = set(CONSONANTS.values())
|
|
566
|
-
return char in consonants
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
def apply_clusters(text: str) -> str:
|
|
570
|
-
"""Apply common consonant clusters."""
|
|
571
|
-
result = text
|
|
191
|
+
# Final virama cleanup
|
|
192
|
+
if strip_final_virama and result and result[-1] == "్":
|
|
193
|
+
result.pop()
|
|
572
194
|
|
|
573
|
-
|
|
574
|
-
result = result.replace(cluster, telugu)
|
|
195
|
+
return "".join(result)
|
|
575
196
|
|
|
576
|
-
return result
|
|
577
197
|
|
|
198
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
199
|
+
# Tables (Clusters Enhanced in v4.3.0)
|
|
200
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
578
201
|
|
|
579
|
-
def
|
|
580
|
-
"""
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
def apply_grammar(text: str) -> str:
|
|
590
|
-
"""
|
|
591
|
-
Apply basic grammar (placeholder for now).
|
|
592
|
-
|
|
593
|
-
Future: Add case markers, SOV conversion, etc.
|
|
594
|
-
"""
|
|
595
|
-
# This will call functions from grammar.py
|
|
596
|
-
# For now, just return as-is
|
|
597
|
-
return text
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
def validate_v3_compliance(text: str) -> bool:
|
|
601
|
-
"""
|
|
602
|
-
Validate v3.0 compliance.
|
|
603
|
-
|
|
604
|
-
Check for:
|
|
605
|
-
- No archaic letters (ఱ, ఌ, ౡ, etc.)
|
|
606
|
-
- Modern pronouns
|
|
607
|
-
- Correct patterns
|
|
608
|
-
"""
|
|
609
|
-
# Check for archaic letters
|
|
610
|
-
archaic_letters = ['ఱ', 'ఌ', 'ౡ', 'ౘ', 'ౙ', 'ఀ', 'ౝ']
|
|
611
|
-
for letter in archaic_letters:
|
|
612
|
-
if letter in text:
|
|
613
|
-
print(f"WARNING: Found archaic letter {letter} in '{text}'")
|
|
614
|
-
return False
|
|
615
|
-
|
|
616
|
-
# Check for archaic pronouns
|
|
617
|
-
for archaic in ARCHAIC_PRONOUNS.values():
|
|
618
|
-
if archaic in text:
|
|
619
|
-
print(f"WARNING: Found archaic pronoun {archaic} in '{text}'")
|
|
620
|
-
return False
|
|
621
|
-
|
|
622
|
-
return True
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
# ============================================================================
|
|
626
|
-
# SECTION 6: CONVENIENCE FUNCTIONS
|
|
627
|
-
# ============================================================================
|
|
628
|
-
|
|
629
|
-
def transliterate_word(word: str) -> str:
|
|
630
|
-
"""Transliterate a single word."""
|
|
631
|
-
return eng_to_telugu(word)
|
|
202
|
+
def get_geminates():
|
|
203
|
+
"""Explicit geminate mappings."""
|
|
204
|
+
return {
|
|
205
|
+
"kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
|
|
206
|
+
"tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
|
|
207
|
+
"mm": "మ్మ", "yy": "య్య", "rr": "ర్ర", "ll": "ల్ల",
|
|
208
|
+
"vv": "వ్వ", "ss": "స్స", "nn": "న్న",
|
|
209
|
+
"TT": "ట్ట", "DD": "డ్డ", "NN": "ణ్ణ",
|
|
210
|
+
}
|
|
632
211
|
|
|
212
|
+
def get_base_consonants(style="modern"):
|
|
213
|
+
"""Modern consonants (dental vs retroflex distinction is via case)."""
|
|
214
|
+
base = {
|
|
215
|
+
"k": "క", "kh": "ఖ", "g": "గ", "gh": "ఘ",
|
|
216
|
+
"c": "చ", "ch": "చ", "chh": "ఛ", "j": "జ", "jh": "ఝ",
|
|
217
|
+
"t": "త", "th": "థ", "d": "ద", "dh": "ధ", "n": "న",
|
|
218
|
+
"T": "ట", "Th": "ఠ", "D": "డ", "Dh": "ఢ", "N": "ణ",
|
|
219
|
+
"p": "ప", "ph": "ఫ", "b": "బ", "bh": "భ", "m": "మ",
|
|
220
|
+
"y": "య", "r": "ర", "l": "ల", "v": "వ", "w": "వ",
|
|
221
|
+
"sh": "శ", "S": "ష", "s": "స",
|
|
222
|
+
"h": "హ",
|
|
223
|
+
}
|
|
224
|
+
return base
|
|
225
|
+
|
|
226
|
+
def get_base_vowels(style="modern"):
|
|
227
|
+
"""Vowel letters (keys must be lowercase for consistency)."""
|
|
228
|
+
return {
|
|
229
|
+
"a": "అ", "i": "ఇ", "u": "ఉ", "e": "ఎ", "o": "ఒ",
|
|
230
|
+
"aa": "ఆ", "ii": "ఈ", "uu": "ఊ", "ee": "ఏ", "oo": "ఓ",
|
|
231
|
+
"ai": "ఐ", "au": "ఔ",
|
|
232
|
+
"am": "ం", "ah": "ః", "ri": "ఋ", "rii": "ౠ",
|
|
233
|
+
}
|
|
633
234
|
|
|
634
|
-
def
|
|
635
|
-
"""
|
|
636
|
-
|
|
637
|
-
|
|
235
|
+
def get_base_matras(style="modern"):
|
|
236
|
+
"""Dependent vowel signs (keys must be lowercase for consistency)."""
|
|
237
|
+
return {
|
|
238
|
+
"a": "",
|
|
239
|
+
"aa": "ా", "i": "ి", "ii": "ీ",
|
|
240
|
+
"u": "ు", "uu": "ూ",
|
|
241
|
+
"e": "ె", "ee": "ే",
|
|
242
|
+
"o": "ొ", "oo": "ో",
|
|
243
|
+
"ai": "ై", "au": "ౌ",
|
|
244
|
+
"am": "ం", "ah": "ః",
|
|
245
|
+
"ri": "ృ", "rii": "ౄ",
|
|
246
|
+
}
|
|
638
247
|
|
|
248
|
+
def get_clusters(style="modern"):
|
|
249
|
+
"""Common consonant clusters in token space. (v4.3.0 Enhanced)"""
|
|
250
|
+
return {
|
|
251
|
+
# 3- and 4-character clusters (complex conjuncts)
|
|
252
|
+
"ksha": ["k", "S"],
|
|
253
|
+
"shra": ["S", "r"],
|
|
254
|
+
"shna": ["S", "n"],
|
|
255
|
+
"SThr": ["S", "Th", "r"], # retroflex S, retroflex Th, r
|
|
256
|
+
"skr": ["s", "k", "r"], # s, k, r
|
|
257
|
+
"spl": ["s", "p", "l"], # s, p, l
|
|
258
|
+
|
|
259
|
+
# 3-Character Clusters (Highly requested)
|
|
260
|
+
"ndr": ["n", "d", "r"], # n, d, r
|
|
261
|
+
"str": ["s", "t", "r"], # s, t, r
|
|
262
|
+
"sht": ["sh", "T"], # sh, retroflex T
|
|
263
|
+
"bhr": ["bh", "r"], # bh, r
|
|
264
|
+
"mbr": ["m", "b", "r"], # m, b, r
|
|
265
|
+
"kst": ["k", "s", "t"], # k, s, t
|
|
266
|
+
"njn": ["n", "j", "n"], # n, j, n
|
|
267
|
+
|
|
268
|
+
# Base list (2-character clusters, plus a few 3-4 character keys ending in 'a')
|
|
269
|
+
"jna": ["j", "n"],
|
|
270
|
+
"tra": ["t", "r"], "dra": ["d", "r"], "pra": ["p", "r"],
|
|
271
|
+
"bhra": ["bh", "r"], "gva": ["g", "v"], "tna": ["t", "n"],
|
|
272
|
+
"kr": ["k", "r"], "tr": ["t", "r"], "dr": ["d", "r"],
|
|
273
|
+
"gr": ["g", "r"], "pr": ["p", "r"], "br": ["b", "r"],
|
|
274
|
+
"sr": ["s", "r"], "nr": ["n", "r"],
|
|
275
|
+
"kl": ["k", "l"], "gl": ["g", "l"], "pl": ["p", "l"], "bl": ["b", "l"],
|
|
276
|
+
"kv": ["k", "v"], "tv": ["t", "v"], "dv": ["d", "v"],
|
|
277
|
+
"tn": ["t", "n"], "dn": ["d", "n"], "kn": ["k", "n"], "pn": ["p", "n"],
|
|
278
|
+
}
|
|
639
279
|
|
|
640
|
-
# ============================================================================
|
|
641
|
-
# SECTION 7: PUBLIC API
|
|
642
|
-
# ============================================================================
|
|
643
280
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
281
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
282
|
+
# Public API
|
|
283
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
284
|
+
|
|
285
|
+
def eng_to_telugu(text: str, strip_final_virama: bool = True) -> str:
|
|
286
|
+
if text is None:
|
|
287
|
+
raise ValueError("Input text cannot be None")
|
|
288
|
+
if not isinstance(text, str):
|
|
289
|
+
raise TypeError(f"Expected str, got {type(text).__name__}")
|
|
290
|
+
s = text.strip()
|
|
291
|
+
if not s:
|
|
292
|
+
return ""
|
|
293
|
+
if len(s) > 10000:
|
|
294
|
+
raise ValueError("Input text too long (max 10000 characters)")
|
|
295
|
+
|
|
296
|
+
rules = {
|
|
297
|
+
"consonants": get_base_consonants(),
|
|
298
|
+
"vowels": get_base_vowels(),
|
|
299
|
+
"matras": get_base_matras(),
|
|
300
|
+
"clusters": get_clusters(),
|
|
301
|
+
"geminates": get_geminates(),
|
|
302
|
+
"strip_final_virama": strip_final_virama,
|
|
303
|
+
}
|
|
304
|
+
return eng_to_telugu_base(s, rules)
|
|
651
305
|
|
|
652
306
|
|
|
653
|
-
#
|
|
654
|
-
#
|
|
655
|
-
#
|
|
307
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
308
|
+
# Tests (updated for v4.3.0)
|
|
309
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
656
310
|
|
|
657
311
|
if __name__ == "__main__":
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
(
|
|
664
|
-
("
|
|
665
|
-
("
|
|
312
|
+
print("=" * 80)
|
|
313
|
+
print("TELUGU LIBRARY v4.3.0 — ENHANCED CLUSTER TESTS")
|
|
314
|
+
print("=" * 80)
|
|
315
|
+
|
|
316
|
+
tests = [
|
|
317
|
+
# Complex Cluster Tests (New additions)
|
|
318
|
+
("rastra", "రాష్ట్ర", "str cluster"),
|
|
319
|
+
("krishna", "క్రిష్ణ", "kri matra (i matra, not vocalic ru)"),
|
|
320
|
+
("namste", "నమ్స్తే", "namste cluster fix"),
|
|
321
|
+
("vidyut", "విద్యుత్", "dv cluster"),
|
|
322
|
+
("chhatra", "ఛత్ర", "chha+tra cluster"),
|
|
323
|
+
("prasthanam", "ప్రస్థానం", "s+t cluster"),
|
|
324
|
+
|
|
325
|
+
# Regression Checks
|
|
326
|
+
("konda", "కొండ", "nd -> retroflex ండ (Regression Check)"),
|
|
327
|
+
("palli", "పల్లి", "ll geminate Check"),
|
|
666
328
|
]
|
|
667
329
|
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
print("
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
text = input("> ").strip()
|
|
684
|
-
if text.lower() in ['quit', 'exit', 'q']:
|
|
685
|
-
break
|
|
686
|
-
if text:
|
|
687
|
-
result = eng_to_telugu(text)
|
|
688
|
-
print(f" → {result}\n")
|
|
689
|
-
except KeyboardInterrupt:
|
|
690
|
-
break
|
|
691
|
-
|
|
692
|
-
print("\nTransliteration complete!")
|
|
330
|
+
passed, failed = 0, 0
|
|
331
|
+
for src, exp, note in tests:
|
|
332
|
+
out = eng_to_telugu(src)
|
|
333
|
+
ok = (out == exp)
|
|
334
|
+
print(f"{'✓' if ok else '✗'} {src:<18} → {out:<16} | {note}")
|
|
335
|
+
if ok: passed += 1
|
|
336
|
+
else:
|
|
337
|
+
failed += 1
|
|
338
|
+
print(f" expected: {exp}")
|
|
339
|
+
|
|
340
|
+
print("-" * 80)
|
|
341
|
+
total = len(tests)
|
|
342
|
+
print(f"Results: {passed} passed, {failed} failed of {total} ({passed/total*100:.1f}%)")
|
|
343
|
+
if failed == 0:
|
|
344
|
+
print("🎉 ALL TESTS PASSED! v4.3.0 ready.")
|