telugu-language-tools 5.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of telugu-language-tools might be problematic. Click here for more details.
- telugu_engine/__init__.py +190 -0
- telugu_engine/cli.py +111 -0
- telugu_engine/enhanced_tense.py +854 -0
- telugu_engine/grammar.py +474 -0
- telugu_engine/phonetic_matrix.py +82 -0
- telugu_engine/tense_engine.py +391 -0
- telugu_engine/transliterator.py +692 -0
- telugu_engine/v3_validator.py +413 -0
- telugu_language_tools-5.0.4.dist-info/METADATA +398 -0
- telugu_language_tools-5.0.4.dist-info/RECORD +13 -0
- telugu_language_tools-5.0.4.dist-info/WHEEL +5 -0
- telugu_language_tools-5.0.4.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-5.0.4.dist-info/top_level.txt +1 -0
telugu_engine/grammar.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Modern Telugu Grammar Engine v3.0
|
|
3
|
+
==================================
|
|
4
|
+
|
|
5
|
+
This module provides modern Telugu grammar processing:
|
|
6
|
+
- Modern verb patterns (Past Participle + Person Marker)
|
|
7
|
+
- 4-case system (Nominative, Accusative, Dative, Locative)
|
|
8
|
+
- SOV syntax conversion
|
|
9
|
+
- Vowel harmony enforcement
|
|
10
|
+
- Sandhi rules
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from telugu_engine.grammar import apply_case, conjugate_verb
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import re
from typing import Dict, List, Optional, Union
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ============================================================================
|
|
21
|
+
# SECTION 1: MODERN VERB PATTERNS (v3.0 Critical)
|
|
22
|
+
# ============================================================================
|
|
23
|
+
|
|
24
|
+
# Person markers (v3.0 modern)
|
|
25
|
+
PERSON_MARKERS = {
    # 1st person
    '1ps': 'ఆను',  # I (past)
    '1pp': 'ఆము',  # We (past)

    # 2nd person
    '2ps': 'ఆవు',  # You (informal, past)
    '2pp': 'ఆరు',  # You (formal/plural, past)

    # 3rd person
    '3ps': 'ఆడు',  # He/She/It (past)
    '3pp': 'ఆరు',  # They (past)
    '3pp_alt': 'అవి',  # They (alternative, neuter)
}

# Verb roots (examples), keyed by the romanized root used throughout the API.
VERB_ROOTS = {
    'cheyyu': 'చేయు',  # to do
    'tinu': 'తిను',  # to eat
    'vaddu': 'వడ్డు',  # to come
    'chaduvu': 'చదువు',  # to read
    'raavu': 'రావు',  # to be
}

# Past participles (ROOT + సిన)
# Modern pattern: చేయు + సిన = చేసిన (NOT చేసితి)
# NOTE(review): the 'raavu' entry is glossed "not came", i.e. it looks like a
# negative finite form rather than a participle -- confirm the intended data.
PAST_PARTICIPLES = {
    'cheyyu': 'చేసిన',  # done
    'tinu': 'తిన్న',  # eaten
    'vaddu': 'వచ్చిన',  # came
    'chaduvu': 'చదివిన',  # read
    'raavu': 'రాలేదు',  # not came
}

# Telugu independent vowel letters mapped to the dependent vowel sign that is
# written when the same vowel follows a consonant.  The person markers are
# stored with an independent vowel, so this table is what lets a marker fuse
# with the participle instead of leaving a stray vowel letter mid-word.
_MARKER_VOWEL_SIGNS = {
    '\u0c05': '',        # a  (inherent in the consonant, nothing is written)
    '\u0c06': '\u0c3e',  # aa
    '\u0c07': '\u0c3f',  # i
    '\u0c08': '\u0c40',  # ii
    '\u0c09': '\u0c41',  # u
    '\u0c0a': '\u0c42',  # uu
    '\u0c0e': '\u0c46',  # e
    '\u0c0f': '\u0c47',  # ee
    '\u0c10': '\u0c48',  # ai
    '\u0c12': '\u0c4a',  # o
    '\u0c13': '\u0c4b',  # oo
    '\u0c14': '\u0c4c',  # au
}


def _attach_marker(stem: str, marker: str) -> str:
    """Join *marker* to *stem*, fusing a leading vowel into a vowel sign.

    The fusion only happens when *stem* ends in a bare Telugu consonant
    (U+0C15..U+0C39) and *marker* starts with an independent vowel; in every
    other situation the two strings are simply concatenated.
    """
    if not stem or not marker:
        return stem + marker

    sign = _MARKER_VOWEL_SIGNS.get(marker[0])
    ends_in_consonant = '\u0c15' <= stem[-1] <= '\u0c39'
    if sign is None or not ends_in_consonant:
        return stem + marker

    return stem + sign + marker[1:]


def conjugate_verb(root: str, tense: str, person: str) -> str:
    """
    Conjugate a verb using the modern v3.0 pattern.

    Pattern: PAST PARTICIPLE + PERSON MARKER, where the marker's initial
    vowel is written as a vowel sign on the participle's final consonant.

    Examples:
        conjugate_verb('cheyyu', 'past', '1ps') → 'చేసినాను'
        conjugate_verb('tinu', 'past', '3pp') → 'తిన్నారు'

    OLD (WRONG) pattern: చేసితిని, చేసితిరి
    NEW (CORRECT) pattern: చేసినాను, చేసినారు

    Args:
        root: Romanized verb root used as the lookup key (e.g., 'cheyyu').
        tense: 'past', 'present', 'future'. Only 'past' is conjugated; any
            other value returns the Telugu root from VERB_ROOTS, or *root*
            itself when it is not listed.
        person: '1ps', '1pp', '2ps', '2pp', '3ps', '3pp' (or '3pp_alt').
            An unknown person adds no marker.

    Returns:
        Conjugated verb form.
    """
    if tense != 'past':
        # Present/future are not implemented yet: hand back the bare root.
        return VERB_ROOTS.get(root, root)

    participle = PAST_PARTICIPLES.get(root)
    if participle is None:
        # Unknown root: build ROOT + "ిన".  A final "ు" sign is dropped first
        # so that two vowel signs are never stacked on one consonant.
        # A romanized root just gets the suffix appended, as before.
        stem = root[:-1] if root.endswith('\u0c41') else root
        participle = stem + '\u0c3f\u0c28'

    marker = PERSON_MARKERS.get(person, '')

    # Combine: PARTICIPLE + MARKER
    return _attach_marker(participle, marker)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ============================================================================
|
|
98
|
+
# SECTION 2: 4-CASE SYSTEM (v3.0 Modern)
|
|
99
|
+
# ============================================================================
|
|
100
|
+
|
|
101
|
+
# Case markers (v3.0 simplified - 4 cases in practice)
|
|
102
|
+
CASE_MARKERS = {
    'nominative': 'డు',  # Subject (e.g., రాముడు)
    'accusative': 'ను',  # Direct object
    'dative': 'కు',  # Indirect object
    'locative': 'లో',  # Location
    'genitive': 'యొక్క',  # Possession, written as a separate word (e.g., రాము యొక్క)
}

# Formality markers
FORMALITY_MARKERS = {
    'informal': '',  # Use with friends/family
    'formal': 'గారు',  # Respectful
    'honorific': 'వారు',  # Very respectful
}

# Cases whose marker is a free-standing postposition rather than a suffix;
# these are joined to the noun with a space instead of being glued on.
_DETACHED_CASES = frozenset({'genitive'})


def apply_case(noun: str, case: str, formality: str = 'informal') -> str:
    """
    Apply a case marker (and optional formality marker) to a noun.

    The marker is appended mechanically: no vowel harmony, oblique stems or
    sandhi are applied, so irregular forms such as 'ఇంట్లో' are not produced.

    Args:
        noun: Base noun (e.g., 'రాము')
        case: 'nominative', 'accusative', 'dative', 'locative' or 'genitive'
        formality: 'informal', 'formal', 'honorific'. An unrecognised value
            is treated like 'informal' (no marker is added).

    Returns:
        Noun with case marker, followed by the formality marker if any.

    Raises:
        ValueError: If *case* is not a key of CASE_MARKERS.

    Examples:
        apply_case('రాము', 'nominative') → 'రాముడు'
        apply_case('పుస్తకం', 'accusative') → 'పుస్తకంను'
        apply_case('ఇల్లు', 'locative') → 'ఇల్లులో'
        apply_case('రాము', 'genitive') → 'రాము యొక్క'
    """
    if case not in CASE_MARKERS:
        raise ValueError(f"Invalid case: {case}. Use: {list(CASE_MARKERS.keys())}")

    marker = CASE_MARKERS[case]

    # The genitive postposition is a word of its own, so keep it apart from
    # the noun; every other marker attaches directly.
    separator = ' ' if case in _DETACHED_CASES else ''

    # TODO: Add proper vowel harmony checking before attaching the marker.
    formal = FORMALITY_MARKERS.get(formality, '')

    return noun + separator + marker + formal
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ============================================================================
|
|
154
|
+
# SECTION 3: SOV SYNTAX CONVERSION (v3.0 Critical)
|
|
155
|
+
# ============================================================================
|
|
156
|
+
|
|
157
|
+
# Common English words to identify parts of speech
|
|
158
|
+
# Small closed-class English word lists, grouped by part of speech.
# All entries are lowercase.
POS_PATTERNS = {
    "pronouns": "i you he she it we they".split(),
    "articles": "a an the".split(),
    "prepositions": "in on at to from with by".split(),
}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def convert_svo_to_soV(sentence: str) -> Union[str, Dict[str, str]]:
    """
    Split an English SVO sentence into the parts needed for Telugu SOV order.

    The function does not translate anything; it only labels the words so a
    caller can reorder them (subject, object, verb) and inflect each part.

    Examples:
        convert_svo_to_soV("Ramu reads book")
            → {'subject': 'Ramu', 'object': 'book', 'verb': 'reads'}
        convert_svo_to_soV("Ramu reads")
            → {'subject': 'Ramu', 'object': '', 'verb': 'reads'}

    Heuristic (the input is Subject-Verb-Object):
        1. First word is the subject
        2. Second word is the verb
        3. Everything after the verb is the object

    Args:
        sentence: English sentence (e.g., "Ramu reads book")

    Returns:
        A dict with the keys 'subject', 'object' and 'verb'. When the
        sentence has fewer than two words there is nothing to split and the
        input string is returned unchanged.

    TODO: This is a simplified version. A real implementation would use
    POS tagging for better accuracy.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence

    # English is SVO: the verb directly follows the subject, and whatever
    # trails the verb is the object phrase (possibly empty).
    subject, verb, *object_words = words

    return {
        'subject': subject,
        'object': ' '.join(object_words),
        'verb': verb
    }
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def build_telugu_sentence(subject: str, obj: str, verb: str) -> str:
    """
    Build a Telugu sentence in SOV order from romanized parts.

    Each part is transliterated with `eng_to_telugu` (imported from the
    sibling `transliterator` module), the subject and object receive their
    case markers via `apply_case`, and the parts are joined with spaces.

    Args:
        subject: Subject (will get nominative case)
        obj: Object (will get accusative case); may be empty
        verb: Verb. When it is one of the romanized roots in VERB_ROOTS
            (e.g. 'chaduvu') it is passed through `conjugate_verb`;
            otherwise it is transliterated as given.

    Returns:
        Space-separated sentence: subject, object (if any), verb.

    Example:
        build_telugu_sentence('Ramu', 'pustakam', 'chaduvu')
    """
    # Imported lazily, as before, so importing this module does not pull in
    # the transliterator until a sentence is actually built.
    from .transliterator import eng_to_telugu

    parts = [apply_case(eng_to_telugu(subject), 'nominative')]

    obj_telugu = eng_to_telugu(obj) if obj else ''
    if obj_telugu:
        parts.append(apply_case(obj_telugu, 'accusative'))

    # Use the verb the caller supplied.  Previously the result was always
    # the conjugation of 'chaduvu', whatever verb had been passed in.
    # TODO: Add proper tense/person detection
    verb_key = verb.strip().lower()
    if verb_key in VERB_ROOTS:
        verb_telugu = conjugate_verb(verb_key, 'present', '3ps')
    else:
        verb_telugu = eng_to_telugu(verb)
    parts.append(verb_telugu)

    return ' '.join(parts)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ============================================================================
|
|
248
|
+
# SECTION 4: SANDHI RULES (Native Telugu)
|
|
249
|
+
# ============================================================================
|
|
250
|
+
|
|
251
|
+
# Native Telugu sandhi rules
|
|
252
|
+
# Rule table for native Telugu sandhi.  Each rule carries a regex pattern,
# a replacement template and an illustrative example string.
# NOTE(review): the bracketed vowel classes below contain Latin letters, so
# the patterns only match when a Latin vowel follows the Telugu vowel sign.
NATIVE_SANDHI = dict(
    # Ukārasandhi (u-elision) - MOST FREQUENT in v3.0
    ukarasandhi=dict(
        pattern=r'ు([aeiou])',
        replacement=r'\1',  # drop the 'ు' sign before a vowel
        example='వాడు + ఎవడు = వాడేవడు',
    ),
    # Ikārasandhi (i-elision)
    ikarasandhi=dict(
        pattern=r'ి([aeiou])',
        replacement=r'\1',  # drop the 'ి' sign before a vowel
        example='తాటి + అంకం = తాటాంకం',
    ),
    # Akārasandhi (a-elision)
    akarasandhi=dict(
        pattern=r'([aeo])ా([aeiou])',
        replacement=r'\1\2',  # simplify the vowel sequence
        example='పాల + ఆవు = పాలావు',
    ),
)
|
|
274
|
+
|
|
275
|
+
# Sanskrit sandhi rules (for Tatsama words)
|
|
276
|
+
# Rule table for Sanskrit-origin (Tatsama) sandhi, same shape as the native
# table: regex pattern, replacement template, example.
# NOTE(review): the patterns are written with Latin letters while the
# replacements are Telugu vowel signs.
SANSKRIT_SANDHI = dict(
    # Savarṇadīrghās (vowel lengthening)
    savarnadirsha=dict(
        pattern=r'([a])([a])',
        replacement=r'ా',  # same vowel + same vowel = long vowel
        example='దేవ + ఆలయం = దేవాలయం',
    ),
    # Guṇas (vowel raising)
    gunasandhi=dict(
        pattern=r'([a])([iue])',
        replacement=r'ే\2',  # a + i/u/e = e
        example='మహా + ఇంద్ర = మహేంద్ర',
    ),
)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# Telugu independent vowel letters mapped to the dependent vowel sign used
# when that vowel follows a consonant.  Sandhi fuses the second word's
# initial vowel onto the first word, so the letter must become a sign.
_SANDHI_VOWEL_SIGNS = {
    '\u0c05': '',        # a  (inherent in the consonant, nothing is written)
    '\u0c06': '\u0c3e',  # aa
    '\u0c07': '\u0c3f',  # i
    '\u0c08': '\u0c40',  # ii
    '\u0c09': '\u0c41',  # u
    '\u0c0a': '\u0c42',  # uu
    '\u0c0e': '\u0c46',  # e
    '\u0c0f': '\u0c47',  # ee
    '\u0c10': '\u0c48',  # ai
    '\u0c12': '\u0c4a',  # o
    '\u0c13': '\u0c4b',  # oo
    '\u0c14': '\u0c4c',  # au
}

_SIGN_AA = '\u0c3e'
_SIGN_I, _SIGN_II = '\u0c3f', '\u0c40'
_SIGN_U, _SIGN_UU = '\u0c41', '\u0c42'


def _native_sandhi(word1: str, word2: str) -> str:
    """Ukarasandhi: a final 'u' sign is elided before a vowel-initial word."""
    if word1[-1] == _SIGN_U and word2[0] in _SANDHI_VOWEL_SIGNS:
        return word1[:-1] + _SANDHI_VOWEL_SIGNS[word2[0]] + word2[1:]
    return word1 + word2


def _sanskrit_sandhi(word1: str, word2: str) -> str:
    """Savarna-dirgha: two like vowels at the junction merge into the long one."""
    last, first = word1[-1], word2[0]

    if first in ('\u0c05', '\u0c06'):  # a / aa
        if '\u0c15' <= last <= '\u0c39':
            # Bare consonant = inherent short 'a'; lengthen it.
            return word1 + _SIGN_AA + word2[1:]
        if last == _SIGN_AA:
            # Already long: the second vowel is simply absorbed.
            return word1 + word2[1:]
    elif first in ('\u0c07', '\u0c08') and last in (_SIGN_I, _SIGN_II):  # i / ii
        return word1[:-1] + _SIGN_II + word2[1:]
    elif first in ('\u0c09', '\u0c0a') and last in (_SIGN_U, _SIGN_UU):  # u / uu
        return word1[:-1] + _SIGN_UU + word2[1:]

    return word1 + word2


def apply_sandhi(word1: str, word2: str, origin: str = 'native') -> str:
    """
    Join two Telugu-script words, applying sandhi at the word junction.

    Only the boundary between the two words is inspected; characters inside
    either word are never changed.

    Args:
        word1: First word
        word2: Second word
        origin: 'native' applies Ukarasandhi (final 'ు' dropped before a
            vowel-initial word). 'sanskrit' applies Savarna-dirgha (like
            a/i/u vowels merge into the long vowel). Any other value joins
            the words without sandhi.

    Returns:
        Combined word with sandhi applied; plain concatenation when no rule
        fits the junction or either word is empty.

    Examples:
        apply_sandhi('వాడు', 'ఎవడు', 'native') → 'వాడెవడు'
        apply_sandhi('దేవ', 'ఆలయం', 'sanskrit') → 'దేవాలయం'
    """
    if not word1 or not word2:
        return word1 + word2

    if origin == 'native':
        return _native_sandhi(word1, word2)
    if origin == 'sanskrit':
        return _sanskrit_sandhi(word1, word2)

    # Unknown origin: no sandhi
    return word1 + word2
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ============================================================================
|
|
337
|
+
# SECTION 5: VOWEL HARMONY
|
|
338
|
+
# ============================================================================
|
|
339
|
+
|
|
340
|
+
# Vowel classes
|
|
341
|
+
VOWEL_CLASSES = {
    'front': ['ఇ', 'ఈ', 'ఎ', 'ఏ', 'ఐ'],
    'back': ['అ', 'ఆ', 'ఉ', 'ఊ', 'ఒ', 'ఓ', 'ఔ'],
    'neutral': ['ర', 'ల', 'వ', 'య', 'న', 'మ', 'న్', 'ం']  # Consonants
}

# Dependent vowel signs (the marks written on a consonant) and the class of
# the vowel they stand for.  Inside a word vowels are almost always written
# with these signs rather than the independent letters in VOWEL_CLASSES, so
# a harmony check has to look at them as well.
_VOWEL_SIGN_CLASSES = {
    '\u0c3f': 'front',  # i
    '\u0c40': 'front',  # ii
    '\u0c46': 'front',  # e
    '\u0c47': 'front',  # ee
    '\u0c48': 'front',  # ai
    '\u0c3e': 'back',   # aa
    '\u0c41': 'back',   # u
    '\u0c42': 'back',   # uu
    '\u0c4a': 'back',   # o
    '\u0c4b': 'back',   # oo
    '\u0c4c': 'back',   # au
}


def _harmony_class(char: str) -> Optional[str]:
    """Return 'front' or 'back' for a vowel letter or vowel sign, else None."""
    sign_class = _VOWEL_SIGN_CLASSES.get(char)
    if sign_class is not None:
        return sign_class
    for vclass in ('front', 'back'):
        if char in VOWEL_CLASSES[vclass]:
            return vclass
    return None


def check_vowel_harmony(word: str) -> bool:
    """
    Check if word respects vowel harmony.

    Vowel harmony: every written vowel of the word belongs to the same
    class (front/back consistency). Both independent vowel letters and
    dependent vowel signs are taken into account; the inherent 'a' of a
    bare consonant is not written and therefore not counted.

    Args:
        word: Telugu word to check

    Returns:
        True if harmony is maintained (including when the word has no
        written vowel at all), False otherwise

    Example:
        check_vowel_harmony('నమస్తే') → True (its only written vowel is front)
        check_vowel_harmony('వేడుక') → False (mixed front/back)
    """
    classes_seen = set()
    for char in word:
        vclass = _harmony_class(char)
        if vclass is not None:
            classes_seen.add(vclass)

    # Zero or one class present means nothing clashes.
    return len(classes_seen) < 2
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def apply_vowel_harmony(base: str, suffix: str) -> str:
    """
    Adjust *suffix* so that it agrees with the dominant vowel class of *base*.

    Only the characters listed under 'front' and 'back' in VOWEL_CLASSES are
    counted when classifying *base*. When the most frequent class is 'front'
    (ties go to the class met first), the vowel letters 'ఆ', 'ఊ' and 'ఓ' in
    the suffix are replaced by 'ఇ', 'ఈ' and 'ఏ'; in every other case the
    suffix is returned untouched.

    Args:
        base: Base word (determines harmony class)
        suffix: Suffix to modify

    Returns:
        Harmonized suffix
    """
    from collections import Counter

    # Tally the non-neutral class of every classified character, in order.
    tally = Counter(
        vclass
        for char in base
        for vclass, members in VOWEL_CLASSES.items()
        if vclass != 'neutral' and char in members
    )

    if not tally:
        return suffix  # nothing in base to harmonize with

    dominant_class = tally.most_common(1)[0][0]
    if dominant_class != 'front':
        # Back-dominant base: the suffix is left as it is.
        return suffix

    # Front-dominant base: swap the back vowel letters for front ones.
    back_to_front = str.maketrans({'ఆ': 'ఇ', 'ఊ': 'ఈ', 'ఓ': 'ఏ'})
    return suffix.translate(back_to_front)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
# ============================================================================
|
|
422
|
+
# SECTION 6: PUBLIC API
|
|
423
|
+
# ============================================================================
|
|
424
|
+
|
|
425
|
+
# Names exported by `from telugu_engine.grammar import *`.
__all__ = [
    "conjugate_verb",
    "apply_case",
    "convert_svo_to_soV",
    "build_telugu_sentence",
    "apply_sandhi",
    "check_vowel_harmony",
    "apply_vowel_harmony",
]
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# ============================================================================
|
|
437
|
+
# SECTION 7: EXAMPLE USAGE
|
|
438
|
+
# ============================================================================
|
|
439
|
+
|
|
440
|
+
if __name__ == "__main__":
    # Demo run that prints one sample per feature.
    # NOTE: build_telugu_sentence uses a relative import, so run this as a
    # module of the package (python -m telugu_engine.grammar).
    rule = "=" * 70

    print("\n" + rule)
    print(" MODERN TELUGU GRAMMAR v3.0 - EXAMPLES")
    print(rule + "\n")

    # 1. Verb conjugation
    print("1. Modern Verb Conjugation:")
    did = conjugate_verb('cheyyu', 'past', '1ps')
    print(f" ' చేయు + past + 1ps' → {did}")
    ate = conjugate_verb('tinu', 'past', '3pp')
    print(f" ' తిను + past + 3pp' → {ate}")
    print(" (NOT: చేసితిని, తినిరి - old pattern)\n")

    # 2. Case system
    print("2. 4-Case System:")
    nominative = apply_case('రాము', 'nominative')
    print(f" 'రాము + nominative' → {nominative}")
    accusative = apply_case('పుస్తకం', 'accusative')
    print(f" 'పుస్తకం + accusative' → {accusative}")
    locative = apply_case('ఇల్లు', 'locative')
    print(f" 'ఇల్లు + locative' → {locative}\n")

    # 3. SOV conversion
    print("3. SOV Syntax Conversion:")
    svo = convert_svo_to_soV("Ramu reads book")
    print(f" 'Ramu reads book' → {svo}")
    built = build_telugu_sentence('Ramu', 'book', 'reads')
    print(f" Built sentence: {built}\n")

    # 4. Sandhi
    print("4. Sandhi Rules:")
    joined = apply_sandhi('వాడు', 'ఎవడు', 'native')
    print(f" 'వాడు + ఎవడు' → {joined}")
    print(" (Ukārasandhi: u-elision)\n")

    # 5. Vowel harmony
    print("5. Vowel Harmony:")
    first_check = check_vowel_harmony('నమస్తే')
    print(f" 'నమస్తే' → {first_check} (True - all back)")
    second_check = check_vowel_harmony('వేడుక')
    print(f" 'వేడుక' → {second_check} (False - mixed)")
    harmonized = apply_vowel_harmony('తిను', 'అను')
    print(f" 'తిను' + 'అను' → '{harmonized}'\n")

    print(rule + "\n")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Phonetic normalization rules for lightweight Telugu engine.
|
|
3
|
+
|
|
4
|
+
This module does NOT translate to Telugu script. It only normalizes
|
|
5
|
+
romanized inputs into a consistent, rule-based phonetic form before
|
|
6
|
+
tense building and transliteration.
|
|
7
|
+
|
|
8
|
+
Keep rules small and composable; avoid large tables.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from typing import Callable, Iterable, List, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# A normalization rule: compiled pattern plus the text that replaces a match.
Rule = Tuple[re.Pattern, str]


def _compile_rules() -> List[Rule]:
    """Build the ordered list of (compiled pattern, replacement) rules.

    Rules are applied one after another in list order, so longer clusters
    are listed ahead of the shorter sequences they contain. Entries that map
    a cluster to itself change nothing; they record that the cluster is kept.
    """
    ordered = (
        ("kri", "kri"),  # targeted rule from the spec: keep 'kri' together
        # Digraphs and clusters, normalized to a canonical spelling
        ("ksh", "ksh"),
        ("x", "ks"),
        ("shh", "sh"),
        ("sch", "sh"),
        ("sha", "sha"),
        ("shi", "shi"),
        ("shu", "shu"),
        ("sh", "sh"),
        ("chh", "ch"),
        ("cch", "ch"),
        # Aspirates are kept as they are
        ("ph", "ph"),
        ("th", "th"),
        ("dh", "dh"),
        ("kh", "kh"),
        ("gh", "gh"),
        ("bh", "bh"),
        # Long vowels
        ("aa", "aa"),
        ("ii|ee", "ii"),
        ("uu|oo", "uu"),
    )
    return [(re.compile(source), target) for source, target in ordered]


# Compiled once at import time and reused by every map_sound call.
_RULES = _compile_rules()


def map_sound(text: str) -> str:
    """
    Normalize romanized input to a canonical phonetic form.

    The text is trimmed and lowercased, each rule in `_RULES` is applied in
    order, and runs of whitespace are collapsed to a single space. This is a
    deliberately light pass; producing Telugu script is left to
    `transliterator.eng_to_telugu`.
    """
    normalized = text.strip().lower()

    if normalized:
        # Apply the simple replacements, one rule at a time.
        for pattern, substitute in _RULES:
            normalized = pattern.sub(substitute, normalized)

        # Collapse excessive spaces.
        normalized = re.sub(r"\s+", " ", normalized)

    return normalized


__all__ = ["map_sound"]
|
|
82
|
+
|