telugu-language-tools 5.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -0,0 +1,474 @@
1
+ """
2
+ Modern Telugu Grammar Engine v3.0
3
+ ==================================
4
+
5
+ This module provides modern Telugu grammar processing:
6
+ - Modern verb patterns (Past Participle + Person Marker)
7
+ - 4-case system (Nominative, Accusative, Dative, Locative)
8
+ - SOV syntax conversion
9
+ - Vowel harmony enforcement
10
+ - Sandhi rules
11
+
12
+ Usage:
13
+ from telugu_engine.grammar import apply_case, conjugate_verb
14
+ """
15
+
16
+ from typing import List, Dict, Optional
17
+ import re
18
+
19
+
20
+ # ============================================================================
21
+ # SECTION 1: MODERN VERB PATTERNS (v3.0 Critical)
22
+ # ============================================================================
23
+
24
# Person markers (v3.0 modern).
#
# Every marker is stored with a leading independent vowel letter (ఆ / అ).
# conjugate_verb() rewrites that letter as the matching dependent vowel sign
# when the marker is attached to a participle ending in a bare consonant.
PERSON_MARKERS = {
    # 1st person
    '1ps': 'ఆను',      # I (past)
    '1pp': 'ఆము',      # We (past)

    # 2nd person
    '2ps': 'ఆవు',      # You (informal, past)
    '2pp': 'ఆరు',      # You (formal/plural, past)

    # 3rd person
    '3ps': 'ఆడు',      # He/She/It (past)
    '3pp': 'ఆరు',      # They (past)
    '3pp_alt': 'అవి',  # They (alternative, neuter)
}

# Verb roots (examples), keyed by romanized root.
# NOTE(review): the English glosses for 'vaddu' and 'raavu' look doubtful;
# confirm them with a Telugu speaker before relying on these two entries.
VERB_ROOTS = {
    'cheyyu': 'చేయు',    # to do
    'tinu': 'తిను',      # to eat
    'vaddu': 'వడ్డు',     # to come
    'chaduvu': 'చదువు',  # to read
    'raavu': 'రావు',     # to be
}

# Past participles (ROOT + సిన)
# Modern pattern: చేయు + సిన = చేసిన (NOT చేసితి)
# NOTE(review): the 'raavu' entry is glossed "not came", i.e. it is a finite
# negative form rather than a participle, so a person marker appended to it
# is unlikely to be meaningful. TODO confirm the intended value.
PAST_PARTICIPLES = {
    'cheyyu': 'చేసిన',    # done
    'tinu': 'తిన్న',      # eaten
    'vaddu': 'వచ్చిన',    # came
    'chaduvu': 'చదివిన',  # read
    'raavu': 'రాలేదు',    # not came
}

# Independent vowel letter -> dependent vowel sign.  అ maps to the empty
# string because a bare consonant already carries the inherent "a".
_MARKER_VOWEL_SIGNS = {
    'అ': '', 'ఆ': 'ా', 'ఇ': 'ి', 'ఈ': 'ీ', 'ఉ': 'ు', 'ఊ': 'ూ',
    'ఎ': 'ె', 'ఏ': 'ే', 'ఐ': 'ై', 'ఒ': 'ొ', 'ఓ': 'ో', 'ఔ': 'ౌ',
}


def _attach_marker(participle: str, marker: str) -> str:
    """Join *marker* to *participle*, writing a leading vowel as a sign."""
    if not participle or not marker:
        return participle + marker

    sign = _MARKER_VOWEL_SIGNS.get(marker[0])
    # U+0C15..U+0C39 is the consonant range of the Telugu Unicode block.
    ends_in_consonant = '\u0c15' <= participle[-1] <= '\u0c39'
    if sign is None or not ends_in_consonant:
        # Nothing to fuse: keep the plain concatenation.
        return participle + marker

    return participle + sign + marker[1:]


def conjugate_verb(root: str, tense: str, person: str) -> str:
    """
    Conjugate verb using modern v3.0 pattern.

    Pattern: PAST PARTICIPLE + PERSON MARKER
    Examples:
        conjugate_verb('cheyyu', 'past', '1ps') → 'చేసినాను'
        conjugate_verb('tinu', 'past', '3pp') → 'తిన్నారు'

    OLD (WRONG) pattern: చేసితిని, చేసితిరి
    NEW (CORRECT) pattern: చేసినాను, చేసినారు

    Only the past tense is implemented.  Any other tense returns the Telugu
    root from VERB_ROOTS, or *root* itself when it is not in that table.

    Args:
        root: Verb root key (e.g., 'cheyyu').  A root missing from
            PAST_PARTICIPLES gets the regular 'ిన' ending appended, after a
            trailing 'ు' sign (if any) is dropped so that two vowel signs
            are never stacked.  This is an approximation, not a full
            morphological rule.
        tense: 'past', 'present', 'future'
        person: '1ps', '1pp', '2ps', '2pp', '3ps', '3pp' (or '3pp_alt').
            An unknown value adds no marker.

    Returns:
        Conjugated verb form
    """
    if tense != 'past':
        # Present/future are not implemented yet: hand back the bare root.
        return VERB_ROOTS.get(root, root)

    participle = PAST_PARTICIPLES.get(root)
    if participle is None:
        stem = root[:-1] if root.endswith('ు') else root
        participle = stem + 'ిన'

    marker = PERSON_MARKERS.get(person, '')

    # The markers start with an independent vowel letter; fuse it into the
    # final consonant of the participle instead of leaving it free-standing.
    return _attach_marker(participle, marker)
95
+
96
+
97
+ # ============================================================================
98
+ # SECTION 2: 4-CASE SYSTEM (v3.0 Modern)
99
+ # ============================================================================
100
+
101
# Suffix appended for each grammatical case.  The table holds five entries:
# the four cases named in the section title plus the genitive.
CASE_MARKERS = {
    'nominative': 'డు',     # subject
    'accusative': 'ను',     # direct object
    'dative': 'కు',         # indirect object
    'locative': 'లో',       # location
    'genitive': 'యొక్క',    # possession
}

# Suffix appended after the case marker for each formality level.
FORMALITY_MARKERS = {
    'informal': '',         # friends / family: nothing added
    'formal': 'గారు',       # respectful
    'honorific': 'వారు',    # very respectful
}


def apply_case(noun: str, case: str, formality: str = 'informal') -> str:
    """
    Append a case suffix, then a formality suffix, to a noun.

    The suffixes are concatenated as-is: no stem change, sandhi or vowel
    harmony is applied (still a TODO in this module).

    Args:
        noun: Base noun (e.g., 'రాము')
        case: Any key of CASE_MARKERS
        formality: 'informal', 'formal' or 'honorific'; an unknown value
            adds nothing

    Returns:
        noun + case suffix + formality suffix

    Raises:
        ValueError: if case is not a key of CASE_MARKERS

    Examples:
        apply_case('రాము', 'nominative') → 'రాముడు'
        apply_case('పుస్తకం', 'accusative') → 'పుస్తకంను'
        apply_case('ఇల్లు', 'locative') → 'ఇల్లులో'
    """
    case_suffix = CASE_MARKERS.get(case)
    if case_suffix is None:
        raise ValueError(f"Invalid case: {case}. Use: {list(CASE_MARKERS.keys())}")

    respect_suffix = FORMALITY_MARKERS.get(formality, '')
    return noun + case_suffix + respect_suffix
151
+
152
+
153
+ # ============================================================================
154
+ # SECTION 3: SOV SYNTAX CONVERSION (v3.0 Critical)
155
+ # ============================================================================
156
+
157
# Common English words to identify parts of speech
# NOTE(review): nothing in the visible part of this module reads this table;
# convert_svo_to_soV() below splits purely by word position.
POS_PATTERNS = {
    'pronouns': ['i', 'you', 'he', 'she', 'it', 'we', 'they'],
    'articles': ['a', 'an', 'the'],
    'prepositions': ['in', 'on', 'at', 'to', 'from', 'with', 'by'],
}


def convert_svo_to_soV(sentence: str) -> "Dict[str, str] | str":
    """
    Split an English SVO sentence into its subject, object and verb.

    The input is read positionally as Subject-Verb-Object: the first word is
    the subject, the second word is the verb, and every remaining word forms
    the object.  No translation or reordering happens here; the parts can be
    passed to build_telugu_sentence() to produce the SOV sentence.

    Examples:
        "Ramu reads book"
            → {'subject': 'Ramu', 'object': 'book', 'verb': 'reads'}
        "Ramu reads"
            → {'subject': 'Ramu', 'object': '', 'verb': 'reads'}

    Args:
        sentence: English sentence (e.g., "Ramu reads book")

    Returns:
        A dict with the keys 'subject', 'object' and 'verb'.  A sentence
        with fewer than two words is returned unchanged as a string.

    TODO: This is a simplified version. A real implementation would use
    POS tagging for better accuracy.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence

    # English is SVO, so the verb follows the subject directly and the
    # object is whatever comes after the verb (possibly nothing).
    subject, verb, *object_words = words

    return {
        'subject': subject,
        'object': ' '.join(object_words),
        'verb': verb,
    }
204
+
205
+
206
def build_telugu_sentence(subject: str, obj: str, verb: str) -> str:
    """
    Build Telugu sentence in SOV order from romanized parts.

    The subject and object are passed through the package transliterator and
    then given the nominative and accusative case markers.  The verb is
    conjugated (present tense, 3rd person singular) when it is a key of
    VERB_ROOTS; any other verb is transliterated as given.  Earlier versions
    ignored the verb argument and always emitted the root for 'chaduvu'.

    Args:
        subject: Subject (will get nominative case)
        obj: Object (will get accusative case); may be empty
        verb: Verb; a VERB_ROOTS key such as 'chaduvu' is conjugated,
            anything else is transliterated unchanged

    Returns:
        Subject, object (if any) and verb joined by single spaces

    Example:
        build_telugu_sentence('Ramu', 'pustakam', 'chaduvu')
        (the exact output depends on the transliterator)
    """
    # Kept as a function-level import, as in the original.
    from .transliterator import eng_to_telugu

    subject_telugu = apply_case(eng_to_telugu(subject), 'nominative')

    obj_telugu = eng_to_telugu(obj) if obj else ''
    if obj_telugu:
        obj_telugu = apply_case(obj_telugu, 'accusative')

    # TODO: Add proper tense/person detection
    if verb in VERB_ROOTS:
        verb_telugu = conjugate_verb(verb, 'present', '3ps')
    else:
        # Unknown verb: use the caller's word rather than a fixed verb.
        verb_telugu = eng_to_telugu(verb)

    # Build SOV sentence
    parts = [subject_telugu]
    if obj_telugu:
        parts.append(obj_telugu)
    parts.append(verb_telugu)

    return ' '.join(parts)
245
+
246
+
247
+ # ============================================================================
248
+ # SECTION 4: SANDHI RULES (Native Telugu)
249
+ # ============================================================================
250
+
251
# Sandhi rule tables.
#
# Each rule has a 'pattern' (a regex over Telugu script), a 'replacement'
# (a regex template) and an 'example'.  apply_sandhi() applies a rule only
# to the two-character junction "last character of word1 + first character
# of word2", and rewrites any independent vowel letter left in the expanded
# replacement as its dependent vowel sign.

# Independent vowel letter -> dependent vowel sign.  అ maps to the empty
# string because a bare consonant already carries the inherent "a".
_SANDHI_VOWEL_SIGNS = {
    'అ': '', 'ఆ': 'ా', 'ఇ': 'ి', 'ఈ': 'ీ', 'ఉ': 'ు', 'ఊ': 'ూ', 'ఋ': 'ృ',
    'ఎ': 'ె', 'ఏ': 'ే', 'ఐ': 'ై', 'ఒ': 'ొ', 'ఓ': 'ో', 'ఔ': 'ౌ',
}

# Native Telugu sandhi rules
NATIVE_SANDHI = {
    # Ukārasandhi (u-elision) - MOST FREQUENT in v3.0
    'ukarasandhi': {
        'pattern': r'ు([అఆఇఈఉఊఋఎఏఐఒఓఔ])',
        'replacement': r'\1',  # Drop 'ు'; the next vowel takes its place
        'example': 'వాడు + ఎవడు = వాడెవడు'
    },

    # Ikārasandhi (i-elision)
    'ikarasandhi': {
        'pattern': r'ి([అఆఇఈఉఊఋఎఏఐఒఓఔ])',
        'replacement': r'\1',  # Drop 'ి'; the next vowel takes its place
        'example': 'ఏమి + అంటివి = ఏమంటివి'
    },

    # Akārasandhi (a-elision): a bare consonant loses its inherent "a"
    'akarasandhi': {
        'pattern': r'([క-హ])([అఆఇఈఉఊఋఎఏఐఒఓఔ])',
        'replacement': r'\1\2',  # Consonant keeps only the following vowel
        'example': 'పాల + ఆవు = పాలావు'
    }
}

# Sanskrit sandhi rules (for Tatsama words)
SANSKRIT_SANDHI = {
    # Savarṇadīrgha (vowel lengthening); only the a/ā + a/ā case is covered
    'savarnadirsha': {
        'pattern': r'(?<=[క-హ])[అఆ]|ా[అఆ]',
        'replacement': r'ా',  # a/ā + a/ā = ā
        'example': 'దేవ + ఆలయం = దేవాలయం'
    },

    # Guṇa (vowel raising); only the a/ā + i/ī case is covered
    'gunasandhi': {
        'pattern': r'(?<=[క-హ])[ఇఈ]|ా[ఇఈ]',
        'replacement': r'ే',  # a/ā + i/ī = ē
        'example': 'మహా + ఇంద్ర = మహేంద్ర'
    }
}


def _fuse_junction(word1: str, word2: str, rule: Dict[str, str]) -> Optional[str]:
    """Apply one sandhi rule at the word boundary; None if it does not fire."""
    junction = word1[-1] + word2[0]

    def _as_signs(match):
        # Expand the template, then write leftover vowel letters as signs
        # so they attach to the preceding consonant.
        expanded = match.expand(rule['replacement'])
        return ''.join(_SANDHI_VOWEL_SIGNS.get(ch, ch) for ch in expanded)

    fused, hits = re.subn(rule['pattern'], _as_signs, junction, count=1)
    if not hits:
        return None
    return word1[:-1] + fused + word2[1:]


def apply_sandhi(word1: str, word2: str, origin: str = 'native') -> str:
    """
    Apply sandhi rules between two words.

    Rules are applied only where the two words meet, never inside either
    word.  For 'native' only Ukārasandhi is applied; for 'sanskrit'
    Savarṇadīrgha is tried first and Guṇa second.  If no rule fires, or
    either word is empty, the words are simply concatenated.

    Args:
        word1: First word
        word2: Second word
        origin: 'native' for Telugu words, 'sanskrit' for Sanskrit words;
            any other value disables sandhi

    Returns:
        Combined word with sandhi applied

    Examples:
        apply_sandhi('వాడు', 'ఎవడు', 'native') → 'వాడెవడు'
        apply_sandhi('దేవ', 'ఆలయం', 'sanskrit') → 'దేవాలయం'
        apply_sandhi('మహా', 'ఇంద్ర', 'sanskrit') → 'మహేంద్ర'
    """
    if not word1 or not word2:
        return word1 + word2

    if origin == 'native':
        rules = [NATIVE_SANDHI['ukarasandhi']]
    elif origin == 'sanskrit':
        rules = [
            SANSKRIT_SANDHI['savarnadirsha'],
            SANSKRIT_SANDHI['gunasandhi'],
        ]
    else:
        # No sandhi
        rules = []

    for rule in rules:
        joined = _fuse_junction(word1, word2, rule)
        if joined is not None:
            return joined

    return word1 + word2
334
+
335
+
336
+ # ============================================================================
337
+ # SECTION 5: VOWEL HARMONY
338
+ # ============================================================================
339
+
340
# Vowel classes (independent vowel letters)
VOWEL_CLASSES = {
    'front': ['ఇ', 'ఈ', 'ఎ', 'ఏ', 'ఐ'],
    'back': ['అ', 'ఆ', 'ఉ', 'ఊ', 'ఒ', 'ఓ', 'ఔ'],
    'neutral': ['ర', 'ల', 'వ', 'య', 'న', 'మ', 'న్', 'ం']  # Consonants
}

# Dependent vowel signs, grouped like VOWEL_CLASSES.  Inside a word a vowel
# that follows a consonant is written as one of these signs, not as an
# independent letter, so a harmony check has to look at them too.
_VOWEL_SIGN_CLASSES = {
    'front': ['ి', 'ీ', 'ె', 'ే', 'ై'],
    'back': ['ా', 'ు', 'ూ', 'ొ', 'ో', 'ౌ'],
}

_FRONT_VOWELS = frozenset(VOWEL_CLASSES['front']) | frozenset(_VOWEL_SIGN_CLASSES['front'])
_BACK_VOWELS = frozenset(VOWEL_CLASSES['back']) | frozenset(_VOWEL_SIGN_CLASSES['back'])


def check_vowel_harmony(word: str) -> bool:
    """
    Check if word respects vowel harmony.

    Vowel harmony here means front/back consistency: a word passes when its
    written vowels (independent letters and dependent vowel signs) are all
    front or all back.  The inherent "a" of a bare consonant is not written
    and is therefore not counted.

    Args:
        word: Telugu word to check

    Returns:
        True if harmony is maintained (including a word with no written
        vowel at all), False otherwise

    Example:
        check_vowel_harmony('నమస్తే') → True (only written vowel is front)
        check_vowel_harmony('వేడుక') → False (mixed front/back)
    """
    has_front = any(char in _FRONT_VOWELS for char in word)
    has_back = any(char in _BACK_VOWELS for char in word)

    # If both front and back vowels present, harmony broken
    return not (has_front and has_back)
380
+
381
+
382
def apply_vowel_harmony(base: str, suffix: str) -> str:
    """
    Return suffix adjusted to the dominant vowel class of base.

    Every character of base that appears in the 'front' or 'back' list of
    VOWEL_CLASSES is tallied.  When 'front' is the most common class, the
    letters ఆ, ఊ and ఓ in the suffix are replaced by ఇ, ఈ and ఏ.  In every
    other case (no classified vowel, or 'back' dominant) the suffix is
    returned unchanged.

    Args:
        base: Base word (determines harmony class)
        suffix: Suffix to modify

    Returns:
        Harmonized suffix
    """
    from collections import Counter

    # Tally class labels in order of appearance; 'neutral' is ignored.
    tally = Counter(
        label
        for char in base
        for label, members in VOWEL_CLASSES.items()
        if label != 'neutral' and char in members
    )
    if not tally:
        return suffix  # No vowels in base

    leading_class = tally.most_common(1)[0][0]
    if leading_class != 'front':
        # Keep as is (already back or neutral)
        return suffix

    # Convert back vowels to front in suffix
    return suffix.replace('ఆ', 'ఇ').replace('ఊ', 'ఈ').replace('ఓ', 'ఏ')
419
+
420
+
421
+ # ============================================================================
422
+ # SECTION 6: PUBLIC API
423
+ # ============================================================================
424
+
425
# Names exported by `from <module> import *`.
__all__ = [
    'conjugate_verb',
    'apply_case',
    'convert_svo_to_soV',
    'build_telugu_sentence',
    'apply_sandhi',
    'check_vowel_harmony',
    'apply_vowel_harmony',
]


# ============================================================================
# SECTION 7: EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    # Demo run: each value is computed right before it is printed, so output
    # appears in the same order as the calls are made.
    rule_line = "=" * 70

    print("\n" + rule_line)
    print(" MODERN TELUGU GRAMMAR v3.0 - EXAMPLES")
    print(rule_line + "\n")

    # 1. Verb conjugation
    print("1. Modern Verb Conjugation:")
    did_1ps = conjugate_verb('cheyyu', 'past', '1ps')
    print(f" ' చేయు + past + 1ps' → {did_1ps}")
    ate_3pp = conjugate_verb('tinu', 'past', '3pp')
    print(f" ' తిను + past + 3pp' → {ate_3pp}")
    print(" (NOT: చేసితిని, తినిరి - old pattern)\n")

    # 2. Case system
    print("2. 4-Case System:")
    nominative_form = apply_case('రాము', 'nominative')
    print(f" 'రాము + nominative' → {nominative_form}")
    accusative_form = apply_case('పుస్తకం', 'accusative')
    print(f" 'పుస్తకం + accusative' → {accusative_form}")
    locative_form = apply_case('ఇల్లు', 'locative')
    print(f" 'ఇల్లు + locative' → {locative_form}\n")

    # 3. SOV conversion
    print("3. SOV Syntax Conversion:")
    svo = convert_svo_to_soV("Ramu reads book")
    print(f" 'Ramu reads book' → {svo}")
    built = build_telugu_sentence('Ramu', 'book', 'reads')
    print(f" Built sentence: {built}\n")

    # 4. Sandhi
    print("4. Sandhi Rules:")
    fused = apply_sandhi('వాడు', 'ఎవడు', 'native')
    print(f" 'వాడు + ఎవడు' → {fused}")
    print(" (Ukārasandhi: u-elision)\n")

    # 5. Vowel harmony
    print("5. Vowel Harmony:")
    first_check = check_vowel_harmony('నమస్తే')
    print(f" 'నమస్తే' → {first_check} (True - all back)")
    second_check = check_vowel_harmony('వేడుక')
    print(f" 'వేడుక' → {second_check} (False - mixed)")
    harmonized = apply_vowel_harmony('తిను', 'అను')
    print(f" 'తిను' + 'అను' → '{harmonized}'\n")

    print(rule_line + "\n")
@@ -0,0 +1,82 @@
1
+ """
2
+ Phonetic normalization rules for lightweight Telugu engine.
3
+
4
+ This module does NOT translate to Telugu script. It only normalizes
5
+ romanized inputs into a consistent, rule-based phonetic form before
6
+ tense building and transliteration.
7
+
8
+ Keep rules small and composable; avoid large tables.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from typing import Callable, Iterable, List, Tuple
15
+
16
+
17
# A rule pairs a compiled pattern with the text that replaces each match.
Rule = Tuple[re.Pattern, str]


def _compile_rules() -> List[Rule]:
    """Return the ordered substitution rules used by map_sound().

    Rules are applied one after another in list order, each over the whole
    string.  Many entries replace a cluster with itself and so leave the
    text unchanged; they are kept so the returned table is the same as
    before.
    """
    table = (
        # Targeted rule from the spec: keep 'kri' together (runs first).
        ("kri", "kri"),
        # Common digraphs and clusters, longer spellings listed first.
        ("ksh", "ksh"),
        ("x", "ks"),
        ("shh", "sh"),
        ("sch", "sh"),
        ("sha", "sha"),
        ("shi", "shi"),
        ("shu", "shu"),
        ("sh", "sh"),
        ("chh", "ch"),
        ("cch", "ch"),
        ("ph", "ph"),  # keep aspirates
        ("th", "th"),
        ("dh", "dh"),
        ("kh", "kh"),
        ("gh", "gh"),
        ("bh", "bh"),
        # Long vowels: fold alternative spellings onto one form.
        ("aa", "aa"),
        ("ii|ee", "ii"),
        ("uu|oo", "uu"),
    )
    return [(re.compile(source), target) for source, target in table]


# Built once at import time and reused by every map_sound() call.
_RULES = _compile_rules()


def map_sound(text: str) -> str:
    """
    Normalize romanized input to a canonical phonetic form.

    The text is stripped and lower-cased, every rule in _RULES is applied in
    order, and runs of whitespace are collapsed to a single space.  Input
    that is empty after stripping is returned as the empty string.  No
    Telugu script is produced here; per the module docstring that is the job
    of `transliterator.eng_to_telugu`.
    """
    normalized = text.strip().lower()
    if normalized:
        for matcher, substitute in _RULES:
            normalized = matcher.sub(substitute, normalized)
        # collapse excessive spaces
        normalized = re.sub(r"\s+", " ", normalized)
    return normalized


__all__ = ["map_sound"]
82
+