telugu-language-tools 4.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,568 @@
1
+ """
2
+ Context-Aware Transliteration Rules
3
+ ====================================
4
+
5
+ Intelligent disambiguation based on phonetic context, position, and patterns.
6
+
7
+ Features:
8
+ - Context-aware nasal selection (5 types: ఙ, ఞ, ణ, న, మ)
9
+ - Vowel length disambiguation (short vs long)
10
+ - Retroflex vs dental selection
11
+ - Anusvara intelligent placement
12
+ - Schwa deletion rules
13
+
14
+ Usage:
15
+ from telugu_lib.context_rules import ContextualTransliterator
16
+ trans = ContextualTransliterator()
17
+ result = trans.transliterate("samskara")
18
+ """
19
+
20
+ from typing import Optional, Tuple, List
21
+ from .iso15919_mappings import get_iso_consonants, get_iso_vowels, get_articulation_class
22
+
23
+
24
+ # ============================================================================
25
+ # CONTEXT-AWARE NASAL SELECTION
26
+ # ============================================================================
27
+
28
class NasalSelector:
    """
    Select appropriate nasal consonant based on phonetic context.

    Telugu has 5 nasal types:
    - ఙ (ṅa) - Velar nasal (after k, g)
    - ఞ (ña) - Palatal nasal (after ch, j)
    - ణ (ṇa) - Retroflex nasal (after ṭ, ḍ)
    - న (na) - Dental nasal (default, after t, d)
    - ం (ṁ) - Anusvara (before consonants)

    The labial nasal మ (ma) is also produced: after a labial consonant or
    for an explicit "m"/"M" input.
    """

    # Roman spellings that name one specific nasal. Consulted only after the
    # context rules, so context still takes precedence over the spelling.
    _EXPLICIT_NASALS = {
        "ṅ": "ఙ",
        "ñ": "ఞ",
        "ṇ": "ణ",
        "N": "ణ",
        "m": "మ",
        "M": "మ",
    }

    def __init__(self):
        # Consonant table from the ISO 15919 mappings; used here only for
        # membership tests ("is the next character a consonant?").
        self.consonants = get_iso_consonants("mixed")

        # Articulation classes
        self.VELAR = ["k", "kh", "g", "gh"]
        self.PALATAL = ["c", "ch", "chh", "j", "jh", "ś", "sh"]
        self.RETROFLEX = ["ṭ", "ṭh", "ḍ", "ḍh", "T", "Th", "D", "Dh", "ṣ", "S"]
        self.DENTAL = ["t", "th", "d", "dh", "s"]
        self.LABIAL = ["p", "ph", "b", "bh", "m", "v", "w"]

    def select_nasal(self, prev_char: Optional[str], next_char: Optional[str],
                     nasal_input: str) -> str:
        """
        Select appropriate nasal based on context.

        Rules are tried in order; the first one that applies wins:
        anusvara before a consonant, then the class of the previous
        consonant, then the explicit spelling of the nasal, then dental న.

        Args:
            prev_char: Previous consonant (if any)
            next_char: Following character (if any)
            nasal_input: Original nasal input (n, m, N, ṇ, ṅ, ñ, ...)

        Returns:
            Appropriate Telugu nasal character
        """
        # Rule 1: nasal directly before a consonant → anusvara (ం)
        if next_char and next_char in self.consonants:
            return "ం"

        # Rules 2-5: assimilate to the articulation class of the previous
        # consonant (velar, palatal, retroflex, labial).
        if prev_char in self.VELAR:
            return "ఙ"  # ṅa
        if prev_char in self.PALATAL:
            return "ఞ"  # ña
        if prev_char in self.RETROFLEX:
            return "ణ"  # ṇa
        if prev_char in self.LABIAL:
            return "మ"  # ma

        # Rules 6-7: explicit spelling. Besides ṇ/N and m/M this also covers
        # ṅ and ñ, which previously fell through to the dental default.
        # Default: dental nasal (most common).
        return self._EXPLICIT_NASALS.get(nasal_input, "న")
94
+
95
+
96
+ # ============================================================================
97
+ # VOWEL LENGTH DISAMBIGUATION
98
+ # ============================================================================
99
+
100
class VowelDisambiguator:
    """
    Determine if vowel should be short or long based on context.

    Handles ambiguous cases:
    - "o" → ఒ (short) or ఓ (long)?
    - "e" → ఎ (short) or ఏ (long)?

    Every method returns the *independent* vowel letter (ఒ/ఓ/ఎ/ఏ), never a
    dependent vowel sign.
    """

    SHORT_O = "ఒ"
    LONG_O = "ఓ"
    SHORT_E = "ఎ"
    LONG_E = "ఏ"

    # For each ambiguous Roman vowel: every Telugu code point that spells it
    # (independent letter or dependent sign) → the independent letter of the
    # same length. Used to read the vowel length out of a known spelling.
    _VOWEL_FORMS = {
        'o': {
            "\u0c12": SHORT_O,  # ఒ independent short o
            "\u0c4a": SHORT_O,  # vowel sign short o
            "\u0c13": LONG_O,   # ఓ independent long o
            "\u0c4b": LONG_O,   # vowel sign long o
        },
        'e': {
            "\u0c0e": SHORT_E,  # ఎ independent short e
            "\u0c46": SHORT_E,  # vowel sign short e
            "\u0c0f": LONG_E,   # ఏ independent long e
            "\u0c47": LONG_E,   # vowel sign long e
        },
    }

    def __init__(self):
        # Known words with specific vowel lengths
        self.KNOWN_WORDS = {
            # Short O
            "kotta": "కొత్త",   # new (short o)
            "kotha": "కొత్త",   # new
            "pota": "పొట",     # cover (short o)
            "rotta": "రొట్టె",   # bread (short o)

            # Long O
            "kota": "కోట",     # fort (long o)
            "thota": "తోట",    # garden (long o)
            "bota": "బోట",     # boat (long o)
            "rota": "రోట",     # rotation (long o)
            "mango": "మాంగో",  # mango (long o)

            # Short E
            "cheta": "చెట",    # tree (short e)
            "peta": "పెట",     # box (short e)
            "chetu": "చెట్టు",   # tree (short e)

            # Long E
            "prema": "ప్రేమ",   # love (long e)
            "deva": "దేవ",     # god (long e)
            "seva": "సేవ",     # service (long e) - key must be plain ASCII
            "kela": "కేళ",     # banana (long e)
        }

    def disambiguate_vowel(self, word: str, position: int, vowel: str) -> str:
        """
        Determine correct vowel length based on context.

        Args:
            word: Complete word being transliterated
            position: Position of vowel in word
            vowel: Ambiguous vowel ('o' or 'e')

        Returns:
            Appropriate short or long independent vowel
        """
        # Check dictionary first; fall back to the heuristics when the known
        # spelling does not contain a matching vowel.
        word_lower = word.lower()
        if word_lower in self.KNOWN_WORDS:
            known = self._extract_vowel_from_known(word_lower, position, vowel)
            if known is not None:
                return known

        # Heuristic rules
        if vowel == 'o':
            return self._disambiguate_o(word, position)
        if vowel == 'e':
            return self._disambiguate_e(word, position)

        # Any other input: short e (same fallback as before)
        return self.SHORT_E

    def _disambiguate_o(self, word: str, position: int) -> str:
        """Disambiguate 'o' → ఒ or ఓ"""
        length = len(word)

        # Rule 1: Word-final 'o' → usually long
        if position == length - 1:
            return self.LONG_O

        # Rule 2: Before single consonant + vowel → usually long
        if position < length - 2:
            following = word[position + 1]
            if (following.isalpha() and following not in 'aeiou'
                    and word[position + 2] in 'aeiou'):
                return self.LONG_O

        # Rule 3: Directly before another vowel (open syllable) → long
        if position < length - 1 and word[position + 1] in 'aeiou':
            return self.LONG_O

        # Rule 4: Before double consonant → short
        if position < length - 2 and word[position + 1] == word[position + 2]:
            return self.SHORT_O

        # Rule 5: In loanwords (has 'o' in English) → usually long
        english_patterns = ['photo', 'video', 'radio', 'mobile', 'logo']
        lowered = word.lower()
        if any(pattern in lowered for pattern in english_patterns):
            return self.LONG_O

        # Default: Short
        return self.SHORT_O

    def _disambiguate_e(self, word: str, position: int) -> str:
        """Disambiguate 'e' → ఎ or ఏ"""
        length = len(word)

        # Rule 1: Word-final 'e' → usually short
        if position == length - 1:
            return self.SHORT_E

        # Rule 2: Followed by r/m/v plus a or e → usually long
        if position < length - 1:
            next_chars = word[position + 1:position + 3]
            if next_chars in ['ra', 're', 'ma', 'me', 'va', 've']:
                return self.LONG_E

        # Rule 3: In stressed syllable → long
        # (Approximate: if followed by single consonant + vowel)
        if position < length - 2:
            following = word[position + 1]
            if (following.isalpha() and following not in 'aeiou'
                    and word[position + 2] in 'aeiou'):
                return self.LONG_E

        # Rule 4 (before double consonant) and the default both give short e
        return self.SHORT_E

    def _extract_vowel_from_known(self, word: str, position: int,
                                  vowel: Optional[str] = None) -> Optional[str]:
        """
        Look up the length of one o/e vowel in a known word.

        Roman and Telugu strings do not share indices (Telugu spells vowels
        as signs attached to consonants and uses virama for clusters), so the
        vowel is matched by ordinal: the k-th 'o' of the Roman word maps to
        the k-th o-type letter/sign of the Telugu spelling (same for 'e').

        Args:
            word: Lower-cased Roman key into KNOWN_WORDS
            position: Index of the vowel in ``word``
            vowel: 'o' or 'e'; taken from ``word[position]`` when omitted

        Returns:
            ఒ/ఓ/ఎ/ఏ, or None if the word is unknown, the position is out of
            range, or the Telugu spelling has no matching vowel.
        """
        telugu_word = self.KNOWN_WORDS.get(word)
        if not telugu_word or position < 0:
            return None

        if vowel is None:
            if position >= len(word):
                return None
            vowel = word[position]

        forms = self._VOWEL_FORMS.get(vowel)
        if forms is None:
            return None

        ordinal = word[:position].count(vowel)
        lengths = [forms[ch] for ch in telugu_word if ch in forms]
        if ordinal < len(lengths):
            return lengths[ordinal]
        return None
231
+
232
+
233
+ # ============================================================================
234
+ # RETROFLEX VS DENTAL DISAMBIGUATION
235
+ # ============================================================================
236
+
237
class RetroflexSelector:
    """
    Determine if consonant should be dental or retroflex.

    Handles:
    - t → త (dental) or ట (retroflex)?
    - d → ద (dental) or డ (retroflex)?
    - n → న (dental) or ణ (retroflex)?
    """

    # Spelled as escapes on purpose: Telugu ట (U+0C1F) is easy to confuse
    # with the look-alike Tamil letter U+0B9F, which must never be emitted.
    _DENTAL_T = "\u0c24"     # త
    _RETROFLEX_T = "\u0c1f"  # ట

    def __init__(self):
        # Patterns that indicate retroflex
        self.RETROFLEX_CONTEXTS = {
            # After these sounds → usually retroflex
            'after_r': ['r', 'R', 'ṛ'],
            'after_long_vowel': ['ā', 'ī', 'ū', 'ē', 'ō', 'A', 'I', 'U', 'E', 'O', 'aa', 'ii', 'uu', 'ee', 'oo'],
            # Word patterns
            'retroflex_words': ['kaTa', 'paNDu', 'koTi', 'gaDDa', 'vaDDu'],
        }

    def select_t_variant(self, word: str, position: int, explicit: Optional[str] = None) -> str:
        """
        Select between త (dental t) and ట (retroflex ṭ).

        Args:
            word: Complete word
            position: Position of 't' in word
            explicit: Explicit marker ('t' or 'T')

        Returns:
            Telugu consonant (త or ట)
        """
        # Explicit retroflex marker
        if explicit in ['T', 'ṭ']:
            return self._RETROFLEX_T

        # Rule 1: After 'r' → usually retroflex
        if position > 0 and word[position - 1] in ['r', 'R']:
            return self._RETROFLEX_T

        # Rule 2: After long vowel → often retroflex
        # (doubled lowercase vowel, or a single capital vowel letter)
        if position > 1:
            prev_two = word[position - 2:position]
            if prev_two in ['aa', 'ii', 'uu', 'ee', 'oo'] or prev_two[-1] in ['A', 'I', 'U', 'E', 'O']:
                return self._RETROFLEX_T

        # Rule 3: Word-initial → usually dental
        if position == 0:
            return self._DENTAL_T

        # Rule 4: Before 'r' in cluster 'tr' → usually dental in English loanwords
        if position < len(word) - 1 and word[position + 1] == 'r':
            return self._DENTAL_T  # "train" → త్రైన్, not ట్రైన్

        # Rule 5: Double 'tt' → dental
        if position > 0 and word[position - 1] == 't':
            return self._DENTAL_T
        if position < len(word) - 1 and word[position + 1] == 't':
            return self._DENTAL_T

        # Default: Dental (more common)
        return self._DENTAL_T

    def select_d_variant(self, word: str, position: int, explicit: Optional[str] = None) -> str:
        """
        Select between ద (dental d) and డ (retroflex ḍ).

        Args:
            word: Complete word
            position: Position of 'd' in word
            explicit: Explicit marker ('D' or 'ḍ' forces retroflex)

        Returns:
            Telugu consonant (ద or డ)
        """
        if explicit in ['D', 'ḍ']:
            return "డ"

        # Same "after r" rule as select_t_variant
        if position > 0 and word[position - 1] in ['r', 'R']:
            return "డ"

        # Word-initial and every remaining case: dental
        return "ద"

    def select_n_variant(self, word: str, position: int, explicit: Optional[str] = None) -> str:
        """
        Select between న (dental n) and ణ (retroflex ṇ).

        Args:
            word: Complete word
            position: Position of 'n' in word
            explicit: Explicit marker ('N' or 'ṇ' forces retroflex)

        Returns:
            Telugu consonant (న or ణ)
        """
        if explicit in ['N', 'ṇ']:
            return "ణ"

        # After retroflex consonants → retroflex nasal.
        # Only one character is inspected, so the two-letter entries in the
        # list can never match; they are kept for reference.
        if position > 0:
            prev = word[position - 1]
            if prev in ['ṭ', 'ṭh', 'ḍ', 'ḍh', 'T', 'D', 'ṣ', 'S', 'ḷ', 'L']:
                return "ణ"

        # After a/i/u followed by 'r' (e.g. "...arn...")
        if position > 1:
            if word[position - 1] in ['r', 'R'] and word[position - 2] in ['a', 'i', 'u']:
                return "ణ"

        # Default: Dental
        return "న"
336
+
337
+
338
+ # ============================================================================
339
+ # SCHWA DELETION AND VIRAMA PLACEMENT
340
+ # ============================================================================
341
+
342
class SchwaHandler:
    """
    Handle inherent vowel (schwa) deletion.

    Determines when to add virama (్) to suppress inherent 'a'.
    """

    def should_suppress_schwa(self, word: str, position: int,
                              next_char: Optional[str] = None) -> bool:
        """
        Determine if inherent 'a' should be suppressed with virama.

        Args:
            word: Complete word
            position: Position of consonant
            next_char: Next character in transliteration

        Returns:
            True if virama should be added
        """
        # Rule 1: Consonant cluster → suppress on all but last
        if next_char and self._is_consonant(next_char):
            return True

        last = len(word) - 1

        # Rule 2: Word-final consonant. Suppress only in Sanskrit loanwords,
        # which often end in consonants; native words typically end in vowels.
        if position == last:
            return self._is_sanskrit_loanword(word)

        if position < last:
            following = word[position + 1]

            # Rule 3: Before explicit vowel marker
            if following in 'aāiīuūeēoōṛṝḷṝ':
                return True

            # Rule 4: Consonant + consonant at word end,
            # e.g. "film" → ఫిల్మ్ (suppress after l before m)
            if position == last - 1 and self._is_consonant(following):
                return True

        # Default: Don't suppress
        return False

    def _is_consonant(self, char: str) -> bool:
        """Check if character is a consonant (case-insensitive)."""
        consonants = "bcdfghjklmnpqrstvwxyzṭḍṇśṣṅñḷṟ"
        # Guard the empty string: "" is a substring of every string and would
        # otherwise be reported as a consonant.
        return bool(char) and char.lower() in consonants

    def _is_sanskrit_loanword(self, word: str) -> bool:
        """
        Heuristic to detect Sanskrit loanwords.

        A word qualifies if it contains one of the special clusters anywhere,
        or if it *ends* in an anusvara/visarga-style ending.
        """
        clusters = ('ksh', 'jn', 'shr')  # Special clusters, anywhere in word
        endings = ('am', 'ah')           # Anusvara/visarga endings, word-final only

        word_lower = word.lower()
        if word_lower.endswith(endings):
            return True
        return any(cluster in word_lower for cluster in clusters)
413
+
414
+
415
+ # ============================================================================
416
+ # CONTEXTUAL TRANSLITERATOR (MAIN CLASS)
417
+ # ============================================================================
418
+
419
class ContextualTransliterator:
    """
    Main transliterator with context-aware intelligence.

    Combines all context rules for optimal accuracy.
    """

    def __init__(self):
        self.nasal_selector = NasalSelector()
        self.vowel_disambiguator = VowelDisambiguator()
        self.retroflex_selector = RetroflexSelector()
        self.schwa_handler = SchwaHandler()

        # Roman → Telugu lookup tables from the ISO 15919 mappings
        self.consonants = get_iso_consonants("mixed")
        self.vowels = get_iso_vowels("mixed")

    def transliterate(self, text: str) -> str:
        """
        Transliterate with context-aware rules.

        The text is processed one character at a time. ``prev_consonant``
        holds the consonant immediately before the current character, or
        None when the previous character was a vowel or was not found in
        either lookup table.

        Args:
            text: Roman text to transliterate

        Returns:
            Telugu text with context-aware disambiguation
        """
        if not text:
            return ""

        result = []
        prev_consonant = None
        last_index = len(text) - 1

        for i, current in enumerate(text):
            next_char = text[i + 1] if i < last_index else None

            # Handle nasal with context.
            # prev_consonant is deliberately left unchanged here.
            if current in ['n', 'm', 'N', 'ṇ', 'ṅ', 'ñ']:
                result.append(
                    self.nasal_selector.select_nasal(prev_consonant, next_char, current)
                )
                continue

            # Handle ambiguous t with retroflex selection
            if current == 't':
                result.append(self.retroflex_selector.select_t_variant(text, i, current))
                prev_consonant = current
                continue

            # Handle ambiguous vowels (word-initial or right after a consonant)
            if current in ['o', 'e'] and (i == 0 or prev_consonant):
                result.append(
                    self.vowel_disambiguator.disambiguate_vowel(text, i, current)
                )
                # A vowel ends the consonant context, exactly as in the
                # default vowel branch below; otherwise a later nasal would
                # assimilate to a consonant that is no longer adjacent.
                prev_consonant = None
                continue

            # Default character handling
            if current in self.consonants:
                result.append(self.consonants[current])
                prev_consonant = current
            elif current in self.vowels:
                result.append(self.vowels[current])
                prev_consonant = None
            else:
                # Unmapped characters (spaces, punctuation, digits) pass
                # through and also break the consonant context, so state does
                # not leak across word boundaries.
                result.append(current)
                prev_consonant = None

        return ''.join(result)
493
+
494
+
495
+ # ============================================================================
496
+ # CONVENIENCE FUNCTIONS
497
+ # ============================================================================
498
+
499
def transliterate_with_context(text: str) -> str:
    """
    One-shot helper: build a ContextualTransliterator and run it on ``text``.

    A new transliterator is constructed on every call.

    Usage:
        from telugu_lib.context_rules import transliterate_with_context
        result = transliterate_with_context("samskara")
    """
    return ContextualTransliterator().transliterate(text)
509
+
510
+
511
+ # ============================================================================
512
+ # TESTING
513
+ # ============================================================================
514
+
515
if __name__ == "__main__":
    # Manual smoke test: prints a pass/fail mark per case, asserts nothing.

    # --- Nasal selection -------------------------------------------------
    print("=" * 70)
    print("CONTEXT-AWARE NASAL SELECTION TESTING")
    print("=" * 70)

    nasal_picker = NasalSelector()
    # (previous consonant, next character, nasal typed, expected Telugu)
    nasal_cases = (
        ("k", "a", "n", "ఙ"),   # After velar → velar nasal
        ("j", "a", "n", "ఞ"),   # After palatal → palatal nasal
        ("T", "a", "n", "ణ"),   # After retroflex → retroflex nasal
        ("t", "a", "n", "న"),   # After dental → dental nasal
        (None, "k", "n", "ం"),  # Before consonant → anusvara
    )
    for before, after, typed, want in nasal_cases:
        got = nasal_picker.select_nasal(before, after, typed)
        mark = "✅" if got == want else "❌"
        print(f"{mark} prev={before}, next={after}, nasal={typed} → {got} (expected {want})")

    # --- Vowel length ----------------------------------------------------
    print("\n" + "=" * 70)
    print("VOWEL LENGTH DISAMBIGUATION TESTING")
    print("=" * 70)

    length_picker = VowelDisambiguator()
    # (word, index of the vowel, vowel, expected Telugu vowel)
    length_cases = (
        ("mango", 4, "o", "ఓ"),  # Word-final o → long
        ("kotta", 1, "o", "ఒ"),  # Before double consonant → short
        ("kota", 1, "o", "ఓ"),   # Known word → long
        ("prema", 2, "e", "ఏ"),  # Known word → long
    )
    for sample, index, letter, want in length_cases:
        got = length_picker.disambiguate_vowel(sample, index, letter)
        mark = "✅" if got == want else "❌"
        print(f"{mark} {sample}[{index}] '{letter}' → {got} (expected {want})")

    # --- Whole-word transliteration ---------------------------------------
    print("\n" + "=" * 70)
    print("COMPLETE CONTEXTUAL TRANSLITERATION")
    print("=" * 70)

    engine = ContextualTransliterator()
    for sample in ("samskara", "mango", "temple", "kota", "prema"):
        rendered = engine.transliterate(sample)
        print(f"{sample:15} → {rendered}")