telugu-language-tools 4.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telugu_language_tools-4.0.2.dist-info/METADATA +956 -0
- telugu_language_tools-4.0.2.dist-info/RECORD +14 -0
- telugu_language_tools-4.0.2.dist-info/WHEEL +5 -0
- telugu_language_tools-4.0.2.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-4.0.2.dist-info/top_level.txt +1 -0
- telugu_lib/__init__.py +197 -0
- telugu_lib/advanced.py +717 -0
- telugu_lib/cluster_generator.py +399 -0
- telugu_lib/context_rules.py +568 -0
- telugu_lib/enhanced_dictionary.py +516 -0
- telugu_lib/iso15919_mappings.py +430 -0
- telugu_lib/sentence_tools.py +214 -0
- telugu_lib/text_tools.py +108 -0
- telugu_lib/transliterate.py +972 -0
telugu_lib/advanced.py
ADDED
|
@@ -0,0 +1,717 @@
|
|
|
1
|
+
# ============================================================================
|
|
2
|
+
# PART 5: ADVANCED FEATURES & UTILITIES
|
|
3
|
+
# ============================================================================
|
|
4
|
+
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from typing import List, Dict, Tuple, Optional, Union
|
|
9
|
+
|
|
10
|
+
# Import existing transliteration functions
|
|
11
|
+
from .transliterate import (
|
|
12
|
+
eng_to_telugu_with_style,
|
|
13
|
+
telugu_to_eng,
|
|
14
|
+
semantic_match,
|
|
15
|
+
compare_old_new_alphabets,
|
|
16
|
+
get_semantic_dictionary
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
class TeluguEngineConfig:
    """Settings holder for the Telugu transliteration engine.

    Each constructor argument is stored unchanged on an attribute of the
    same name. A set of common English words is additionally built once per
    instance and kept in ``_common_english_words``.
    """

    def __init__(self,
                 default_style: str = "modern",
                 enable_semantic_cache: bool = True,
                 enable_transliteration_cache: bool = True,
                 auto_detect_language: bool = True,
                 preserve_english_words: bool = True,
                 english_word_threshold: float = 0.8):
        """
        Args:
            default_style: Alphabet style used when a caller gives none
                ("modern", "classical", "hybrid")
            enable_semantic_cache: Flag for caching semantic dictionary lookups
            enable_transliteration_cache: Flag for caching transliteration results
            auto_detect_language: Flag for automatic input-language detection
            preserve_english_words: Flag for leaving common English words as-is
            english_word_threshold: Threshold for detecting English words (0-1)
        """
        # Word list first; it does not depend on any of the other settings.
        self._common_english_words = self._load_common_english_words()
        self.english_word_threshold = english_word_threshold
        self.preserve_english_words = preserve_english_words
        self.auto_detect_language = auto_detect_language
        self.enable_transliteration_cache = enable_transliteration_cache
        self.enable_semantic_cache = enable_semantic_cache
        self.default_style = default_style

    def _load_common_english_words(self) -> set:
        """Build the set of common English words that are never transliterated."""
        # Whitespace-separated on purpose; repeated words simply collapse
        # when the list is turned into a set.
        vocabulary = (
            "the is are was were be been being "
            "have has had do does did will would "
            "could should may might must shall "
            "and or but if then than because so "
            "a an in on at to for of with by "
            "from up about into through during before "
            "after above below between among i you he "
            "she it we they me him her us them "
            "my your his her its our their this that "
            "these those what which who when where why "
            "how all any both each few more most other "
            "some such no nor not only own same than "
            "too very can just but also even ever still "
            "yet"
        )
        return set(vocabulary.split())
|
|
64
|
+
|
|
65
|
+
# Global configuration instance
|
|
66
|
+
# Shared settings object: set_config() mutates it in place, get_config()
# returns it, and transliterate() consults it on every call.
_config = TeluguEngineConfig()
|
|
67
|
+
|
|
68
|
+
def set_config(**kwargs):
    """Update attributes of the global configuration object in place.

    Every name is validated before anything is assigned, so a call that
    contains an unknown name raises without applying any of the other
    values it was given.

    Args:
        **kwargs: Existing attribute names of the global config mapped to
            their new values.

    Raises:
        ValueError: If a name is not an existing attribute of the config;
            the message names the first such key.
    """
    # Validate up front: assigning while iterating would leave the config
    # half-updated when a later key turns out to be invalid.
    unknown = [key for key in kwargs if not hasattr(_config, key)]
    if unknown:
        raise ValueError(f"Unknown config parameter: {unknown[0]}")

    for key, value in kwargs.items():
        setattr(_config, key, value)
|
|
76
|
+
|
|
77
|
+
def get_config():
    """Return the shared global configuration object itself (not a copy)."""
    return _config
|
|
80
|
+
|
|
81
|
+
@lru_cache(maxsize=1024)
def cached_eng_to_telugu(text: str, style: str) -> str:
    """Memoized front for ``eng_to_telugu_with_style``.

    Results are keyed on the ``(text, style)`` argument pair and held in an
    LRU cache of at most 1024 entries.
    """
    return eng_to_telugu_with_style(text, style)
|
|
85
|
+
|
|
86
|
+
@lru_cache(maxsize=1024)
def cached_telugu_to_eng(text: str) -> str:
    """Memoized front for ``telugu_to_eng``.

    Results are keyed on ``text`` and held in an LRU cache of at most 1024
    entries.
    """
    return telugu_to_eng(text)
|
|
90
|
+
|
|
91
|
+
def transliterate(text: str, style: Optional[str] = None) -> str:
    """
    Transliterate text in whichever direction the input calls for.

    With language auto-detection enabled in the global config, any input
    containing a character in U+0C00..U+0C7F is sent through the
    Telugu-to-English path (``style`` plays no part there). Everything else
    goes English-to-Telugu. The cached wrappers are used when the
    transliteration cache is enabled.

    Args:
        text: Text to transliterate (English or Telugu)
        style: Alphabet style; falls back to the configured default style
            when falsy

    Returns:
        Transliterated text; "" for empty or whitespace-only input; the
        input unchanged when it is a single common English word and word
        preservation is enabled
    """
    if not (text and text.strip()):
        return ""

    # Telugu input: reverse direction.
    if _config.auto_detect_language and any('\u0C00' <= ch <= '\u0C7F' for ch in text):
        to_english = cached_telugu_to_eng if _config.enable_transliteration_cache else telugu_to_eng
        return to_english(text)

    chosen_style = style if style else _config.default_style

    # Only a lone word is ever preserved; multi-word input is always converted.
    if _config.preserve_english_words:
        tokens = text.split()
        if len(tokens) == 1 and tokens[0].lower() in _config._common_english_words:
            return text

    to_telugu = cached_eng_to_telugu if _config.enable_transliteration_cache else eng_to_telugu_with_style
    return to_telugu(text, chosen_style)
|
|
125
|
+
|
|
126
|
+
def batch_transliterate(items: List[str],
                        style: Optional[str] = None,
                        show_progress: bool = False) -> List[str]:
    """
    Run ``transliterate`` over every string in a list, in order.

    Args:
        items: List of strings to transliterate
        style: Alphabet style passed through to ``transliterate``
        show_progress: Print a progress line before every 100th item
            (including the first)

    Returns:
        List of transliterated strings, one per input item
    """
    item_count = len(items)
    converted = []

    for position, entry in enumerate(items):
        at_checkpoint = position % 100 == 0
        if show_progress and at_checkpoint:
            print(f"Processing... {position}/{item_count} ({position/item_count*100:.1f}%)")
        converted.append(transliterate(entry, style))

    return converted
|
|
149
|
+
|
|
150
|
+
def batch_transliterate_dict(data: Dict[str, str],
                             style: Optional[str] = None) -> Dict[str, str]:
    """
    Build a new dictionary whose values are the transliterated originals.

    Args:
        data: Dictionary whose values are the strings to transliterate
        style: Alphabet style passed through to ``transliterate``

    Returns:
        New dictionary with the same keys and transliterated values
    """
    converted = {}
    for key, value in data.items():
        converted[key] = transliterate(value, style)
    return converted
|
|
163
|
+
|
|
164
|
+
def process_file(input_path: str,
                 output_path: Optional[str] = None,
                 style: Optional[str] = None,
                 encoding: str = 'utf-8') -> str:
    """
    Read a file, transliterate its whole content in one call, optionally save it.

    Any exception raised while reading, converting or writing is caught,
    reported with a printed message, and turned into an empty-string result.

    Args:
        input_path: Input file path
        output_path: Output file path; nothing is written when falsy
        style: Alphabet style passed through to ``transliterate``
        encoding: Encoding used for both reading and writing

    Returns:
        Transliterated content, or "" if anything failed
    """
    try:
        with open(input_path, 'r', encoding=encoding) as source:
            original = source.read()

        converted = transliterate(original, style)

        if not output_path:
            return converted

        with open(output_path, 'w', encoding=encoding) as sink:
            sink.write(converted)
        print(f"✓ Written to {output_path}")
        return converted
    except Exception as e:
        print(f"✗ Error processing file {input_path}: {str(e)}")
        return ""
|
|
195
|
+
|
|
196
|
+
# ============================================================================
|
|
197
|
+
# PART 6: ENHANCED SEMANTIC DICTIONARY
|
|
198
|
+
# ============================================================================
|
|
199
|
+
|
|
200
|
+
def get_enhanced_semantic_dictionary() -> Dict[str, Dict[str, List[str]]]:
    """
    Extended English-to-Telugu dictionary grouped by category.

    A new dictionary is built on every call, so callers may mutate the
    result without affecting later calls.

    Returns:
        Dictionary with structure:
        {
            "category": {
                "english": ["telugu1", "telugu2", ...],
                ...
            }
        }
    """
    return {
        "pronouns": {
            "i": ["నేను"], "you": ["నువ్వు", "మీరు"], "he": ["అతను"],
            "she": ["ఆమె"], "it": ["అది"], "we": ["మనం", "మేము"],
            "they": ["వాళ్లు"], "me": ["నన్ను"], "him": ["అతన్ని"],
            "her": ["ఆమెను"], "us": ["మమ్మల్ని"], "them": ["వాళ్ళను"],
        },
        "verbs": {
            "go": ["వెళ్ళు"], "come": ["వచ్చు"], "eat": ["తిను"],
            "drink": ["తాగు"], "sleep": ["నిద్ర పో"], "see": ["చూడు"],
            "do": ["చేయి"], "say": ["చెప్పు"], "get": ["పొందు"],
            "make": ["తయారు చేయి"], "know": ["తెలుసు"], "think": ["అనుకో"],
        },
        "directions": {
            "up": ["పైకి"], "down": ["కిందికి"], "left": ["ఎడమ"],
            "right": ["కుడి"], "north": ["ఉత్తరం"], "south": ["దక్షిణం"],
            "east": ["తూర్పు"], "west": ["పశ్చిమం"], "front": ["ముందు"],
            "back": ["వెనుక"], "inside": ["లోపల"], "outside": ["బయట"],
        },
        "time": {
            "today": ["ఈరోజు"], "tomorrow": ["రేపు"], "yesterday": ["నిన్న"],
            "now": ["ఇప్పుడు"], "later": ["తర్వాత"], "soon": ["త్వరలో"],
            "morning": ["ఉదయం"], "evening": ["సాయంత్రం"], "night": ["రాత్రి"],
            # "day" is the second half of "today" (ఈ + రోజు) above.
            "day": ["రోజు"], "week": ["వారం"], "month": ["నెల"], "year": ["సంవత్సరం"],
        },
        "food": {
            "rice": ["అన్నం"], "bread": ["రొట్టె"], "milk": ["పాలు"],
            "water": ["నీరు"], "curry": ["కూర"], "vegetable": ["కూరగాయ"],
            "fruit": ["పండు"], "sweet": ["తీపి"], "salt": ["ఉప్పు"],
        },
        "emotions": {
            "happy": ["సంతోషం"], "sad": ["దుఃఖం"], "angry": ["కోపం"],
            "love": ["ప్రేమ"], "hate": ["ద్వేషం"], "fear": ["భయం"],
            "hope": ["ఆశ"], "worry": ["చింత"], "surprise": ["ఆశ్చర్యం"],
        },
        "technology": {
            "computer": ["కంప్యూటర్"], "phone": ["ఫోను"], "internet": ["ఇంటర్నెట్"],
            "email": ["ఈమెయిల్"], "website": ["వెబ్సైటు"], "video": ["వీడియో"],
            "audio": ["ఆడియో"], "data": ["డేటా"], "software": ["సాఫ్ట్వేర్"],
        },
        "common_phrases": {
            "good morning": ["శుభోదయం"], "good night": ["శుభరాత్రి"],
            "thank you": ["ధన్యవాదాలు"], "excuse me": ["క్షమించండి"],
            "how are you": ["మీరు ఎలా ఉన్నారు"], "i am fine": ["నేను బాగున్నాను"],
            "what is your name": ["మీ పేరు ఏమిటి"], "my name is": ["నా పేరు"],
        }
    }
|
|
260
|
+
|
|
261
|
+
def get_semantic_dictionary_by_category(category: str) -> Dict[str, List[str]]:
    """Return the entries of one category, or an empty dict if it does not exist."""
    return get_enhanced_semantic_dictionary().get(category, {})
|
|
265
|
+
|
|
266
|
+
def search_semantic_dictionary(query: str,
                               category: Optional[str] = None) -> List[Tuple[str, str]]:
    """
    Search the enhanced semantic dictionary by English key.

    Matching is a case-insensitive substring test of ``query`` against each
    English key, so an empty query matches every entry.

    Args:
        query: Search query (partial match)
        category: Specific category to search; all categories when falsy.
            A category that does not exist yields no results.

    Returns:
        List of (english, telugu) matches, one tuple per Telugu variant
    """
    enhanced = get_enhanced_semantic_dictionary()
    categories = [category] if category else list(enhanced)
    needle = query.lower()  # loop-invariant, so lowered once

    results = []
    for cat in categories:
        # .get() keeps an unknown category from raising KeyError, matching
        # how get_semantic_dictionary_by_category treats it.
        for eng, tel_list in enhanced.get(cat, {}).items():
            if needle in eng.lower():
                results.extend((eng, tel) for tel in tel_list)

    return results
|
|
293
|
+
|
|
294
|
+
# ============================================================================
|
|
295
|
+
# PART 7: COMPREHENSIVE TEST SUITE
|
|
296
|
+
# ============================================================================
|
|
297
|
+
|
|
298
|
+
def run_comprehensive_tests():
    """Run the built-in self-checks, print a report, and return True if all passed.

    Six groups run in order: basic transliteration, classical style,
    Telugu to English, semantic matching, sentence processing and edge
    cases. Each case prints one line; a summary and the list of failures
    follow at the end.
    """
    rule = "=" * 80
    passed = 0
    failures = []

    def record(ok, pass_line, fail_line, failure_note):
        """Print the line for one case and update the tallies."""
        nonlocal passed
        if ok:
            print(pass_line)
            passed += 1
        else:
            print(fail_line)
            failures.append(failure_note)

    print(rule)
    print("COMPREHENSIVE TEST SUITE")
    print(rule)

    # Test 1: Basic transliteration
    print("\n[TEST 1] Basic Transliteration")
    basic_cases = [
        ("rama", "రామ", "modern"),
        ("krishna", "కృష్ణ", "modern"),
        ("lakshmi", "లక్ష్మి", "modern"),
        ("shiva", "శివ", "modern"),
        ("ganesha", "గణేశ", "modern"),
        ("anjaneya", "అంజనేయ", "modern"),
    ]
    for eng, expected, style in basic_cases:
        result = eng_to_telugu_with_style(eng, style)
        record(
            result == expected,
            f" ✓ {eng:12} → {result}",
            f" ✗ {eng:12} → {result} (expected {expected})",
            f"Basic: {eng} → {result} != {expected}",
        )

    # Test 2: Classical style
    print("\n[TEST 2] Classical Style")
    classical_cases = [
        ("rama", "రామ", "classical"),
        ("karma", "కర్మ", "classical"),
        ("yantra", "యంత్ర", "classical"),
    ]
    for eng, expected, style in classical_cases:
        result = eng_to_telugu_with_style(eng, style)
        record(
            result == expected,
            f" ✓ {eng:12} → {result}",
            f" ✗ {eng:12} → {result} (expected {expected})",
            f"Classical: {eng} → {result} != {expected}",
        )

    # Test 3: Telugu to English
    print("\n[TEST 3] Telugu to English")
    telugu_cases = [
        ("కృష్ణ", "krishna"),
        ("ఎవరు", "evaru"),
        ("లక్ష్మి", "lakshmi"),
        ("రామ", "rama"),
        ("శివ", "shiva"),
    ]
    for tel, expected in telugu_cases:
        result = telugu_to_eng(tel)
        record(
            result == expected,
            f" ✓ {tel:8} → {result}",
            f" ✗ {tel:8} → {result} (expected {expected})",
            f"Tel→Eng: {tel} → {result} != {expected}",
        )

    # Test 4: Semantic matching (any one expected variant is enough)
    print("\n[TEST 4] Semantic Matching")
    semantic_cases = [
        ("who", ["ఎవరు", "ఎవరో"]),
        ("mother", ["అమ్మ", "తల్లి"]),
        ("krishna", ["కృష్ణ", "కృష్ణుడు"]),
    ]
    for eng, expected_list in semantic_cases:
        matches = semantic_match(eng)['matches']
        record(
            any(exp in matches for exp in expected_list),
            f" ✓ {eng:10} → {matches}",
            f" ✗ {eng:10} → {matches} (expected one of {expected_list})",
            f"Semantic: {eng} → {matches} not in {expected_list}",
        )

    # Test 5: Sentence processing
    print("\n[TEST 5] Sentence Processing")
    from .transliterate import eng_to_telugu_sentence
    sentence_cases = [
        ("hello world", "హలో వర్ల్ద"),
        ("who is rama", "ఎవరు ఇస్ రామ"),
        ("thank you", "ధన్యవాదాలు"),
    ]
    for eng, expected in sentence_cases:
        result = eng_to_telugu_sentence(eng)
        record(
            result == expected,
            f" ✓ '{eng}' → '{result}'",
            f" ✗ '{eng}' → '{result}' (expected '{expected}')",
            f"Sentence: '{eng}' → '{result}' != '{expected}'",
        )

    # Test 6: Edge cases
    print("\n[TEST 6] Edge Cases")
    edge_cases = [
        ("", ""),  # Empty string
        (" ", ""),  # Whitespace
        ("123", "123"),  # Numbers
        ("hello!", "హలో!"),  # Punctuation
        ("mixed123", "మిక్సెడ్123"),  # Alphanumeric
    ]
    for eng, expected in edge_cases:
        result = eng_to_telugu_sentence(eng)
        record(
            result == expected,
            f" ✓ '{eng}' → '{result}'",
            f" ✗ '{eng}' → '{result}' (expected '{expected}')",
            f"Edge: '{eng}' → '{result}' != '{expected}'",
        )

    # Summary: every failed case added exactly one entry to ``failures``.
    failed = len(failures)
    total = passed + failed
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)
    print(f"Total Tests: {total}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    print(f"Success Rate: {passed/total*100:.1f}%")

    if failures:
        print("\nFailed Tests:")
        for failure in failures:
            print(f" - {failure}")

    print(rule)

    return failed == 0
|
|
442
|
+
|
|
443
|
+
# ============================================================================
|
|
444
|
+
# PART 8: COMMAND-LINE INTERFACE
|
|
445
|
+
# ============================================================================
|
|
446
|
+
|
|
447
|
+
def main_cli(argv=None):
    """Command-line interface for Telugu transliteration.

    Parses the arguments and runs exactly one mode, chosen in this order of
    precedence: ``--test``, ``--compare``, ``--search``, ``--batch``,
    ``--file``, ``--sentence``, ``--text``. If none is given, the help text
    is printed.

    Args:
        argv: Argument list to parse. ``None`` is handed to argparse
            unchanged, which then reads the process command line.

    Raises:
        SystemExit: Always. The code is 0 after a completed mode (1 for
            ``--test`` when a test failed), 1 when no mode was selected,
            and whatever argparse chooses for invalid arguments.
    """
    import argparse
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # ``site`` module (missing under ``python -S`` and in frozen builds) and
    # closes sys.stdin before raising, which hurts programmatic callers.
    import sys

    parser = argparse.ArgumentParser(
        description="Telugu Library v2.1 - Transliteration Engine",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python -m telugu_lib.advanced --text "krishna"
python -m telugu_lib.advanced --text "కృష్ణ" --reverse
python -m telugu_lib.advanced --file input.txt --output output.txt
python -m telugu_lib.advanced --style classical --sentence "rama is great"
python -m telugu_lib.advanced --test
python -m telugu_lib.advanced --search "who"
"""
    )

    parser.add_argument("--text", "-t", help="Text to transliterate")
    parser.add_argument("--file", "-f", help="Input file path")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--reverse", "-r", action="store_true", help="Telugu to English")
    parser.add_argument("--style", "-s", choices=["modern", "classical", "hybrid"],
                        default="modern", help="Alphabet style")
    parser.add_argument("--sentence", help="Process as sentence")
    parser.add_argument("--test", action="store_true", help="Run comprehensive tests")
    parser.add_argument("--search", help="Search semantic dictionary")
    parser.add_argument("--category", help="Filter by category (with --search)")
    parser.add_argument("--compare", action="store_true", help="Show alphabet comparison")
    parser.add_argument("--batch", nargs="+", help="Batch transliterate multiple texts")

    args = parser.parse_args(argv)

    # Handle different modes
    if args.test:
        success = run_comprehensive_tests()
        sys.exit(0 if success else 1)

    elif args.compare:
        compare_old_new_alphabets()
        sys.exit(0)

    elif args.search:
        results = search_semantic_dictionary(args.search, args.category)
        if results:
            print(f"\nSearch results for '{args.search}':")
            for eng, tel in results:
                print(f" {eng:20} → {tel}")
        else:
            print(f"No results found for '{args.search}'")
        sys.exit(0)

    elif args.batch:
        results = batch_transliterate(args.batch, args.style)
        print("\nBatch Transliteration Results:")
        for orig, trans in zip(args.batch, results):
            print(f" {orig:20} → {trans}")
        sys.exit(0)

    elif args.file:
        result = process_file(args.file, args.output, args.style)
        # With --output the result went to the file; otherwise show it.
        if not args.output:
            print(result)
        sys.exit(0)

    elif args.sentence:
        from .transliterate import eng_to_telugu_sentence
        result = eng_to_telugu_sentence(args.sentence, args.style)
        print(result)
        sys.exit(0)

    elif args.text:
        if args.reverse:
            result = telugu_to_eng(args.text)
        else:
            result = transliterate(args.text, args.style)
        print(result)
        sys.exit(0)

    else:
        parser.print_help()
        sys.exit(1)
|
|
529
|
+
|
|
530
|
+
# ============================================================================
|
|
531
|
+
# PART 9: WEB API INTERFACE (Flask)
|
|
532
|
+
# ============================================================================
|
|
533
|
+
|
|
534
|
+
def create_web_api():
    """
    Create a Flask web API for transliteration.
    Usage: python -m telugu_lib.advanced --serve

    Flask is imported only when this function runs. The returned app
    exposes ``POST /transliterate``, ``POST /semantic``, ``GET /search``
    and ``GET /health``.

    Returns:
        The configured Flask application, or None (after printing an
        install hint) when Flask cannot be imported.
    """
    try:
        from flask import Flask, request, jsonify
    except ImportError:
        print("Flask is not installed. Install with: pip install flask")
        return None

    app = Flask(__name__)

    def _text_payload():
        """Return ``(data, None)`` for a usable JSON body, else ``(None, error_response)``."""
        data = request.get_json()
        # A JSON list or string can also satisfy ``'text' in data`` and would
        # then fail on ``data['text']``, so require a JSON object explicitly.
        if not isinstance(data, dict) or 'text' not in data:
            return None, (jsonify({'error': 'Missing required field: text'}), 400)
        # The transliteration functions are typed for str; reject other JSON
        # types as a client error instead of failing later with a 500.
        if not isinstance(data['text'], str):
            return None, (jsonify({'error': 'Field text must be a string'}), 400)
        return data, None

    @app.route('/transliterate', methods=['POST'])
    def api_transliterate():
        """API endpoint for transliteration."""
        data, error = _text_payload()
        if error is not None:
            return error

        text = data['text']
        style = data.get('style', 'modern')
        direction = data.get('direction', 'auto')

        try:
            if direction == 'telugu_to_eng':
                result = telugu_to_eng(text)
            elif direction == 'eng_to_telugu':
                result = eng_to_telugu_with_style(text, style)
            else:  # auto-detect
                result = transliterate(text, style)

            return jsonify({
                'success': True,
                'text': text,
                'result': result,
                'style': style
            })
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    @app.route('/semantic', methods=['POST'])
    def api_semantic():
        """API endpoint for semantic matching."""
        data, error = _text_payload()
        if error is not None:
            return error

        text = data['text']

        try:
            result = semantic_match(text)
            return jsonify({
                'success': True,
                'result': result
            })
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    @app.route('/search', methods=['GET'])
    def api_search():
        """API endpoint for dictionary search."""
        query = request.args.get('q', '')
        category = request.args.get('category')

        try:
            results = search_semantic_dictionary(query, category)
            return jsonify({
                'success': True,
                'query': query,
                'results': [{'english': eng, 'telugu': tel} for eng, tel in results]
            })
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    @app.route('/health', methods=['GET'])
    def api_health():
        """Health check endpoint."""
        return jsonify({
            'success': True,
            'status': 'healthy',
            # NOTE(review): hard-coded; it differs from the version in the
            # CLI description string - confirm which value is intended.
            'version': '3.5.2'
        })

    return app
|
|
621
|
+
|
|
622
|
+
def serve_web_api(host='localhost', port=5000, debug=False):
    """Build the web API and serve it until the server stops.

    Args:
        host: Interface passed to the app's ``run``
        port: Port passed to the app's ``run``
        debug: Debug flag passed to the app's ``run``

    Returns:
        True once ``app.run`` returns normally; False when the app could
        not be created or an exception was raised while starting/running.
    """
    app = create_web_api()
    if app is None:
        print("Error: Cannot start web API. Please install Flask: pip install flask")
        return False

    try:
        startup_lines = (
            f"Starting Telugu API server on http://{host}:{port}",
            "Endpoints:",
            " POST /transliterate - Transliterate text",
            " POST /semantic - Get semantic matches",
            " GET /search?q=<query> - Search dictionary",
            " GET /health - Health check",
        )
        for line in startup_lines:
            print(line)
        app.run(host=host, port=port, debug=debug)
    except Exception as e:
        print(f"Error starting web server: {e}")
        return False
    return True
|
|
641
|
+
|
|
642
|
+
# ============================================================================
|
|
643
|
+
# PART 10: PERFORMANCE MONITORING
|
|
644
|
+
# ============================================================================
|
|
645
|
+
|
|
646
|
+
class PerformanceMonitor:
    """Monitor and report transliteration performance.

    Counters live in the public ``stats`` dict: ``transliterations`` (timed
    operations completed through ``start``/``end``), ``cache_hits``,
    ``semantic_lookups`` and ``avg_time`` (mean duration in seconds of the
    timed operations).
    """

    def __init__(self):
        self.stats = {
            'transliterations': 0,
            'cache_hits': 0,
            'semantic_lookups': 0,
            'avg_time': 0.0
        }
        # None means no timing is in progress.
        self._start_time = None

    def start(self):
        """Start timing one operation."""
        import time
        # perf_counter is monotonic, so a system clock adjustment between
        # start() and end() cannot yield a negative duration.
        self._start_time = time.perf_counter()

    def end(self, operation: str = 'transliteration'):
        """End timing and fold the duration into the running average.

        Does nothing unless ``start`` was called since the previous ``end``.

        Args:
            operation: Accepted for interface compatibility; every timed
                operation is counted under ``transliterations``.
        """
        import time
        if self._start_time is None:
            return

        duration = time.perf_counter() - self._start_time
        completed = self.stats['transliterations']
        self.stats['avg_time'] = (self.stats['avg_time'] * completed + duration) / (completed + 1)
        # Without this increment the weight above stays 0 forever, making
        # avg_time just the last duration and the hit rate always 100%.
        self.stats['transliterations'] = completed + 1
        self._start_time = None

    def record_cache_hit(self):
        """Record a cache hit."""
        self.stats['cache_hits'] += 1

    def record_semantic_lookup(self):
        """Record a semantic lookup."""
        self.stats['semantic_lookups'] += 1

    def get_report(self) -> dict:
        """Get performance report.

        Returns:
            A new dict with every entry of ``stats`` plus
            ``total_operations`` (transliterations + cache hits) and
            ``cache_hit_rate`` (percentage of total operations that were
            cache hits; 0 when nothing was recorded).
        """
        total_ops = self.stats['transliterations'] + self.stats['cache_hits']
        hit_rate = (self.stats['cache_hits'] / total_ops * 100) if total_ops > 0 else 0

        return {
            **self.stats,
            'cache_hit_rate': hit_rate,
            'total_operations': total_ops
        }
|
|
689
|
+
|
|
690
|
+
# Global performance monitor
|
|
691
|
+
# Module-level monitor instance: read by get_performance_report() and
# replaced with a fresh one by reset_performance_stats().
_perf_monitor = PerformanceMonitor()
|
|
692
|
+
|
|
693
|
+
def get_performance_report():
    """Return the report dict produced by the module-level performance monitor."""
    return _perf_monitor.get_report()
|
|
696
|
+
|
|
697
|
+
def reset_performance_stats():
    """Discard all collected statistics by installing a brand-new monitor."""
    global _perf_monitor
    _perf_monitor = PerformanceMonitor()
|
|
701
|
+
|
|
702
|
+
# ============================================================================
|
|
703
|
+
# MAIN ENTRY POINT
|
|
704
|
+
# ============================================================================
|
|
705
|
+
|
|
706
|
+
if __name__ == "__main__":
    import sys

    if '--serve' not in sys.argv:
        # Normal CLI run; main_cli reads the command line itself.
        main_cli()
    else:
        # Web serve mode: optional positional host and port follow --serve.
        serve_args = sys.argv[sys.argv.index('--serve') + 1:]
        host = serve_args[0] if serve_args else 'localhost'
        port = int(serve_args[1]) if len(serve_args) > 1 else 5000
        serve_web_api(host, port)
|