telugu-language-tools 5.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of telugu-language-tools might be problematic. Click here for more details.
- telugu_engine/__init__.py +190 -0
- telugu_engine/cli.py +111 -0
- telugu_engine/enhanced_tense.py +854 -0
- telugu_engine/grammar.py +474 -0
- telugu_engine/phonetic_matrix.py +82 -0
- telugu_engine/tense_engine.py +391 -0
- telugu_engine/transliterator.py +692 -0
- telugu_engine/v3_validator.py +413 -0
- telugu_language_tools-5.0.4.dist-info/METADATA +398 -0
- telugu_language_tools-5.0.4.dist-info/RECORD +13 -0
- telugu_language_tools-5.0.4.dist-info/WHEEL +5 -0
- telugu_language_tools-5.0.4.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-5.0.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
"""
|
|
2
|
+
v3.0 Compliance Validator
|
|
3
|
+
==========================
|
|
4
|
+
|
|
5
|
+
Validates that Telugu text follows v3.0 modern standards:
|
|
6
|
+
- Script compliance (no archaic letters)
|
|
7
|
+
- Modern pronouns
|
|
8
|
+
- Modern verb patterns
|
|
9
|
+
- 4-case system
|
|
10
|
+
- Vowel harmony
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from telugu_engine.v3_validator import validate_v3_compliance
|
|
14
|
+
result = validate_v3_compliance("నేను వచ్చాను")
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
from typing import Any, Dict, List, Tuple
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ============================================================================
|
|
22
|
+
# SECTION 1: SCRIPT VALIDATION
|
|
23
|
+
# ============================================================================
|
|
24
|
+
|
|
25
|
+
# Archaic letters (v3.0 PROHIBITS these)
|
|
26
|
+
# Grouped by kind; the concatenation keeps the original element order.
ARCHAIC_LETTERS = (
    ['ఱ']            # banḍi ra: alveolar trill, archaic
    + ['ఌ', 'ౡ']     # vocalic l (confined to Sanskrit) and its obsolete long form
    + ['ౘ', 'ౙ']     # marginal consonants that have been replaced
    + ['ఀ', 'ౝ']     # archaic candrabindu and archaic nakara pollu
)
|
|
35
|
+
|
|
36
|
+
# Modern consonants (v3.0 ALLOWS these)
|
|
37
|
+
MODERN_CONSONANTS = [
    'క', 'ఖ', 'గ', 'ఘ', 'ఙ',
    'చ', 'ఛ', 'జ', 'ఝ', 'ఞ',
    'ట', 'ఠ', 'డ', 'ఢ', 'ణ',
    'త', 'థ', 'ద', 'ధ', 'న',
    'ప', 'ఫ', 'బ', 'భ', 'మ',
    'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ',
    # 'ళ' is needed by everyday modern words, including this module's own
    # 'వాళ్ళు' entry in MODERN_PRONOUNS.  It is appended last so the positions
    # of the pre-existing entries do not change.
    'ళ',
]
|
|
45
|
+
|
|
46
|
+
# Modern vowels (v3.0 ALLOWS these)
|
|
47
|
+
# Each independent vowel is a single code point, so splitting the strings
# character by character yields one list entry per vowel, in the same order.
MODERN_VOWELS = list(
    'అఆఇఈఉఊ'
    'ఋౠఎఏఒఓ'
    'ఐఔ'
)
|
|
52
|
+
|
|
53
|
+
# Modern pronouns (v3.0 standard)
|
|
54
|
+
MODERN_PRONOUNS = [
    "నేను",    # 1st person singular: I (modern)
    "నీవు",    # 2nd person, informal: you
    "మీరు",    # 2nd person, formal or plural: you
    "వాళ్ళు",  # 3rd person plural, human: they (modern)
    "మేము",    # 1st person plural: we (modern)
    "మనము",   # 1st person plural, inclusive: we
    "వాడు",    # 3rd person singular: he
    "అది",     # 3rd person singular: it
]
|
|
64
|
+
|
|
65
|
+
# Archaic pronouns (v3.0 PROHIBITS these)
|
|
66
|
+
ARCHAIC_PRONOUNS = [
    "ఏను",      # old 1st person singular
    "ఈవు",      # old 2nd person
    "వాండ్రు",  # old 3rd person plural (human)
    "ఏము",      # old 1st person plural
]
|
|
72
|
+
|
|
73
|
+
# Modern verb patterns (v3.0 standard)
|
|
74
|
+
MODERN_VERB_PATTERNS = [
    # Endings
    "సినాను",    # "I did" (modern)
    "సినారు",    # "they did" (modern)
    # Complete forms
    "చేసినాను",  # "I did" (modern)
    "తిన్నాను",  # "I ate" (modern)
    "వచ్చాను",   # "I came" (modern)
]
|
|
81
|
+
|
|
82
|
+
# Archaic verb patterns (v3.0 PROHIBITS these)
|
|
83
|
+
ARCHAIC_VERB_PATTERNS = [
    "చేసితిని",   # old past pattern
    "చేసితిరి",   # old past, plural
    "తినితిని",   # old past
    "వచ్చితిని",  # old past
]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ============================================================================
|
|
92
|
+
# SECTION 2: VALIDATION FUNCTIONS
|
|
93
|
+
# ============================================================================
|
|
94
|
+
|
|
95
|
+
def validate_script(text: str) -> Tuple[bool, List[str]]:
    """
    Validate script compliance (no archaic letters, only known characters).

    Each archaic letter present in ``text`` is reported once.  Every other
    distinct character is then checked against the modern consonants, vowels
    and dependent signs; anything else is reported as unknown.  ASCII
    letters, digits, common punctuation and all whitespace are ignored.

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors).  Archaic-letter errors come
        first (in ARCHAIC_LETTERS order), followed by unknown-character
        errors in order of first appearance in ``text``.
    """
    errors = []

    # Check for archaic letters
    for letter in ARCHAIC_LETTERS:
        if letter in text:
            errors.append(f"Archaic letter found: {letter}")

    # Non-Telugu characters that are never judged.
    ignored = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:()[]{}"\'-')

    # Built once as a set instead of concatenating the lists per character.
    allowed = set(MODERN_CONSONANTS) | set(MODERN_VOWELS) | {
        'ం', 'ః', '్',
        'ి', 'ు', 'ె', 'ొ', 'ా', 'ీ', 'ూ', 'ే', 'ో', 'ై', 'ౌ',
        # Dependent forms of the vowels 'ఋ' and 'ౠ', which MODERN_VOWELS allows.
        'ృ', 'ౄ',
        # Used by this module's own 'వాళ్ళు' entry in MODERN_PRONOUNS.
        'ళ',
    }

    # Archaic letters were reported above; do not report them a second time
    # as "unknown", which would double the penalty for a single problem.
    archaic = set(ARCHAIC_LETTERS)

    # Walk the text in order so the error list is deterministic.
    seen = set()
    for char in text:
        if char in seen:
            continue
        seen.add(char)

        # Newlines and tabs are layout, not script content.
        if char in ignored or char.isspace():
            continue
        if char in archaic:
            continue
        if char not in allowed:
            errors.append(f"Unknown character: {char}")

    return len(errors) == 0, errors
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def validate_pronouns(text: str) -> Tuple[bool, List[str]]:
    """
    Validate modern pronouns (v3.0 standard).

    An archaic pronoun is reported only when it occurs as a complete word.
    Plain substring matching would also flag ordinary words that merely
    contain the same letters (for example 'ఏనుగు' contains 'ఏను' and
    'ఏముంది' contains 'ఏము').

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors), with errors listed in
        ARCHAIC_PRONOUNS order
    """
    # A word is a maximal run of characters from the Telugu Unicode block,
    # so surrounding spaces and punctuation never become part of a token.
    words = set(re.findall(r'[\u0C00-\u0C7F]+', text))

    errors = [
        f"Archaic pronoun found: {pronoun}"
        for pronoun in ARCHAIC_PRONOUNS
        if pronoun in words
    ]

    return len(errors) == 0, errors
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def validate_verbs(text: str) -> Tuple[bool, List[str]]:
    """
    Validate modern verb patterns.

    Reports every entry of ARCHAIC_VERB_PATTERNS that occurs anywhere in
    ``text``.  Only archaic forms are detected; the presence of modern
    forms is not required.

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors), with errors listed in
        ARCHAIC_VERB_PATTERNS order
    """
    # Substring search is intentional here: it finds the archaic form even
    # when further characters are attached to it.
    errors = [
        f"Archaic verb pattern found: {pattern}"
        for pattern in ARCHAIC_VERB_PATTERNS
        if pattern in text
    ]

    return len(errors) == 0, errors
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def validate_case_markers(text: str) -> Tuple[bool, List[str]]:
    """
    Validate 4-case system usage.

    Currently a placeholder: v3.0 allows flexibility in case marking and no
    rule is enforced yet, so every input is reported as valid.

    Args:
        text: Telugu text to validate (not inspected yet)

    Returns:
        Tuple of (is_valid, list_of_errors); currently always ``(True, [])``
    """
    # TODO: implement a real check.  The markers earmarked for it were
    # 'డు', 'ను', 'కు' and 'లో'.
    return True, []
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def validate_vowel_harmony(text: str) -> Tuple[bool, List[str]]:
    """
    Validate vowel harmony in text.

    Currently a placeholder: no harmony rule is enforced, so every input is
    reported as valid with an empty error list.

    Args:
        text: Telugu text to validate (not inspected yet)

    Returns:
        Tuple of (is_valid, list_of_errors); currently always ``(True, [])``
    """
    # TODO: a per-word scan for mixing front vowels (ఇ ఈ ఎ ఏ ఐ) with back
    # vowels (అ ఆ ఉ ఊ ఒ ఓ ఔ) was sketched here.  Such mixing is only
    # warning-worthy (exceptions exist) and this return type carries errors
    # only, so the scan had no effect on the result and was removed.
    return True, []
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ============================================================================
|
|
223
|
+
# SECTION 3: COMPREHENSIVE VALIDATION
|
|
224
|
+
# ============================================================================
|
|
225
|
+
|
|
226
|
+
def validate_v3_compliance(text: str) -> Dict[str, Any]:
    """
    Comprehensive v3.0 compliance check.

    Runs the script, pronoun, verb, case-marker and vowel-harmony checks in
    that order and merges their outcomes.

    Args:
        text: Telugu text to validate

    Returns:
        Dictionary with validation results:
            'is_compliant': True only if every check passed
            'score': float from 0.0 to 100.0; each error costs 5 points
            'errors': all error messages, in check order
            'warnings': list reserved for warnings (no check fills it yet)
            'checks': per-check {'valid': bool, 'errors': list} by name
    """
    results = {
        'is_compliant': True,
        'score': 100.0,  # 0-100 percentage
        'errors': [],
        'warnings': [],
        'checks': {}
    }

    # Run all checks
    checks = [
        ('script', validate_script),
        ('pronouns', validate_pronouns),
        ('verbs', validate_verbs),
        ('case_markers', validate_case_markers),
        ('vowel_harmony', validate_vowel_harmony)
    ]

    for check_name, check_func in checks:
        is_valid, errors = check_func(text)
        results['checks'][check_name] = {
            'valid': is_valid,
            'errors': errors
        }

        if not is_valid:
            results['is_compliant'] = False
            results['errors'].extend(errors)
            # Deduct score based on errors
            results['score'] -= len(errors) * 5

    # Clamp with float bounds so 'score' is always a float; with int bounds
    # min(100, 100.0) returns the int 100 and the floor returns the int 0.
    results['score'] = max(0.0, min(100.0, results['score']))

    return results
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def get_compliance_report(text: str) -> str:
    """
    Build a human-readable compliance report for ``text``.

    The report shows the text, the overall verdict and score, one status
    line per check, and then the collected errors and warnings (each section
    appears only when it has entries).

    Args:
        text: Telugu text

    Returns:
        Formatted report string (lines joined with newlines)
    """
    outcome = validate_v3_compliance(text)

    rule = "=" * 70
    verdict = '✓ COMPLIANT' if outcome['is_compliant'] else '✗ NON-COMPLIANT'

    report = [
        rule,
        " v3.0 COMPLIANCE REPORT",
        rule,
        "",
        f"Text: {text}",
        f"Overall: {verdict}",
        f"Score: {outcome['score']:.1f}/100",
        "",
        "Checks:",
    ]

    # One line per check, e.g. "case_markers" is shown as "Case Markers".
    for name, detail in outcome['checks'].items():
        mark = "✓" if detail['valid'] else "✗"
        report.append(f" {mark} {name.replace('_', ' ').title()}")

    # Errors first, then warnings; empty sections are skipped.
    sections = (
        ("Errors:", outcome['errors']),
        ("Warnings:", outcome['warnings']),
    )
    for heading, entries in sections:
        if entries:
            report.append("")
            report.append(heading)
            report.extend(f" - {entry}" for entry in entries)

    report.append("")
    report.append(rule)

    return "\n".join(report)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ============================================================================
|
|
319
|
+
# SECTION 4: TESTING UTILITIES
|
|
320
|
+
# ============================================================================
|
|
321
|
+
|
|
322
|
+
def test_v3_compliance():
    """
    Run the built-in sample texts through the v3.0 compliance check.

    Prints one result block per sample plus a summary line.

    Returns:
        True if every sample produced the expected verdict, else False
    """
    test_cases = [
        {
            'text': 'నేను వచ్చాను',
            'expected': True,
            'description': 'Modern pronoun and verb'
        },
        {
            'text': 'వాళ్ళు తిన్నారు',
            'expected': True,
            'description': 'Modern 3rd plural'
        },
        {
            'text': 'ఏను చేసితిని',
            'expected': False,
            'description': 'Archaic pronoun and verb'
        },
        {
            'text': 'రాము పుస్తకం చదువుతాడు',
            'expected': True,
            'description': 'Simple sentence'
        }
    ]

    banner = "=" * 70
    print("\n" + banner)
    print(" v3.0 COMPLIANCE TESTS")
    print(banner + "\n")

    # One boolean per sample: did the verdict match the expectation?
    outcomes = []

    for number, case in enumerate(test_cases, 1):
        sample = case['text']
        wanted = case['expected']

        report = validate_v3_compliance(sample)
        actual = report['is_compliant']

        matched = actual == wanted
        outcomes.append(matched)

        status = "✓ PASS" if matched else "✗ FAIL"
        wanted_label = 'Compliant' if wanted else 'Non-compliant'
        actual_label = 'Compliant' if actual else 'Non-compliant'

        print(f"{status} | Test {number}: {case['description']}")
        print(f" Text: {sample}")
        print(f" Expected: {wanted_label}, Got: {actual_label}")
        print(f" Score: {report['score']:.1f}/100")

        if report['errors']:
            print(f" Errors: {', '.join(report['errors'])}")

        print()

    passed = sum(1 for matched in outcomes if matched)
    failed = len(outcomes) - passed

    print(banner)
    print(f"Results: {passed} passed, {failed} failed out of {len(test_cases)} tests")
    print(banner + "\n")

    return failed == 0
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# ============================================================================
|
|
393
|
+
# SECTION 5: PUBLIC API
|
|
394
|
+
# ============================================================================
|
|
395
|
+
|
|
396
|
+
__all__ = [
    # Individual checks
    "validate_script",
    "validate_pronouns",
    "validate_verbs",
    "validate_case_markers",
    "validate_vowel_harmony",
    # Combined check and reporting
    "validate_v3_compliance",
    "get_compliance_report",
    # Self-test
    "test_v3_compliance",
]
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
# ============================================================================
|
|
409
|
+
# SECTION 6: MAIN
|
|
410
|
+
# ============================================================================
|
|
411
|
+
|
|
412
|
+
if __name__ == "__main__":
    # Propagate the self-test outcome as the process exit status so that a
    # failing run is visible to shells and CI (0 = all passed, 1 = failure).
    raise SystemExit(0 if test_v3_compliance() else 1)
|