telugu-language-tools 5.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of telugu-language-tools might be problematic. Click here for more details.

@@ -0,0 +1,413 @@
1
+ """
2
+ v3.0 Compliance Validator
3
+ ==========================
4
+
5
+ Validates that Telugu text follows v3.0 modern standards:
6
+ - Script compliance (no archaic letters)
7
+ - Modern pronouns
8
+ - Modern verb patterns
9
+ - 4-case system
10
+ - Vowel harmony
11
+
12
+ Usage:
13
+ from telugu_engine.v3_validator import validate_v3_compliance
14
+ result = validate_v3_compliance("నేను వచ్చాను")
15
+ """
16
+
17
import re
from typing import Any, Dict, List, Tuple
19
+
20
+
21
+ # ============================================================================
22
+ # SECTION 1: SCRIPT VALIDATION
23
+ # ============================================================================
24
+
25
# Archaic letters (v3.0 PROHIBITS these)
ARCHAIC_LETTERS = [
    'ఱ',  # Banḍi Ra (alveolar trill - archaic)
    'ఌ',  # Vocalic l (confined to Sanskrit)
    'ౡ',  # Long vocalic l (obsolete)
    'ౘ',  # Marginal consonant (replaced)
    'ౙ',  # Marginal consonant (replaced)
    'ఀ',  # Archaic candrabindu
    'ౝ',  # Archaic nakara pollu
]

# Modern consonants (v3.0 ALLOWS these)
MODERN_CONSONANTS = [
    'క', 'ఖ', 'గ', 'ఘ', 'ఙ',
    'చ', 'ఛ', 'జ', 'ఝ', 'ఞ',
    'ట', 'ఠ', 'డ', 'ఢ', 'ణ',
    'త', 'థ', 'ద', 'ధ', 'న',
    'ప', 'ఫ', 'బ', 'భ', 'మ',
    'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ',
    # Retroflex lateral: needed by everyday modern words, including the
    # pronoun 'వాళ్ళు' that MODERN_PRONOUNS lists below.
    'ళ',
]

# Modern vowels (v3.0 ALLOWS these)
MODERN_VOWELS = [
    'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ',
    'ఋ', 'ౠ', 'ఎ', 'ఏ', 'ఒ', 'ఓ',
    'ఐ', 'ఔ',
]

# Modern pronouns (v3.0 standard)
MODERN_PRONOUNS = [
    'నేను',  # I (modern)
    'నీవు',  # You (informal)
    'మీరు',  # You (formal/plural)
    'వాళ్ళు',  # They (modern, human)
    'మేము',  # We (modern)
    'మనము',  # We (inclusive)
    'వాడు',  # He
    'అది',  # It
]

# Archaic pronouns (v3.0 PROHIBITS these)
ARCHAIC_PRONOUNS = [
    'ఏను',  # Old 1st person
    'ఈవు',  # Old 2nd person
    'వాండ్రు',  # Old 3rd plural human
    'ఏము',  # Old 1st plural
]

# Modern verb patterns (v3.0 standard)
MODERN_VERB_PATTERNS = [
    'సినాను',  # I did (modern)
    'సినారు',  # They did (modern)
    'చేసినాను',  # I did (modern)
    'తిన్నాను',  # I ate (modern)
    'వచ్చాను',  # I came (modern)
]

# Archaic verb patterns (v3.0 PROHIBITS these)
ARCHAIC_VERB_PATTERNS = [
    'చేసితిని',  # Old past pattern
    'చేసితిరి',  # Old past plural
    'తినితిని',  # Old past
    'వచ్చితిని',  # Old past
]
89
+
90
+
91
+ # ============================================================================
92
+ # SECTION 2: VALIDATION FUNCTIONS
93
+ # ============================================================================
94
+
95
def validate_script(text: str) -> Tuple[bool, List[str]]:
    """
    Validate script compliance (no archaic letters).

    Two kinds of problems are reported:

    1. Each entry of ``ARCHAIC_LETTERS`` that occurs in ``text`` is reported
       once as ``"Archaic letter found: <letter>"``, in list order.
    2. Every other distinct character that is not whitespace, not in the
       ASCII allow-list below, and not a modern consonant, vowel or
       dependent sign is reported once as ``"Unknown character: <char>"``,
       in order of first appearance.

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    # Check for archaic letters
    archaic_found = [letter for letter in ARCHAIC_LETTERS if letter in text]
    errors.extend(f"Archaic letter found: {letter}" for letter in archaic_found)

    # Characters that are never script violations: ASCII letters, digits and
    # basic punctuation. Other whitespace (newline, tab, ...) is skipped via
    # str.isspace() below so multi-line input is not penalised.
    ignored = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:()[]{}"\'-')

    # Dependent signs (anusvara, visarga, virama and vowel signs).
    dependent_signs = [
        'ం', 'ః', '్', 'ి', 'ు', 'ె', 'ొ', 'ా', 'ీ', 'ూ', 'ే', 'ో', 'ై', 'ౌ',
        '\u0c43',  # vowel sign for ఋ, which MODERN_VOWELS allows
        '\u0c44',  # vowel sign for ౠ, which MODERN_VOWELS allows
    ]

    # Build the allow-list once instead of concatenating lists per character.
    allowed = set(MODERN_CONSONANTS) | set(MODERN_VOWELS) | set(dependent_signs)

    # Seed with the archaic letters already reported so that each of them
    # produces exactly one error (and therefore one score deduction).
    seen = set(archaic_found)

    # Walk the text itself (not a set) so the error order is deterministic.
    for char in text:
        if char in seen:
            continue
        seen.add(char)
        if char in ignored or char.isspace() or char in allowed:
            continue
        errors.append(f"Unknown character: {char}")

    return not errors, errors
122
+
123
+
124
def validate_pronouns(text: str) -> Tuple[bool, List[str]]:
    """
    Validate modern pronouns (v3.0 standard).

    Each entry of ``ARCHAIC_PRONOUNS`` is searched for as a whole word: a
    match must not be directly preceded or followed by another character
    from the Telugu Unicode block (U+0C00-U+0C7F). Plain substring matching
    would flag unrelated modern words that merely start with the same
    letters, e.g. 'ఏనుగు' (contains 'ఏను') or 'ఏముంది' (contains 'ఏము').

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []
    telugu_char = r'[\u0C00-\u0C7F]'

    # Check for archaic pronouns
    for pronoun in ARCHAIC_PRONOUNS:
        whole_word = rf'(?<!{telugu_char}){re.escape(pronoun)}(?!{telugu_char})'
        if re.search(whole_word, text):
            errors.append(f"Archaic pronoun found: {pronoun}")

    return not errors, errors
142
+
143
+
144
def validate_verbs(text: str) -> Tuple[bool, List[str]]:
    """
    Validate modern verb patterns.

    The text fails when it contains any entry of ``ARCHAIC_VERB_PATTERNS``
    as a substring; one error is produced per matching pattern, in list
    order. No positive check for modern forms is performed.

    Args:
        text: Telugu text to validate

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    # Check for archaic verb patterns
    errors = [
        f"Archaic verb pattern found: {pattern}"
        for pattern in ARCHAIC_VERB_PATTERNS
        if pattern in text
    ]

    return not errors, errors
168
+
169
+
170
def validate_case_markers(text: str) -> Tuple[bool, List[str]]:
    """
    Validate 4-case system usage.

    This is currently a placeholder: the text is not inspected and the
    check always passes with an empty error list.

    Args:
        text: Telugu text to validate (not examined yet)

    Returns:
        Tuple of (is_valid, list_of_errors); always ``(True, [])``
    """
    # NOTE: the case markers a future implementation is expected to look
    # for are 'డు', 'ను', 'కు' and 'లో'. Until that exists, v3.0 allows
    # flexibility in case marking, so nothing is rejected here.
    return True, []
189
+
190
+
191
def validate_vowel_harmony(text: str) -> Tuple[bool, List[str]]:
    """
    Validate vowel harmony in text.

    This is currently a placeholder: no harmony violation is ever reported,
    so the check always passes with an empty error list.

    Args:
        text: Telugu text to validate (not examined yet)

    Returns:
        Tuple of (is_valid, list_of_errors); always ``(True, [])``
    """
    # NOTE: the earlier sketch classified the independent vowels as
    #   front: ఇ ఈ ఎ ఏ ఐ        back: అ ఆ ఉ ఊ ఒ ఓ ఔ
    # and looked for words longer than two characters mixing both classes,
    # but deliberately discarded the result because such mixing is at most
    # a warning (exceptions exist) and this function can only return errors.
    # That computation had no effect and has been dropped. A real
    # implementation would presumably also need to consider dependent vowel
    # signs, since independent vowels mostly occur word-initially.
    return True, []
220
+
221
+
222
+ # ============================================================================
223
+ # SECTION 3: COMPREHENSIVE VALIDATION
224
+ # ============================================================================
225
+
226
def validate_v3_compliance(text: str) -> Dict[str, Any]:
    """
    Comprehensive v3.0 compliance check.

    Runs the script, pronoun, verb, case-marker and vowel-harmony checks in
    that order and aggregates their outcomes.

    Args:
        text: Telugu text to validate

    Returns:
        Dictionary with validation results:
            'is_compliant': False as soon as any check fails
            'score': float in [0.0, 100.0]; 5 points are deducted per error
                reported by a failing check
            'errors': all errors from failing checks, in check order
            'warnings': always empty at present (no check emits warnings)
            'checks': per-check {'valid': bool, 'errors': list} keyed by name
    """
    results = {
        'is_compliant': True,
        'score': 100.0,  # 0-100 percentage
        'errors': [],
        'warnings': [],
        'checks': {},
    }

    # Run all checks
    checks = [
        ('script', validate_script),
        ('pronouns', validate_pronouns),
        ('verbs', validate_verbs),
        ('case_markers', validate_case_markers),
        ('vowel_harmony', validate_vowel_harmony),
    ]

    for check_name, check_func in checks:
        is_valid, errors = check_func(text)
        results['checks'][check_name] = {
            'valid': is_valid,
            'errors': errors,
        }

        if not is_valid:
            results['is_compliant'] = False
            results['errors'].extend(errors)
            # Deduct score based on errors
            results['score'] -= len(errors) * 5

    # Clamp with float bounds so 'score' stays a float even when it bottoms
    # out (max(0, -5.0) would otherwise yield the int 0).
    results['score'] = max(0.0, min(100.0, results['score']))

    return results
270
+
271
+
272
def get_compliance_report(text: str) -> str:
    """
    Generate a human-readable compliance report.

    The report is built from the result of ``validate_v3_compliance(text)``:
    a banner, the text with its overall verdict and score, one line per
    check, then optional "Errors:" and "Warnings:" sections (each only when
    non-empty), closed by a rule line.

    Args:
        text: Telugu text

    Returns:
        Formatted report string (lines joined with newlines)
    """
    outcome = validate_v3_compliance(text)
    rule = "="*70
    verdict = '✓ COMPLIANT' if outcome['is_compliant'] else '✗ NON-COMPLIANT'

    report = [
        rule,
        " v3.0 COMPLIANCE REPORT",
        rule,
        "",
        f"Text: {text}",
        f"Overall: {verdict}",
        f"Score: {outcome['score']:.1f}/100",
        "",
        "Checks:",
    ]

    # One status line per check, in the order the checks were run.
    for name, detail in outcome['checks'].items():
        mark = "✓" if detail['valid'] else "✗"
        report.append(f" {mark} {name.replace('_', ' ').title()}")

    # Optional sections: each is preceded by a blank line and skipped when
    # there is nothing to list.
    sections = (
        ("Errors:", outcome['errors']),
        ("Warnings:", outcome['warnings']),
    )
    for heading, entries in sections:
        if entries:
            report.append("")
            report.append(heading)
            report.extend(f" - {entry}" for entry in entries)

    report.append("")
    report.append(rule)

    return "\n".join(report)
316
+
317
+
318
+ # ============================================================================
319
+ # SECTION 4: TESTING UTILITIES
320
+ # ============================================================================
321
+
322
def test_v3_compliance():
    """
    Test v3.0 compliance on sample texts.

    Runs ``validate_v3_compliance`` on a fixed set of samples, prints a
    PASS/FAIL line with details for each, and prints a summary.

    Returns:
        True when every sample produced the expected compliance verdict,
        False otherwise
    """
    test_cases = [
        {
            'text': 'నేను వచ్చాను',
            'expected': True,
            'description': 'Modern pronoun and verb',
        },
        {
            'text': 'వాళ్ళు తిన్నారు',
            'expected': True,
            'description': 'Modern 3rd plural',
        },
        {
            'text': 'ఏను చేసితిని',
            'expected': False,
            'description': 'Archaic pronoun and verb',
        },
        {
            'text': 'రాము పుస్తకం చదువుతాడు',
            'expected': True,
            'description': 'Simple sentence',
        },
    ]

    print("\n" + "="*70)
    print(" v3.0 COMPLIANCE TESTS")
    print("="*70 + "\n")

    passed = 0
    failed = 0

    for i, test in enumerate(test_cases, 1):
        text = test['text']
        expected = test['expected']
        desc = test['description']

        results = validate_v3_compliance(text)
        is_compliant = results['is_compliant']

        if is_compliant == expected:
            status = "✓ PASS"
            passed += 1
        else:
            status = "✗ FAIL"
            failed += 1

        print(f"{status} | Test {i}: {desc}")
        print(f" Text: {text}")
        print(f" Expected: {'Compliant' if expected else 'Non-compliant'}, "
              f"Got: {'Compliant' if is_compliant else 'Non-compliant'}")
        print(f" Score: {results['score']:.1f}/100")

        if results['errors']:
            print(f" Errors: {', '.join(results['errors'])}")

        print()

    print("="*70)
    print(f"Results: {passed} passed, {failed} failed out of {len(test_cases)} tests")
    print("="*70 + "\n")

    return failed == 0


# This is a demo runner that is part of the public API, not a pytest test.
# Its name matches pytest's ``test_*`` pattern, so a test module that imports
# it would otherwise collect it (and be warned about the non-None return).
test_v3_compliance.__test__ = False
390
+
391
+
392
+ # ============================================================================
393
+ # SECTION 5: PUBLIC API
394
+ # ============================================================================
395
+
396
# Names exported by ``from <module> import *``.
__all__ = [
    # Individual checks
    "validate_script",
    "validate_pronouns",
    "validate_verbs",
    "validate_case_markers",
    "validate_vowel_harmony",
    # Aggregate check and reporting
    "validate_v3_compliance",
    "get_compliance_report",
    # Built-in self-test runner
    "test_v3_compliance",
]
406
+
407
+
408
+ # ============================================================================
409
+ # SECTION 6: MAIN
410
+ # ============================================================================
411
+
412
if __name__ == "__main__":
    # Surface the self-test outcome as the process exit status (0 = all
    # samples matched, 1 = at least one mismatch) instead of discarding it.
    raise SystemExit(0 if test_v3_compliance() else 1)