telugu-language-tools 4.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
+ """
2
+ Comprehensive Telugu Consonant Cluster Generator
3
+ =================================================
4
+
5
+ Generates several hundred consonant cluster combinations programmatically.
6
+ Based on Telugu phonetic rules and common patterns.
7
+
8
+ Usage:
9
+ from telugu_lib.cluster_generator import get_all_clusters
10
+ clusters = get_all_clusters()
11
+ """
12
+
13
+ from .iso15919_mappings import get_iso_consonants, get_articulation_class
14
+
15
+
16
+ # ============================================================================
17
+ # MANUAL HIGH-PRIORITY CLUSTERS
18
+ # ============================================================================
19
+
20
def get_manual_clusters():
    """
    Return the hand-curated table of high-frequency consonant clusters.

    Keys are romanized clusters (ISO 15919 diacritics or their ASCII
    alternatives); values are the corresponding Telugu conjuncts.  These
    entries take precedence over algorithmically generated ones.

    Every key is listed exactly once.  Python keeps only the *last* value
    for a key repeated inside a dict display, so a repeat in a later
    section would silently override an earlier one.  In particular "mp"
    is kept in its anusvara form (ంప), consistent with "mb" (ంబ) and with
    the nasal section's rule, instead of being overridden by an explicit
    మ్ప further down.

    Returns:
        dict: A new ``{romanized_cluster: telugu_conjunct}`` dictionary
        on every call, so callers may mutate it freely.
    """
    return {
        # ==================
        # SPECIAL CLUSTERS (Must be exact)
        # ==================
        "kṣ": "క్ష",    # ksha (very common in Sanskrit loanwords)
        "ksh": "క్ష",   # ASCII alternative
        "jñ": "జ్ఞ",    # jna (common)
        "jn": "జ్ఞ",    # ASCII alternative
        "śr": "శ్ర",    # shra
        "shr": "శ్ర",   # ASCII alternative
        "tr": "త్ర",    # tra (dental, not retroflex ట్ర)
        "ttr": "త్త్ర",  # Complex triple

        # ==================
        # R-CLUSTERS (Extremely common in Telugu)
        # ==================
        "kr": "క్ర", "khr": "ఖ్ర", "gr": "గ్ర", "ghr": "ఘ్ర",
        "cr": "చ్ర", "chr": "ఛ్ర", "jr": "జ్ర", "jhr": "ఝ్ర",
        "ṭr": "ట్ర", "ṭhr": "ఠ్ర", "ḍr": "డ్ర", "ḍhr": "ఢ్ర",
        "Tr": "ట్ర", "Thr": "ఠ్ర", "Dr": "డ్ర", "Dhr": "ఢ్ర",  # ASCII
        "thr": "థ్ర", "dhr": "ధ్ర",
        "pr": "ప్ర", "phr": "ఫ్ర", "br": "బ్ర", "bhr": "భ్ర",
        "mr": "మ్ర", "yr": "య్ర", "vr": "వ్ర",
        "sr": "స్ర", "hr": "హ్ర",

        # ==================
        # Y-CLUSTERS (Common in Sanskrit derivatives)
        # ==================
        "ky": "క్య", "khy": "ఖ్య", "gy": "గ్య", "ghy": "ఘ్య",
        "cy": "చ్య", "chy": "ఛ్య", "jy": "జ్య", "jhy": "ఝ్య",
        "ṭy": "ట్య", "ṭhy": "ఠ్య", "ḍy": "డ్య", "ḍhy": "ఢ్య",
        "Ty": "ట్య", "Thy": "ఠ్య", "Dy": "డ్య", "Dhy": "ఢ్య",  # ASCII
        "ty": "త్య", "thy": "థ్య", "dy": "ద్య", "dhy": "ధ్య",
        "py": "ప్య", "phy": "ఫ్య", "by": "బ్య", "bhy": "భ్య",
        "my": "మ్య", "vy": "వ్య", "śy": "శ్య", "sy": "స్య",

        # ==================
        # L-CLUSTERS (Less common but important)
        # ==================
        "kl": "క్ల", "khl": "ఖ్ల", "gl": "గ్ల", "ghl": "ఘ్ల",
        "pl": "ప్ల", "phl": "ఫ్ల", "bl": "బ్ల", "bhl": "భ్ల",
        "tl": "త్ల", "thl": "థ్ల", "dl": "ద్ల", "dhl": "ధ్ల",
        "ml": "మ్ల", "vl": "వ్ల", "sl": "స్ల",

        # ==================
        # V-CLUSTERS (Sanskrit influence)
        # ==================
        "kv": "క్వ", "tv": "త్వ", "dv": "ద్వ", "sv": "స్వ",
        "hv": "హ్వ", "ṭv": "ట్వ", "Tv": "ట్వ",  # ASCII

        # ==================
        # NASAL + CONSONANT (With anusvara)
        # ==================
        # These use anusvara (ం) before homorganic consonants
        "ṅk": "ంక", "ṅkh": "ంఖ", "ṅg": "ంగ", "ṅgh": "ంఘ",
        "ñc": "ంచ", "ñch": "ంఛ", "ñj": "ంజ", "ñjh": "ంఝ",
        "ṇṭ": "ంట", "ṇṭh": "ంఠ", "ṇḍ": "ండ", "ṇḍh": "ంఢ",
        "nt": "ంత", "nth": "ంథ", "nd": "ంద", "ndh": "ంధ",
        "mp": "ంప", "mph": "ంఫ", "mb": "ంబ", "mbh": "ంభ",

        # ASCII alternatives for nasals ("mp"/"mb" are already ASCII above)
        "nk": "ంక", "ng": "ంగ", "nc": "ంచ", "nch": "ంచ", "nj": "ంజ",
        "nT": "ంట", "nTh": "ంఠ", "nD": "ండ", "nDh": "ంఢ",

        # ==================
        # GEMINATION (Double consonants)
        # ==================
        "kk": "క్క", "gg": "గ్గ", "cc": "చ్చ", "jj": "జ్జ",
        "ṭṭ": "ట్ట", "ḍḍ": "డ్డ",
        "TT": "ట్ట", "DD": "డ్డ",  # ASCII
        "tt": "త్త", "dd": "ద్ద", "pp": "ప్ప", "bb": "బ్బ",
        "mm": "మ్మ", "nn": "న్న", "ll": "ల్ల", "rr": "ర్ర",
        "ss": "స్స", "LL": "ళ్ళ",  # Retroflex L geminated

        # ==================
        # THREE-CONSONANT CLUSTERS (Complex)
        # ==================
        "str": "స్త్ర", "skr": "స్క్ర", "spr": "స్ప్ర",
        "ntr": "న్త్ర", "ndr": "న్ద్ర", "mbr": "మ్బ్ర",
        "mpr": "మ్ప్ర", "ṅkr": "ంక్ర", "nkr": "ంక్ర",
        "ndhr": "న్ధ్ర", "nthr": "న్థ్ర",
        "kṣm": "క్ష్మ", "kshm": "క్ష్మ",  # ASCII
        "kṣy": "క్ష్య", "kshy": "క్ష్య",  # ASCII
        "jñy": "జ్ఞ్య", "jny": "జ్ఞ్య",  # ASCII

        # ==================
        # S-CLUSTERS (English loanwords)
        # ==================
        # "sl" and "sy" are defined in the L- and Y-cluster sections.
        "sk": "స్క", "st": "స్ట", "sp": "స్ప", "sm": "స్మ",
        "sn": "స్న",
        "skh": "స్ఖ", "sth": "స్థ", "sph": "స్ఫ",

        # ==================
        # SH-CLUSTERS
        # ==================
        # "śy" is defined in the Y-cluster section.
        "śk": "శ్క", "śt": "శ్త", "śp": "శ్ప", "śm": "శ్మ",
        "śn": "శ్న", "śl": "శ్ల",
        "shk": "శ్క", "sht": "శ్త", "shp": "శ్ప", "shm": "శ్మ",  # ASCII

        # ==================
        # M-CLUSTERS (Less common)
        # ==================
        # "my", "mr" and "ml" are defined in the Y-, R- and L-cluster
        # sections; "mp" is defined once, with anusvara, in the nasal section.
        "mk": "మ్క", "mt": "మ్త", "mv": "మ్వ",

        # ==================
        # H-CLUSTERS (Rare)
        # ==================
        # "hr" and "hv" are defined in the R- and V-cluster sections.
        "hm": "హ్మ", "hn": "హ్న", "hy": "హ్య", "hl": "హ్ల",
    }
138
+
139
+
140
+ # ============================================================================
141
+ # ALGORITHMIC CLUSTER GENERATION
142
+ # ============================================================================
143
+
144
def generate_algorithmic_clusters():
    """
    Generate two-consonant clusters by combining fixed consonant lists.

    Every first-position consonant is paired with every second-position
    consonant; the Telugu value is ``first + virama + second``.  Romanized
    consonants missing from the ``get_iso_consonants("mixed")`` mapping are
    skipped, as are keys already present in the manual cluster table.

    Returns:
        dict: ``{romanized_cluster: telugu_conjunct}`` for the generated
        pairs only (manual clusters are not included).
    """
    consonants = get_iso_consonants("mixed")

    # Build the manual table once; it does not change between iterations,
    # so rebuilding it for every candidate pair is wasted work.
    manual_keys = get_manual_clusters().keys()

    # Consonants allowed in first position (ISO 15919 plus ASCII variants).
    first_consonants = ("k", "kh", "g", "gh", "c", "ch", "j", "jh",
                        "ṭ", "ṭh", "ḍ", "ḍh", "T", "Th", "D", "Dh",
                        "t", "th", "d", "dh", "p", "ph", "b", "bh",
                        "m", "n", "ṇ", "N", "y", "r", "l", "v",
                        "ś", "ṣ", "sh", "S", "s", "h")

    # Consonants that commonly appear as the second element.
    second_consonants = ("r", "y", "l", "v", "n", "m")

    clusters = {}
    for c1 in first_consonants:
        tel1 = consonants.get(c1)
        if not tel1:
            continue

        for c2 in second_consonants:
            tel2 = consonants.get(c2)
            if not tel2:
                continue

            cluster_key = c1 + c2
            # Don't override manual clusters.
            if cluster_key not in manual_keys:
                # "్" is the Telugu virama joining the two consonants.
                clusters[cluster_key] = tel1 + "్" + tel2

    return clusters
182
+
183
+
184
def generate_nasal_clusters():
    """
    Build the nasal + stop cluster table.

    For each stop consonant found in ``get_iso_consonants("mixed")`` the
    following keys are produced:

    * ``n<stop>`` and ``m<stop>``  -> anusvara (ం) + stop
    * ``na<stop>``                 -> న + virama + stop
    * ``ṇ<stop>`` and ``N<stop>``  -> anusvara + stop (retroflex stops only)

    Returns:
        dict: ``{romanized_cluster: telugu_text}``.
    """
    consonants = get_iso_consonants("mixed")

    retroflex_stops = ("ṭ", "ṭh", "ḍ", "ḍh", "T", "Th", "D", "Dh")
    # Stops that may follow a nasal, in velar/palatal/retroflex/dental/labial order.
    stops_after_nasal = (
        ("k", "kh", "g", "gh", "c", "ch", "j", "jh")
        + retroflex_stops
        + ("t", "th", "d", "dh", "p", "ph", "b", "bh")
    )

    table = {}
    for stop in stops_after_nasal:
        glyph = consonants.get(stop)
        if not glyph:
            continue

        anusvara_form = "ం" + glyph
        table["n" + stop] = anusvara_form
        table["m" + stop] = anusvara_form

        # Explicit dental nasal joined with a virama.
        # NOTE(review): the key contains the vowel "a" (e.g. "nat"), so a
        # longest-match lookup over plain text such as "nata" would hit it
        # and drop the vowel — confirm this key spelling is intended.
        table["na" + stop] = "న" + "్" + glyph

        if stop in retroflex_stops:
            table["ṇ" + stop] = anusvara_form
            table["N" + stop] = anusvara_form  # ASCII

    return table
216
+
217
+
218
def generate_s_clusters():
    """
    Build sibilant-initial clusters (s / ś / ṣ followed by a consonant).

    Each second consonant found in ``get_iso_consonants("mixed")`` is
    combined with five romanized sibilant spellings: dental ``s``, palatal
    ``ś`` (ASCII ``sh``) and retroflex ``ṣ`` (ASCII ``S``).

    Returns:
        dict: ``{romanized_cluster: telugu_conjunct}``.
    """
    consonants = get_iso_consonants("mixed")

    # (romanized sibilant, Telugu sibilant + virama), in emission order.
    sibilants = (
        ("s", "స్"),    # Dental s
        ("ś", "శ్"),    # Palatal ś
        ("sh", "శ్"),   # Palatal, ASCII
        ("ṣ", "ష్"),    # Retroflex ṣ
        ("S", "ష్"),    # Retroflex, ASCII
    )
    followers = ("k", "kh", "t", "th", "ṭ", "T", "p", "ph",
                 "m", "n", "l", "r", "y", "v")

    table = {}
    for follower in followers:
        glyph = consonants.get(follower)
        if not glyph:
            continue
        for roman, telugu_prefix in sibilants:
            table[roman + follower] = telugu_prefix + glyph

    return table
240
+
241
+
242
+ # ============================================================================
243
+ # MAIN CLUSTER AGGREGATION
244
+ # ============================================================================
245
+
246
def get_all_clusters(include_algorithmic=True):
    """
    Assemble the complete cluster library.

    The manual table is the base.  When ``include_algorithmic`` is true the
    generated tables (algorithmic pairs, nasal clusters, sibilant clusters,
    in that order) are layered underneath it: a generated entry is added
    only if its key is not already present.

    Args:
        include_algorithmic: Include automatically generated clusters.

    Returns:
        dict: ``{romanized_cluster: telugu_text}`` for all clusters.
    """
    library = get_manual_clusters().copy()
    if not include_algorithmic:
        return library

    generated_tables = (
        generate_algorithmic_clusters(),
        generate_nasal_clusters(),
        generate_s_clusters(),
    )
    for table in generated_tables:
        for roman, telugu in table.items():
            # Earlier sources win; never overwrite an existing key.
            library.setdefault(roman, telugu)

    return library
272
+
273
+
274
def get_clusters_by_type():
    """
    Get clusters organized by type for analysis.

    Each cluster from ``get_all_clusters()`` lands in exactly one category.
    Checks run from most to least specific so that no category is shadowed:

    1. ``special``     - the fixed set kṣ/ksh, jñ/jn, śr/shr
    2. ``gemination``  - two identical characters (e.g. "ll", "rr", "mm")
    3. ``r_/y_/l_/v_clusters`` - by final consonant
    4. ``nasal_clusters``      - by initial nasal (n, m, ṅ, ñ, ṇ, ASCII N)
    5. ``s_clusters``          - by initial sibilant (s, sh, ś, ṣ, ASCII S)
    6. ``other``               - everything else

    Returns:
        dict: ``{category_name: {romanized_cluster: telugu_text}}``; every
        category key is always present, possibly with an empty dict.
    """
    special_keys = {'kṣ', 'ksh', 'jñ', 'jn', 'śr', 'shr'}
    # (suffix, category) pairs, tested in this order.
    suffix_categories = (
        ('r', 'r_clusters'),
        ('y', 'y_clusters'),
        ('l', 'l_clusters'),
        ('v', 'v_clusters'),
    )
    # 'N' is the ASCII spelling of retroflex ṇ used by the nasal generator.
    nasal_prefixes = ('n', 'm', 'ṅ', 'ñ', 'ṇ', 'N')
    # 'sh' is already covered by the 's' prefix.
    sibilant_prefixes = ('s', 'ś', 'ṣ', 'S')

    categorized = {
        'r_clusters': {},
        'y_clusters': {},
        'l_clusters': {},
        'v_clusters': {},
        'nasal_clusters': {},
        'gemination': {},
        's_clusters': {},
        'special': {},
        'other': {}
    }

    for key, value in get_all_clusters().items():
        if key in special_keys:
            # Must precede the suffix checks: "śr"/"shr" end in "r".
            category = 'special'
        elif len(key) == 2 and key[0] == key[1]:
            # Must precede suffix/prefix checks: "ll", "rr", "mm", "nn"
            # would otherwise be claimed by the l/r/nasal categories.
            category = 'gemination'
        else:
            category = next(
                (name for suffix, name in suffix_categories
                 if key.endswith(suffix)),
                None,
            )
            if category is None:
                if key.startswith(nasal_prefixes):
                    category = 'nasal_clusters'
                elif key.startswith(sibilant_prefixes):
                    category = 's_clusters'
                else:
                    category = 'other'

        categorized[category][key] = value

    return categorized
316
+
317
+
318
# Cluster table shared by all match_longest_cluster() calls; built lazily
# on first use so importing the module stays cheap.
_MATCH_CLUSTER_TABLE = None


def _get_match_cluster_table():
    """Return the cluster table for matching, building it on first use."""
    global _MATCH_CLUSTER_TABLE
    if _MATCH_CLUSTER_TABLE is None:
        _MATCH_CLUSTER_TABLE = get_all_clusters()
    return _MATCH_CLUSTER_TABLE


def match_longest_cluster(text, position):
    """
    Match the longest valid cluster starting at position.

    Substrings of decreasing length (at most 5 characters) starting at
    ``position`` are looked up in the cluster table; the first hit wins.
    The table is built once and reused, because this function is meant to
    be called for every position of a word.

    Args:
        text: Input text string
        position: Starting index to check; must satisfy
            ``0 <= position < len(text)`` for a match to be possible.

    Returns:
        tuple: (matched_cluster_telugu, length_matched) or (None, 0).
        Out-of-range positions (including negative ones) yield (None, 0).
    """
    # A negative position would make the slice wrap around and report a
    # length that does not correspond to the matched text.
    if position < 0 or position >= len(text):
        return None, 0

    clusters = _get_match_cluster_table()

    # Check up to 5 characters (longest clusters are 3-4 chars)
    max_check = min(5, len(text) - position)

    # Try longest first
    for length in range(max_check, 0, -1):
        telugu = clusters.get(text[position:position + length])
        if telugu is not None:
            return telugu, length

    return None, 0
341
+
342
+
343
+ # ============================================================================
344
+ # STATISTICS AND VALIDATION
345
+ # ============================================================================
346
+
347
def print_cluster_statistics():
    """
    Print a summary of the cluster library to stdout.

    Output: the total cluster count, a per-category count for every
    non-empty category, and up to five example mappings per category.
    """
    total = len(get_all_clusters())
    by_type = get_clusters_by_type()

    heavy_rule = "=" * 70

    def heading(category):
        # "r_clusters" -> "R Clusters"
        return category.replace('_', ' ').title()

    # Empty categories are omitted from both sections.
    populated = [(name, group) for name, group in by_type.items() if group]

    print(heavy_rule)
    print("TELUGU CONSONANT CLUSTER LIBRARY STATISTICS")
    print(heavy_rule)
    print(f"\nTotal Clusters: {total}")
    print("\nBreakdown by Type:")
    print("-" * 70)

    for name, group in populated:
        print(f"{heading(name):20}: {len(group):4} clusters")

    # Show examples from each category
    print("\n" + heavy_rule)
    print("EXAMPLE CLUSTERS BY CATEGORY")
    print(heavy_rule)

    for name, group in populated:
        print(f"\n{heading(name)}:")
        for roman, telugu in list(group.items())[:5]:
            print(f" {roman:8} → {telugu}")

    print("\n" + heavy_rule)
376
+
377
+
378
if __name__ == "__main__":
    # Script entry point: print library statistics, then demonstrate
    # longest-match scanning on a few sample words.
    print_cluster_statistics()

    banner = "=" * 70
    print("\n" + banner)
    print("LONGEST MATCH TESTING")
    print(banner)

    for sample in ["krishna", "prapancha", "samskara", "street", "strong"]:
        print(f"\nWord: {sample}")
        cursor = 0
        while cursor < len(sample):
            telugu, consumed = match_longest_cluster(sample, cursor)
            if not telugu:
                # No cluster starts here; advance by a single character.
                print(f" Position {cursor}: no cluster match for '{sample[cursor]}'")
                cursor += 1
                continue
            print(f" Position {cursor}: matched '{sample[cursor:cursor+consumed]}' → {telugu}")
            cursor += consumed