telugu-language-tools 4.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telugu_language_tools-4.0.2.dist-info/METADATA +956 -0
- telugu_language_tools-4.0.2.dist-info/RECORD +14 -0
- telugu_language_tools-4.0.2.dist-info/WHEEL +5 -0
- telugu_language_tools-4.0.2.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-4.0.2.dist-info/top_level.txt +1 -0
- telugu_lib/__init__.py +197 -0
- telugu_lib/advanced.py +717 -0
- telugu_lib/cluster_generator.py +399 -0
- telugu_lib/context_rules.py +568 -0
- telugu_lib/enhanced_dictionary.py +516 -0
- telugu_lib/iso15919_mappings.py +430 -0
- telugu_lib/sentence_tools.py +214 -0
- telugu_lib/text_tools.py +108 -0
- telugu_lib/transliterate.py +972 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ISO 15919 Standard Compliant Telugu Mappings
|
|
3
|
+
=============================================
|
|
4
|
+
|
|
5
|
+
International standard for romanization of Indic scripts.
|
|
6
|
+
Supports both diacritic notation and ASCII alternatives.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from telugu_lib.iso15919_mappings import get_iso_consonants, get_iso_vowels
|
|
10
|
+
|
|
11
|
+
Reference: ISO 15919:2001 - Transliteration of Devanagari and related Indic scripts
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
# ============================================================================
|
|
15
|
+
# ISO 15919 CONSONANT MAPPINGS
|
|
16
|
+
# ============================================================================
|
|
17
|
+
|
|
18
|
+
def get_iso_consonants(mode="mixed"):
    """
    Build the romanization → Telugu consonant lookup table.

    Args:
        mode: "diacritic" returns only the base table (ISO 15919 diacritic
            keys together with the plain-letter and simplified spellings
            listed in it). Every other value, including "ascii" and the
            default "mixed", returns the base table overlaid with the
            ASCII alternatives.

    Returns:
        A freshly built dict of romanization → Telugu consonant letter.

    Note:
        The ASCII table is applied last, so in the merged result its
        "ch" entry (చ) replaces the base "ch" entry (ఛ).
    """

    # Base table: ISO 15919 spellings, grouped by place of articulation.
    base_table = {
        # కవర్గ (velar row)
        "k": "క",
        "kh": "ఖ",
        "g": "గ",
        "gh": "ఘ",
        "ṅ": "ఙ",   # velar nasal, rarely used

        # చవర్గ (palatal row) - ISO writes plain 'c' for the unaspirated stop
        "c": "చ",
        "ch": "ఛ",   # aspirated
        "j": "జ",
        "jh": "ఝ",
        "ñ": "ఞ",   # palatal nasal, rarely used

        # టవర్గ (retroflex row) - marked with an underdot
        "ṭ": "ట",
        "ṭh": "ఠ",
        "ḍ": "డ",
        "ḍh": "ఢ",
        "ṇ": "ణ",   # retroflex nasal

        # తవర్గ (dental row)
        "t": "త",
        "th": "థ",
        "d": "ద",
        "dh": "ధ",
        "n": "న",   # dental nasal

        # పవర్గ (labial row)
        "p": "ప",
        "ph": "ఫ",
        "b": "బ",
        "bh": "భ",
        "m": "మ",

        # అంతస్థలు (sonorants)
        "y": "య",
        "r": "ర",
        "l": "ల",
        "v": "వ",
        "w": "వ",   # second spelling of va

        # ఊష్మలు (sibilants)
        "ś": "శ",   # palatal
        "ṣ": "ష",   # retroflex
        "s": "స",   # dental

        # Glottal
        "h": "హ",

        # Extra consonants
        "ḷ": "ళ",   # retroflex lateral
        "ḻ": "ఴ",   # Tamil retroflex, obsolete in Telugu
        "ṟ": "ఱ",   # alveolar trill, archaic

        # Simplified spellings seen in everyday romanization
        "sha": "శ",   # stands in for ś
        "Sha": "ష",   # stands in for ṣ (capital S)
        "za": "జ",   # z is commonly written as ja
        "f": "ఫ",   # f is written as pha
    }

    # ASCII overlay: a capital letter marks a retroflex.
    ascii_overlay = {
        # Retroflex stops, nasal, lateral and trill
        "T": "ట",   # ṭ
        "Th": "ఠ",   # ṭh
        "D": "డ",   # ḍ
        "Dh": "ఢ",   # ḍh
        "N": "ణ",   # ṇ
        "L": "ళ",   # ḷ
        "R": "ఱ",   # ṟ (rare)

        # Sibilants
        "S": "ష",   # ṣ
        "sh": "శ",   # lowercase = palatal sibilant

        # Palatals
        "ch": "చ",   # everyday 'ch' means ca
        "chh": "ఛ",   # aspirated

        # Nasals
        "ng": "ఙ",   # ṅ
        "ny": "ఞ",   # ñ
    }

    if mode == "diacritic":
        return base_table

    # "ascii" and "mixed" (and any other value) share the merged table.
    return {**base_table, **ascii_overlay}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ============================================================================
|
|
128
|
+
# ISO 15919 VOWEL MAPPINGS
|
|
129
|
+
# ============================================================================
|
|
130
|
+
|
|
131
|
+
def get_iso_vowels(mode="mixed"):
    """
    Build the romanization → Telugu independent-vowel lookup table.

    Args:
        mode: "diacritic" returns only the base (ISO 15919 diacritic)
            table. Every other value, including "ascii" and the default
            "mixed", returns the base table overlaid with the ASCII
            alternatives.

    Returns:
        A freshly built dict of romanization → Telugu vowel letter
        (anusvara, visarga and candrabindu signs are included).
    """

    # Base table: ISO 15919 spellings.
    base_table = {
        # Short
        "a": "అ",
        "i": "ఇ",
        "u": "ఉ",
        "ṛ": "ఋ",   # vocalic r
        "ḷ": "ఌ",   # vocalic l, very rare

        # Long (macron)
        "ā": "ఆ",
        "ī": "ఈ",
        "ū": "ఊ",
        "ṝ": "ౠ",   # long vocalic r
        "ḹ": "ౡ",   # long vocalic l, very rare

        # e: short then long
        "e": "ఎ",
        "ē": "ఏ",

        # o: short then long
        "o": "ఒ",
        "ō": "ఓ",

        # Diphthongs
        "ai": "ఐ",
        "au": "ఔ",

        # Signs
        "ṁ": "ం",   # anusvara
        "ḥ": "ః",   # visarga
        "m̐": "ఁ",   # candrabindu, rare
    }

    # ASCII overlay: a capital or a doubled letter marks a long vowel.
    ascii_overlay = {
        "A": "ఆ",   # ā
        "aa": "ఆ",
        "I": "ఈ",   # ī
        "ii": "ఈ",
        "U": "ఊ",   # ū
        "uu": "ఊ",
        "E": "ఏ",   # ē
        "ee": "ఏ",
        "O": "ఓ",   # ō
        "oo": "ఓ",

        # Vocalic r
        "R": "ఋ",   # ṛ
        "ri": "ఋ",
        "RR": "ౠ",   # ṝ
        "rii": "ౠ",

        # Vocalic l (very rare)
        "lR": "ఌ",   # ḷ
        "li": "ఌ",

        # Signs
        "M": "ం",   # ṁ
        "am": "ం",
        "H": "ః",   # ḥ
        "ah": "ః",
    }

    if mode == "diacritic":
        return base_table

    # "ascii" and "mixed" (and any other value) share the merged table.
    return {**base_table, **ascii_overlay}
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ============================================================================
|
|
217
|
+
# ISO 15919 MATRA (VOWEL SIGN) MAPPINGS
|
|
218
|
+
# ============================================================================
|
|
219
|
+
|
|
220
|
+
def get_iso_matras(mode="mixed"):
    """
    Build the romanization → Telugu matra (dependent vowel sign) table.

    A matra is the vowel sign written on a consonant; the inherent 'a'
    has no sign and maps to the empty string.

    Args:
        mode: "diacritic" returns only the base (ISO 15919 diacritic)
            table. Every other value, including "ascii" and the default
            "mixed", returns the base table overlaid with the ASCII
            alternatives.

    Returns:
        A freshly built dict of romanization → Telugu matra.
    """

    base_table = {
        # Inherent vowel: nothing is written
        "a": "",

        # Short signs
        "i": "ి",
        "u": "ు",
        "ṛ": "ృ",   # vocalic r
        "ḷ": "ౢ",   # vocalic l, rare

        # Long signs
        "ā": "ా",
        "ī": "ీ",
        "ū": "ూ",
        "ṝ": "ౄ",   # long vocalic r
        "ḹ": "ౣ",   # long vocalic l, rare

        # e: short then long
        "e": "ె",
        "ē": "ే",

        # o: short then long
        "o": "ొ",
        "ō": "ో",

        # Diphthongs
        "ai": "ై",
        "au": "ౌ",

        # Signs (identical to the standalone forms)
        "ṁ": "ం",   # anusvara
        "ḥ": "ః",   # visarga
    }

    ascii_overlay = {
        # Long vowels: capital or doubled letter
        "A": "ా",   # ā
        "aa": "ా",
        "I": "ీ",   # ī
        "ii": "ీ",
        "U": "ూ",   # ū
        "uu": "ూ",
        "E": "ే",   # ē
        "ee": "ే",
        "O": "ో",   # ō
        "oo": "ో",

        # Vocalic r
        "R": "ృ",   # ṛ
        "ri": "ృ",
        "RR": "ౄ",   # ṝ
        "rii": "ౄ",

        # Signs
        "M": "ం",   # ṁ
        "am": "ం",
        "H": "ః",   # ḥ
        "ah": "ః",
    }

    if mode == "diacritic":
        return base_table

    # "ascii" and "mixed" (and any other value) share the merged table.
    return {**base_table, **ascii_overlay}
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# ============================================================================
|
|
299
|
+
# HELPER FUNCTIONS
|
|
300
|
+
# ============================================================================
|
|
301
|
+
|
|
302
|
+
def normalize_input(text):
    """
    Rewrite common romanization variants into the spellings used by the
    ISO 15919 tables.

    The substitutions are applied one after another, in the order listed,
    with plain ``str.replace`` (every occurrence, case-sensitive).

    Args:
        text: Romanized input string.

    Returns:
        The string with all substitutions applied.
    """
    # (variant, standard spelling) - order matters because each pass
    # operates on the output of the previous one.
    substitutions = (
        ("Ch", "ch"),   # capitalised Ch
        ("zh", "j"),
        ("Z", "j"),
        ("ph", "ph"),   # already standard; kept as a no-op entry
        ("f", "ph"),    # Telugu has no native f
        ("q", "k"),     # Telugu has no native q
        ("x", "ks"),    # x becomes the ks cluster
        # ASCII capitals for retroflexes are deliberately left untouched;
        # they are resolved by the consonant table instead.
    )

    normalized = text
    for variant, standard in substitutions:
        normalized = normalized.replace(variant, standard)
    return normalized
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def get_articulation_class(consonant):
    """
    Look up the place of articulation of a romanized consonant (used for
    nasal assimilation).

    Args:
        consonant: Romanized consonant, in diacritic or ASCII spelling.

    Returns:
        "velar", "palatal", "retroflex", "dental" or "labial"; None when
        the consonant is in none of the groups.
    """
    # Groups are checked top to bottom; the first hit wins.
    articulation_groups = (
        ("velar", ("k", "kh", "g", "gh", "ṅ", "ng")),
        ("palatal", ("c", "ch", "chh", "j", "jh", "ñ", "ny", "ś", "sh")),
        ("retroflex", ("ṭ", "ṭh", "ḍ", "ḍh", "ṇ", "ṣ",
                       "T", "Th", "D", "Dh", "N", "S", "ḷ", "L")),
        ("dental", ("t", "th", "d", "dh", "n", "s")),
        ("labial", ("p", "ph", "b", "bh", "m", "v", "w")),
    )

    for label, members in articulation_groups:
        if consonant in members:
            return label
    return None
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def is_retroflex(char):
    """Return True when *char* is one of the listed retroflex consonants.

    Accepted spellings are the ISO 15919 diacritic forms, their ASCII
    capital alternatives, and the Telugu letters themselves.
    """
    diacritic_forms = ("ṭ", "ṭh", "ḍ", "ḍh", "ṇ", "ṣ", "ḷ", "ṟ")
    ascii_forms = ("T", "Th", "D", "Dh", "N", "S", "L", "R")
    telugu_letters = ("ట", "ఠ", "డ", "ఢ", "ణ", "ష", "ళ", "ఱ")
    return char in diacritic_forms + ascii_forms + telugu_letters
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def is_dental(char):
    """Return True when *char* is one of the listed dental consonants.

    Accepted spellings are the romanized forms and the Telugu letters
    themselves.
    """
    roman_forms = ("t", "th", "d", "dh", "n", "s")
    telugu_letters = ("త", "థ", "ద", "ధ", "న", "స")
    return char in roman_forms + telugu_letters
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# ============================================================================
|
|
372
|
+
# VALIDATION AND TESTING
|
|
373
|
+
# ============================================================================
|
|
374
|
+
|
|
375
|
+
def validate_iso_mappings():
    """Print a summary report of the "mixed" mapping tables.

    The report lists the size of the consonant, vowel and matra tables,
    warns about romanization keys that occur in both the consonant and
    the vowel table, and counts the distinct Telugu characters those two
    tables produce.

    Returns:
        Always True; findings are only printed.
    """
    consonants = get_iso_consonants("mixed")
    vowels = get_iso_vowels("mixed")
    matras = get_iso_matras("mixed")

    sizes = (len(consonants), len(vowels), len(matras))

    print("ISO 15919 Mappings Validation")
    print("=" * 50)
    print(f"Consonants: {sizes[0]} mappings")
    print(f"Vowels: {sizes[1]} mappings")
    print(f"Matras: {sizes[2]} mappings")
    print(f"Total: {sizes[0] + sizes[1] + sizes[2]} mappings")

    # A key is a duplicate when both the consonant and the vowel table
    # define it; count occurrences once, then keep the repeated keys.
    all_roman = [*consonants.keys(), *vowels.keys()]
    occurrences = {}
    for key in all_roman:
        occurrences[key] = occurrences.get(key, 0) + 1
    duplicates = [key for key in all_roman if occurrences[key] > 1]

    if not duplicates:
        print("\n✅ No duplicate keys")
    else:
        print(f"\n⚠️ Warning: Duplicate roman keys: {set(duplicates)}")

    # Distinct Telugu characters reachable through either table.
    telugu_chars = set(consonants.values()).union(vowels.values())
    print(f"\n✅ Covers {len(telugu_chars)} unique Telugu characters")

    return True
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
if __name__ == "__main__":
    # Start with the summary report.
    validate_iso_mappings()

    # Then show a handful of lookups.
    print("\n" + "=" * 50)
    print("Example Usage:")
    print("=" * 50)

    consonant_table = get_iso_consonants("mixed")
    vowel_table = get_iso_vowels("mixed")

    # (romanized key, what it demonstrates)
    samples = (
        ("k", "Velar"),
        ("ṭ", "Retroflex (diacritic)"),
        ("T", "Retroflex (ASCII)"),
        ("ṅ", "Velar nasal (diacritic)"),
        ("ng", "Velar nasal (ASCII)"),
        ("ā", "Long vowel (diacritic)"),
        ("A", "Long vowel (ASCII)"),
        ("aa", "Long vowel (double)"),
    )

    for roman, description in samples:
        # The consonant table takes precedence; fall back to the vowel
        # table and finally to a placeholder.
        telugu = consonant_table.get(roman) or vowel_table.get(roman) or "N/A"
        print(f"{roman:4} → {telugu:2} ({description})")
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sentence similarity and correction tools for Telugu text.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to find similar Telugu sentences
|
|
5
|
+
and correct grammar/spelling using SentenceTransformers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from sentence_transformers import SentenceTransformer, util
|
|
10
|
+
import torch
|
|
11
|
+
SENTENCE_TRANSFORMERS_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
SENTENCE_TRANSFORMERS_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
# Model name and cache slot; the lightweight multilingual model is loaded lazily on first use
|
|
16
|
+
_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
|
|
17
|
+
_model = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_model():
    """
    Return the shared SentenceTransformer instance, creating it on the
    first call and reusing the cached object afterwards.

    Raises:
        ImportError: If no model is cached yet and sentence-transformers
            could not be imported.
    """
    global _model

    # Fast path: the model has already been created.
    if _model is not None:
        return _model

    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is not installed. "
            "Please install it with: pip install sentence-transformers"
        )

    _model = SentenceTransformer(_MODEL_NAME)
    return _model
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def find_similar_sentence(query, reference_list, top_k=1, min_score=0.5):
    """
    Pick the reference sentence(s) closest to the query by cosine
    similarity of their sentence embeddings.

    Args:
        query (str): The query sentence in Telugu
        reference_list (list): List of Telugu reference sentences
        top_k (int): How many of the best matches to consider (default: 1)
        min_score (float): Similarity threshold applied to the top_k
            candidates (default: 0.5)

    Returns:
        tuple: (best_sentence, similarity_score) if top_k == 1
        list: [(sentence, score), ...] otherwise

        If none of the top_k candidates reaches min_score, the single
        best match is returned anyway (as a tuple for top_k == 1, else as
        a one-element list). For an empty reference_list the result is
        (None, 0.0) for top_k == 1, else [].

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> refs = ["వర్షం పడుతోంది", "ఇప్పుడు వాన వస్తోంది", "నేను తినడానికి వెళ్తున్నాను"]
        >>> sentence, score = find_similar_sentence("వర్షం కురుస్తోంది", refs)
        >>> print(sentence, score)
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    want_single = top_k == 1

    if not reference_list:
        return (None, 0.0) if want_single else []

    encoder = _get_model()

    # Embed the query and the references, then score each reference.
    query_vec = encoder.encode(query, convert_to_tensor=True)
    reference_vecs = encoder.encode(reference_list, convert_to_tensor=True)
    scores = util.cos_sim(query_vec, reference_vecs)[0]

    # Best-first candidates, cut to top_k, then filtered by the threshold.
    candidates = sorted(zip(scores, reference_list), reverse=True)[:top_k]
    matches = [
        (sentence, float(score))
        for score, sentence in candidates
        if float(score) >= min_score
    ]

    if not matches:
        # Nothing cleared the threshold: fall back to the overall best.
        winner = torch.argmax(scores).item()
        fallback = (reference_list[winner], float(scores[winner]))
        return fallback if want_single else [fallback]

    if want_single:
        best_sentence, best_score = matches[0]
        return (best_sentence, best_score)
    return matches
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def correct_sentence(query, references, min_score=0.5):
    """
    Replace a possibly incorrect Telugu sentence with its closest
    reference sentence.

    This delegates to find_similar_sentence with top_k=1.

    Args:
        query (str): The potentially incorrect Telugu sentence
        references (list): Correct Telugu sentences to match against
        min_score (float): Similarity threshold passed through unchanged

    Returns:
        tuple: (corrected_sentence, similarity_score)

    Example:
        >>> refs = ["నేను ఇంటికి వెళ్తున్నాను", "వర్షం పడుతోంది", "ఇది మంచి పుస్తకం"]
        >>> corrected, score = correct_sentence("వర్షం పడుతునది", refs)
        >>> print(corrected, score)
    """
    best_match = find_similar_sentence(
        query,
        references,
        top_k=1,
        min_score=min_score,
    )
    return best_match
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def rank_sentences(query, reference_list, min_score=0.3):
    """
    Score every reference sentence against the query and return those
    that reach the threshold, best first.

    Args:
        query (str): The query sentence in Telugu
        reference_list (list): List of Telugu reference sentences
        min_score (float): Lowest similarity score kept in the result

    Returns:
        list: [(sentence, score), ...] sorted by score, highest first;
        [] when reference_list is empty.

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> refs = ["వర్షం పడుతోంది", "ఇప్పుడు వాన వస్తోంది", "నేను తినడానికి వెళ్తున్నాను"]
        >>> ranked = rank_sentences("వర్షం కురుస్తోంది", refs)
        >>> for sentence, score in ranked:
        ...     print(f"{sentence}: {score:.3f}")
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    if not reference_list:
        return []

    encoder = _get_model()

    # Embed the query and the references, then score each reference.
    query_vec = encoder.encode(query, convert_to_tensor=True)
    reference_vecs = encoder.encode(reference_list, convert_to_tensor=True)
    scores = util.cos_sim(query_vec, reference_vecs)[0]

    # Keep only the references that reach the threshold.
    kept = []
    for sentence, score in zip(reference_list, scores):
        if float(score) >= min_score:
            kept.append((sentence, float(score)))

    # Highest score first; the sort is stable, so ties keep input order.
    return sorted(kept, key=lambda pair: pair[1], reverse=True)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def batch_similarity(queries, reference_list, batch_size=32):
    """
    Find the best-matching reference sentence for each of several
    queries.

    Args:
        queries (list): List of query sentences
        reference_list (list): List of reference sentences
        batch_size (int): Batch size handed to the encoder (default: 32)

    Returns:
        list: One (query, best_match, best_score) tuple per query, in
        query order; [] when either input list is empty.

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> queries = ["వర్షం కురుస్తోంది", "నేను వస్తున్నాను"]
        >>> refs = ["వర్షం పడుతోంది", "నేను ఇంటికి వెళ్తున్నాను"]
        >>> results = batch_similarity(queries, refs)
        >>> for query, match, score in results:
        ...     print(f"{query} -> {match} ({score:.3f})")
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    if not (queries and reference_list):
        return []

    encoder = _get_model()

    # Embed both sides, then build the query-by-reference score matrix.
    query_vecs = encoder.encode(
        queries, convert_to_tensor=True, batch_size=batch_size
    )
    reference_vecs = encoder.encode(
        reference_list, convert_to_tensor=True, batch_size=batch_size
    )
    similarity_matrix = util.cos_sim(query_vecs, reference_vecs)

    best_matches = []
    for row, query in enumerate(queries):
        row_scores = similarity_matrix[row]
        # Index of the highest-scoring reference for this query.
        winner = torch.argmax(row_scores).item()
        best_matches.append(
            (query, reference_list[winner], float(row_scores[winner]))
        )
    return best_matches
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def is_sentence_transformers_available():
    """
    Report whether the optional sentence-transformers dependency was
    imported successfully when this module was loaded.

    Returns:
        bool: True if the import succeeded, False otherwise
    """
    return SENTENCE_TRANSFORMERS_AVAILABLE
|