telugu-language-tools 4.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ """
2
+ ISO 15919 Standard Compliant Telugu Mappings
3
+ =============================================
4
+
5
+ International standard for romanization of Indic scripts.
6
+ Supports both diacritic notation and ASCII alternatives.
7
+
8
+ Usage:
9
+ from telugu_lib.iso15919_mappings import get_iso_consonants, get_iso_vowels
10
+
11
+ Reference: ISO 15919:2001 - Transliteration of Devanagari and related Indic scripts
12
+ """
13
+
14
+ # ============================================================================
15
+ # ISO 15919 CONSONANT MAPPINGS
16
+ # ============================================================================
17
+
18
def get_iso_consonants(mode="mixed"):
    """
    Get ISO 15919 style consonant mappings (romanization → Telugu).

    Args:
        mode: Which romanization keys to include.
            "diacritic" - the base table only: diacritic letters, the plain
                          letters they share with ASCII, and a few
                          simplified spellings ("sha", "Sha", "za", "f").
            "ascii"     - only keys made purely of ASCII characters
                          (capital letters stand for retroflexes).
            "mixed"     - base table merged with the ASCII alternatives
                          (default). Any other value behaves like "mixed".

    Returns:
        A new dictionary of romanization → Telugu consonant mappings.

    Note:
        Where both tables define a key, the ASCII table wins in "ascii" and
        "mixed" modes. This affects "ch", which is the unaspirated "చ" there
        but the aspirated "ఛ" in "diacritic" mode.
    """

    # Base mappings with diacritics (ISO 15919 standard)
    diacritic_consonants = {
        # Velars (ka-varga)
        "k": "క",      # ka
        "kh": "ఖ",     # kha
        "g": "గ",      # ga
        "gh": "ఘ",     # gha
        "ṅ": "ఙ",      # ṅa (velar nasal, rare)

        # Palatals (ca-varga)
        "c": "చ",      # ca (ISO uses 'c', not 'ch')
        "ch": "ఛ",     # cha (aspirated)
        "j": "జ",      # ja
        "jh": "ఝ",     # jha
        "ñ": "ఞ",      # ña (palatal nasal, rare)

        # Retroflexes (ṭa-varga) - with underdots
        "ṭ": "ట",      # ṭa
        "ṭh": "ఠ",     # ṭha
        "ḍ": "డ",      # ḍa
        "ḍh": "ఢ",     # ḍha
        "ṇ": "ణ",      # ṇa (retroflex nasal)

        # Dentals (ta-varga)
        "t": "త",      # ta
        "th": "థ",     # tha
        "d": "ద",      # da
        "dh": "ధ",     # dha
        "n": "న",      # na (dental nasal)

        # Labials (pa-varga)
        "p": "ప",      # pa
        "ph": "ఫ",     # pha
        "b": "బ",      # ba
        "bh": "భ",     # bha
        "m": "మ",      # ma

        # Sonorants
        "y": "య",      # ya
        "r": "ర",      # ra
        "l": "ల",      # la
        "v": "వ",      # va
        "w": "వ",      # wa (alternative for v)

        # Sibilants
        "ś": "శ",      # śa (palatal sibilant)
        "ṣ": "ష",      # ṣa (retroflex sibilant)
        "s": "స",      # sa (dental sibilant)

        # Glottal
        "h": "హ",      # ha

        # Additional consonants
        "ḷ": "ళ",      # ḷa (retroflex lateral)
        "ḻ": "ఴ",      # ḻa (Tamil retroflex, obsolete in Telugu)
        "ṟ": "ఱ",      # ṟa (alveolar trill, archaic)

        # Simplified alternatives (common usage)
        "sha": "శ",    # Alternative for ś
        "Sha": "ష",    # Alternative for ṣ (capital S)
        "za": "జ",     # z often mapped to ja
        "f": "ఫ",      # f → pha
    }

    # ASCII alternatives (using capitals for retroflexes)
    ascii_consonants = {
        # Retroflexes (capital = retroflex)
        "T": "ట",      # ASCII for ṭ
        "Th": "ఠ",     # ASCII for ṭh
        "D": "డ",      # ASCII for ḍ
        "Dh": "ఢ",     # ASCII for ḍh
        "N": "ణ",      # ASCII for ṇ (retroflex nasal)
        "L": "ళ",      # ASCII for ḷ (retroflex lateral)
        "R": "ఱ",      # ASCII for ṟ (rare)

        # Sibilants
        "S": "ష",      # ASCII for ṣ (retroflex sibilant)
        "sh": "శ",     # Palatal sibilant (lowercase)

        # Palatals
        "ch": "చ",     # Common ch → ca (overrides the ISO "ch" above)
        "chh": "ఛ",    # Aspirated

        # Nasals
        "ng": "ఙ",     # ASCII for ṅ (velar nasal)
        "ny": "ఞ",     # ASCII for ñ (palatal nasal)
    }

    if mode == "diacritic":
        return diacritic_consonants

    # Later entries win, so the ASCII table overrides shared keys.
    combined = {**diacritic_consonants, **ascii_consonants}

    if mode == "ascii":
        # "ascii" is documented as ASCII-only input, so drop every key that
        # needs a diacritic while keeping plain letters such as "k" or "t".
        return {
            roman: telugu
            for roman, telugu in combined.items()
            if all(ord(ch) < 128 for ch in roman)
        }

    # "mixed" (default) and any unrecognised mode.
    return combined
125
+
126
+
127
+ # ============================================================================
128
+ # ISO 15919 VOWEL MAPPINGS
129
+ # ============================================================================
130
+
131
def get_iso_vowels(mode="mixed"):
    """
    Get ISO 15919 style independent-vowel mappings (romanization → Telugu).

    Args:
        mode: Which romanization keys to include.
            "diacritic" - the base table only (macron/underdot letters plus
                          the plain short vowels and diphthongs).
            "ascii"     - only keys made purely of ASCII characters
                          (capital or doubled letter = long vowel).
            "mixed"     - base table merged with the ASCII alternatives
                          (default). Any other value behaves like "mixed".

    Returns:
        A new dictionary of romanization → Telugu vowel mappings. The
        anusvara, visarga and candrabindu signs are included as well.
    """

    # Base vowels with diacritics (ISO 15919 standard)
    diacritic_vowels = {
        # Short vowels
        "a": "అ",      # a (short)
        "i": "ఇ",      # i (short)
        "u": "ఉ",      # u (short)
        "ṛ": "ఋ",      # ṛ (vocalic r, short)
        "ḷ": "ఌ",      # ḷ (vocalic l, short, very rare)

        # Long vowels (with macrons)
        "ā": "ఆ",      # ā (long)
        "ī": "ఈ",      # ī (long)
        "ū": "ఊ",      # ū (long)
        "ṝ": "ౠ",      # ṝ (vocalic r, long)
        "ḹ": "ౡ",      # ḹ (vocalic l, long, very rare)

        # E vowels
        "e": "ఎ",      # e (short)
        "ē": "ఏ",      # ē (long)

        # O vowels
        "o": "ఒ",      # o (short)
        "ō": "ఓ",      # ō (long)

        # Diphthongs
        "ai": "ఐ",     # ai
        "au": "ఔ",     # au

        # Special markers
        "ṁ": "ం",      # ṁ (anusvara)
        "ḥ": "ః",      # ḥ (visarga)
        "m̐": "ఁ",      # candrabindu (rare)
    }

    # ASCII alternatives
    ascii_vowels = {
        # Long vowels (capital = long, or double letter)
        "A": "ఆ",      # ASCII for ā
        "aa": "ఆ",     # Alternative for ā
        "I": "ఈ",      # ASCII for ī
        "ii": "ఈ",     # Alternative for ī
        "U": "ఊ",      # ASCII for ū
        "uu": "ఊ",     # Alternative for ū
        "E": "ఏ",      # ASCII for ē (long e)
        "ee": "ఏ",     # Alternative for ē
        "O": "ఓ",      # ASCII for ō (long o)
        "oo": "ఓ",     # Alternative for ō

        # Vocalic consonants
        "R": "ఋ",      # ASCII for ṛ
        "ri": "ఋ",     # Common alternative
        "RR": "ౠ",     # ASCII for ṝ (long)
        "rii": "ౠ",    # Common alternative

        # Vocalic l (very rare)
        "lR": "ఌ",     # ASCII for ḷ
        "li": "ఌ",     # Common alternative

        # Special markers
        "M": "ం",      # ASCII for ṁ (anusvara)
        "am": "ం",     # Common representation
        "H": "ః",      # ASCII for ḥ (visarga)
        "ah": "ః",     # Common representation
    }

    if mode == "diacritic":
        return diacritic_vowels

    combined = {**diacritic_vowels, **ascii_vowels}

    if mode == "ascii":
        # "ascii" is documented as ASCII-only input, so drop every key that
        # needs a diacritic while keeping plain vowels such as "a" or "ai".
        return {
            roman: telugu
            for roman, telugu in combined.items()
            if all(ord(ch) < 128 for ch in roman)
        }

    # "mixed" (default) and any unrecognised mode.
    return combined
214
+
215
+
216
+ # ============================================================================
217
+ # ISO 15919 MATRA (VOWEL SIGN) MAPPINGS
218
+ # ============================================================================
219
+
220
def get_iso_matras(mode="mixed"):
    """
    Get ISO 15919 style matra (dependent vowel sign) mappings.

    Matras are vowel signs that attach to consonants.

    Args:
        mode: Which romanization keys to include.
            "diacritic" - the base table only.
            "ascii"     - only keys made purely of ASCII characters
                          (capital or doubled letter = long vowel).
            "mixed"     - base table merged with the ASCII alternatives
                          (default). Any other value behaves like "mixed".

    Returns:
        A new dictionary of romanization → Telugu matra mappings. The
        inherent vowel "a" maps to the empty string because it has no sign.
    """

    diacritic_matras = {
        # No marking for inherent 'a'
        "a": "",       # Inherent vowel (no mark)

        # Short vowel signs
        "i": "ి",      # i-matra
        "u": "ు",      # u-matra
        "ṛ": "ృ",      # ṛ-matra (vocalic r)
        "ḷ": "ౢ",      # ḷ-matra (vocalic l, rare)

        # Long vowel signs
        "ā": "ా",      # ā-matra
        "ī": "ీ",      # ī-matra
        "ū": "ూ",      # ū-matra
        "ṝ": "ౄ",      # ṝ-matra (long vocalic r)
        "ḹ": "ౣ",      # ḹ-matra (long vocalic l, rare)

        # E vowel signs
        "e": "ె",      # e-matra (short)
        "ē": "ే",      # ē-matra (long)

        # O vowel signs
        "o": "ొ",      # o-matra (short)
        "ō": "ో",      # ō-matra (long)

        # Diphthong signs
        "ai": "ై",     # ai-matra
        "au": "ౌ",     # au-matra

        # Special markers (same as standalone)
        "ṁ": "ం",      # anusvara
        "ḥ": "ః",      # visarga
    }

    ascii_matras = {
        # ASCII alternatives for long vowels
        "A": "ా",      # ASCII for ā
        "aa": "ా",     # Common alternative
        "I": "ీ",      # ASCII for ī
        "ii": "ీ",     # Common alternative
        "U": "ూ",      # ASCII for ū
        "uu": "ూ",     # Common alternative
        "E": "ే",      # ASCII for ē
        "ee": "ే",     # Common alternative
        "O": "ో",      # ASCII for ō
        "oo": "ో",     # Common alternative

        # Vocalic consonants
        "R": "ృ",      # ASCII for ṛ
        "ri": "ృ",     # Common alternative
        "RR": "ౄ",     # ASCII for ṝ
        "rii": "ౄ",    # Common alternative

        # Special markers
        "M": "ం",      # ASCII for ṁ
        "am": "ం",     # Common representation
        "H": "ః",      # ASCII for ḥ
        "ah": "ః",     # Common representation
    }

    if mode == "diacritic":
        return diacritic_matras

    combined = {**diacritic_matras, **ascii_matras}

    if mode == "ascii":
        # "ascii" is documented as ASCII-only input, so drop every key that
        # needs a diacritic while keeping plain vowels such as "i" or "au".
        return {
            roman: sign
            for roman, sign in combined.items()
            if all(ord(ch) < 128 for ch in roman)
        }

    # "mixed" (default) and any unrecognised mode.
    return combined
296
+
297
+
298
+ # ============================================================================
299
+ # HELPER FUNCTIONS
300
+ # ============================================================================
301
+
302
def normalize_input(text):
    """
    Rewrite common romanization variants into the spellings the mapping
    tables expect.

    Args:
        text: Romanized input string.

    Returns:
        A copy of ``text`` with every listed variant replaced. Substitutions
        are applied one after another, in the order shown below, over the
        whole string.
    """
    # (variant, replacement) pairs, applied top to bottom.
    # ASCII capitals used for retroflexes are deliberately left untouched;
    # they are handled by get_iso_consonants.
    substitutions = (
        ("Ch", "ch"),   # Capital Ch → ch
        ("zh", "j"),    # zh → j
        ("Z", "j"),     # Z → j
        ("ph", "ph"),   # Already correct (identity, kept for completeness)
        ("f", "ph"),    # f → ph (no native f in Telugu)
        ("q", "k"),     # q → k (no native q)
        ("x", "ks"),    # x → ks cluster
    )

    normalized = text
    for variant, standard in substitutions:
        normalized = normalized.replace(variant, standard)
    return normalized
327
+
328
+
329
def get_articulation_class(consonant):
    """
    Look up the place of articulation of a romanized consonant (used for
    nasal assimilation).

    Args:
        consonant: Romanized consonant, in diacritic or ASCII spelling.

    Returns:
        "velar", "palatal", "retroflex", "dental" or "labial"; None when the
        consonant is not listed in any class.
    """
    # Checked top to bottom; the first class containing the consonant wins.
    classes = (
        ("velar", ("k", "kh", "g", "gh", "ṅ", "ng")),
        ("palatal", ("c", "ch", "chh", "j", "jh", "ñ", "ny", "ś", "sh")),
        ("retroflex", ("ṭ", "ṭh", "ḍ", "ḍh", "ṇ", "ṣ",
                       "T", "Th", "D", "Dh", "N", "S", "ḷ", "L")),
        ("dental", ("t", "th", "d", "dh", "n", "s")),
        ("labial", ("p", "ph", "b", "bh", "m", "v", "w")),
    )

    for label, members in classes:
        if consonant in members:
            return label
    return None
354
+
355
+
356
def is_retroflex(char):
    """
    Tell whether ``char`` is one of the listed retroflex consonants.

    Diacritic romanizations, their ASCII capital spellings and the Telugu
    letters themselves are all recognised.
    """
    diacritic_forms = ("ṭ", "ṭh", "ḍ", "ḍh", "ṇ", "ṣ", "ḷ", "ṟ")
    ascii_forms = ("T", "Th", "D", "Dh", "N", "S", "L", "R")
    telugu_forms = ("ట", "ఠ", "డ", "ఢ", "ణ", "ష", "ళ", "ఱ")
    return char in diacritic_forms + ascii_forms + telugu_forms
362
+
363
+
364
def is_dental(char):
    """
    Tell whether ``char`` is one of the listed dental consonants.

    Both the romanized spellings and the Telugu letters are recognised.
    """
    roman_forms = ("t", "th", "d", "dh", "n", "s")
    telugu_forms = ("త", "థ", "ద", "ధ", "న", "స")
    return char in roman_forms + telugu_forms
369
+
370
+
371
+ # ============================================================================
372
+ # VALIDATION AND TESTING
373
+ # ============================================================================
374
+
375
def validate_iso_mappings():
    """
    Print a summary of the "mixed" mapping tables and report key clashes.

    The report shows the size of the consonant, vowel and matra tables, the
    romanization keys that appear in both the consonant and the vowel table,
    and the number of distinct Telugu characters those two tables produce.
    Matras are counted but not included in the clash check.

    Returns:
        True, always. Clashes are only reported on stdout.
    """
    consonants = get_iso_consonants("mixed")
    vowels = get_iso_vowels("mixed")
    matras = get_iso_matras("mixed")

    print("ISO 15919 Mappings Validation")
    print("=" * 50)
    print(f"Consonants: {len(consonants)} mappings")
    print(f"Vowels: {len(vowels)} mappings")
    print(f"Matras: {len(matras)} mappings")
    print(f"Total: {len(consonants) + len(vowels) + len(matras)} mappings")

    # Keys are unique within each dict, so a duplicate can only be a key the
    # two tables share. Intersecting the key views finds them in linear time;
    # sorting keeps the report stable between runs.
    shared_keys = sorted(consonants.keys() & vowels.keys())
    if shared_keys:
        listing = "{" + ", ".join(repr(key) for key in shared_keys) + "}"
        print(f"\n⚠️ Warning: Duplicate roman keys: {listing}")
    else:
        print("\n✅ No duplicate keys")

    # Check Telugu coverage
    telugu_chars = set(consonants.values()) | set(vowels.values())
    print(f"\n✅ Covers {len(telugu_chars)} unique Telugu characters")

    return True
401
+
402
+
403
if __name__ == "__main__":
    # Print the table summary first.
    validate_iso_mappings()

    # Then show a handful of sample lookups.
    rule = "=" * 50
    print("\n" + rule)
    print("Example Usage:")
    print(rule)

    consonants = get_iso_consonants("mixed")
    vowels = get_iso_vowels("mixed")

    examples = (
        ("k", "Velar"),
        ("ṭ", "Retroflex (diacritic)"),
        ("T", "Retroflex (ASCII)"),
        ("ṅ", "Velar nasal (diacritic)"),
        ("ng", "Velar nasal (ASCII)"),
        ("ā", "Long vowel (diacritic)"),
        ("A", "Long vowel (ASCII)"),
        ("aa", "Long vowel (double)"),
    )

    for roman, description in examples:
        # The consonant table is consulted first, then the vowel table;
        # "N/A" is shown when neither has a non-empty entry.
        telugu = consonants.get(roman) or vowels.get(roman) or "N/A"
        print(f"{roman:4} → {telugu:2} ({description})")
@@ -0,0 +1,214 @@
1
+ """
2
+ Sentence similarity and correction tools for Telugu text.
3
+
4
+ This module provides functionality to find similar Telugu sentences
5
+ and correct grammar/spelling using SentenceTransformers.
6
+ """
7
+
8
+ try:
9
+ from sentence_transformers import SentenceTransformer, util
10
+ import torch
11
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
12
+ except ImportError:
13
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
14
+
15
+ # Preload the model (lightweight multilingual model)
16
+ _MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
17
+ _model = None
18
+
19
+
20
def _get_model():
    """
    Return the shared SentenceTransformer instance, creating it on first use.

    Returns:
        The model cached in the module-level ``_model`` variable.

    Raises:
        ImportError: If no model is cached yet and sentence-transformers
            could not be imported.
    """
    global _model

    # Fast path: already loaded by an earlier call.
    if _model is not None:
        return _model

    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is not installed. "
            "Please install it with: pip install sentence-transformers"
        )

    _model = SentenceTransformer(_MODEL_NAME)
    return _model
34
+
35
+
36
def find_similar_sentence(query, reference_list, top_k=1, min_score=0.5):
    """
    Pick the reference sentence(s) closest in meaning to ``query``.

    Args:
        query (str): The query sentence in Telugu
        reference_list (list): List of Telugu reference sentences
        top_k (int): Number of top similar sentences to return (default: 1)
        min_score (float): Minimum similarity score threshold (default: 0.5)

    Returns:
        tuple: (best_sentence, similarity_score) if top_k=1
        list: List of tuples [(sentence, score), ...] otherwise

        An empty ``reference_list`` yields ``(None, 0.0)`` for top_k=1 and
        ``[]`` otherwise. If none of the top_k candidates reaches
        ``min_score``, the single best match is still returned.

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> refs = ["వర్షం పడుతోంది", "ఇప్పుడు వాన వస్తోంది", "నేను తినడానికి వెళ్తున్నాను"]
        >>> sentence, score = find_similar_sentence("వర్షం కురుస్తోంది", refs)
        >>> print(sentence, score)
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    want_single = top_k == 1

    if not reference_list:
        return (None, 0.0) if want_single else []

    encoder = _get_model()

    # Embed the query and the references, then score every reference.
    query_vec = encoder.encode(query, convert_to_tensor=True)
    reference_vecs = encoder.encode(reference_list, convert_to_tensor=True)
    scores = util.cos_sim(query_vec, reference_vecs)[0]

    # Highest score first; equal scores fall back to comparing the sentences.
    ranked = sorted(
        ((float(score), sentence) for score, sentence in zip(scores, reference_list)),
        reverse=True,
    )
    matches = [
        (sentence, value)
        for value, sentence in ranked[:top_k]
        if value >= min_score
    ]

    if not matches:
        # Nothing cleared the threshold: fall back to the single best match.
        best_idx = torch.argmax(scores).item()
        matches = [(reference_list[best_idx], float(scores[best_idx]))]

    return matches[0] if want_single else matches
91
+
92
+
93
def correct_sentence(query, references, min_score=0.5):
    """
    Replace a possibly faulty Telugu sentence with its closest reference.

    Thin wrapper around ``find_similar_sentence`` with ``top_k=1``.

    Args:
        query (str): The potentially incorrect Telugu sentence
        references (list): List of correct Telugu sentences to match against
        min_score (float): Minimum similarity score threshold

    Returns:
        tuple: (corrected_sentence, similarity_score)

    Example:
        >>> refs = ["నేను ఇంటికి వెళ్తున్నాను", "వర్షం పడుతోంది", "ఇది మంచి పుస్తకం"]
        >>> corrected, score = correct_sentence("వర్షం పడుతునది", refs)
        >>> print(corrected, score)
    """
    best_match = find_similar_sentence(
        query,
        references,
        top_k=1,
        min_score=min_score,
    )
    return best_match
111
+
112
+
113
def rank_sentences(query, reference_list, min_score=0.3):
    """
    Order the reference sentences from most to least similar to ``query``.

    Args:
        query (str): The query sentence in Telugu
        reference_list (list): List of Telugu reference sentences
        min_score (float): Minimum similarity score to include in results

    Returns:
        list: Tuples [(sentence, score), ...] sorted by descending score.
        References scoring below ``min_score`` are left out; an empty
        ``reference_list`` yields ``[]``.

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> refs = ["వర్షం పడుతోంది", "ఇప్పుడు వాన వస్తోంది", "నేను తినడానికి వెళ్తున్నాను"]
        >>> ranked = rank_sentences("వర్షం కురుస్తోంది", refs)
        >>> for sentence, score in ranked:
        ...     print(f"{sentence}: {score:.3f}")
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    if not reference_list:
        return []

    encoder = _get_model()

    # Embed everything and score each reference against the query.
    query_vec = encoder.encode(query, convert_to_tensor=True)
    reference_vecs = encoder.encode(reference_list, convert_to_tensor=True)
    similarities = util.cos_sim(query_vec, reference_vecs)[0]

    # Keep only the references that reach the threshold.
    kept = []
    for sentence, raw_score in zip(reference_list, similarities):
        value = float(raw_score)
        if value >= min_score:
            kept.append((sentence, value))

    # Stable sort: equal scores keep their reference_list order.
    return sorted(kept, key=lambda pair: pair[1], reverse=True)
158
+
159
+
160
def batch_similarity(queries, reference_list, batch_size=32):
    """
    Find the best-matching reference for each of several queries.

    Args:
        queries (list): List of query sentences
        reference_list (list): List of reference sentences
        batch_size (int): Batch size for encoding (default: 32)

    Returns:
        list: One tuple (query, best_match, best_score) per query, in the
        order of ``queries``. Empty when either input is empty.

    Raises:
        ImportError: If sentence-transformers is not installed.

    Example:
        >>> queries = ["వర్షం కురుస్తోంది", "నేను వస్తున్నాను"]
        >>> refs = ["వర్షం పడుతోంది", "నేను ఇంటికి వెళ్తున్నాను"]
        >>> results = batch_similarity(queries, refs)
        >>> for query, match, score in results:
        ...     print(f"{query} -> {match} ({score:.3f})")
    """
    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for this feature. "
            "Install it with: pip install sentence-transformers"
        )

    if not (queries and reference_list):
        return []

    encoder = _get_model()

    # Embed both sides in batches.
    query_vecs = encoder.encode(
        queries, convert_to_tensor=True, batch_size=batch_size
    )
    reference_vecs = encoder.encode(
        reference_list, convert_to_tensor=True, batch_size=batch_size
    )

    # Row i holds the scores of query i against every reference.
    similarity_matrix = util.cos_sim(query_vecs, reference_vecs)

    matches = []
    for query, row in zip(queries, similarity_matrix):
        top = torch.argmax(row).item()
        matches.append((query, reference_list[top], float(row[top])))
    return matches
205
+
206
+
207
def is_sentence_transformers_available():
    """
    Report whether the optional sentence-transformers dependency was imported.

    Returns:
        bool: The module-level flag set by the import guard - True when the
        import succeeded, False when it raised ImportError.
    """
    return SENTENCE_TRANSFORMERS_AVAILABLE