isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
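A comparable file-level summary can be reproduced locally from the two wheels. The sketch below is an illustration only (not part of the registry tooling); it uses just the Python standard library, and the local wheel filenames are assumptions based on the versions compared here:

# Hypothetical local check: count added/removed members between two wheel archives.
# Wheel filenames below are assumptions; adjust them to the files you downloaded.
from zipfile import ZipFile

def wheel_members(path: str) -> set:
    """Return the set of file names contained in a wheel (a zip archive)."""
    with ZipFile(path) as wheel:
        return set(wheel.namelist())

old = wheel_members("isa_model-0.4.0-py3-none-any.whl")
new = wheel_members("isa_model-0.4.4-py3-none-any.whl")

print("added:", len(new - old))
print("removed:", len(old - new))
print("kept (possibly modified):", len(old & new))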
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/eval/evaluators/embedding_evaluator.py
@@ -1,742 +0,0 @@
- """
- Embedding Evaluator for ISA Model evaluation framework.
-
- Provides comprehensive evaluation capabilities for embedding and retrieval tasks including:
- - Semantic similarity evaluation
- - Information retrieval evaluation (Precision@K, Recall@K, NDCG)
- - Reranking effectiveness evaluation
- - Cross-lingual embedding evaluation
- - Document ranking evaluation
- - Clustering evaluation
-
- Supports ISA custom embedding services and standard embedding models.
- """
-
- import asyncio
- import logging
- import numpy as np
- from typing import Dict, List, Any, Optional, Union, Tuple
- from sklearn.metrics.pairwise import cosine_similarity
- from sklearn.metrics import precision_score, recall_score, f1_score
- import json
-
- from .base_evaluator import BaseEvaluator, EvaluationResult
-
- logger = logging.getLogger(__name__)
-
-
- class EmbeddingEvaluator(BaseEvaluator):
-     """
-     Comprehensive embedding model evaluator.
-
-     Supports evaluation of:
-     - Semantic similarity tasks (STS, semantic textual similarity)
-     - Information retrieval (IR) tasks with Precision@K, Recall@K, NDCG
-     - Reranking effectiveness (MAP, MRR, NDCG improvements)
-     - Cross-lingual embedding alignment
-     - Document clustering quality
-     - Zero-shot classification accuracy
-     """
-
-     def __init__(self,
-                  config: Optional[Dict[str, Any]] = None,
-                  experiment_tracker: Optional[Any] = None):
-         """
-         Initialize the embedding evaluator.
-
-         Args:
-             config: Evaluation configuration
-             experiment_tracker: Optional experiment tracking instance
-         """
-         super().__init__(
-             evaluator_name="embedding_evaluator",
-             config=config,
-             experiment_tracker=experiment_tracker
-         )
-
-         # Embedding-specific configuration
-         self.embedding_dim = self.config.get("embedding_dim", None) # Auto-detect if None
-         self.normalize_embeddings = self.config.get("normalize_embeddings", True)
-         self.similarity_metric = self.config.get("similarity_metric", "cosine") # cosine, dot, euclidean
-
-         # Evaluation task types
-         self.task_type = self.config.get("task_type", "similarity") # similarity, retrieval, reranking, clustering
-
-         # Retrieval evaluation settings
-         self.k_values = self.config.get("k_values", [1, 5, 10, 20]) # For Precision@K, Recall@K
-         self.relevance_threshold = self.config.get("relevance_threshold", 0.5)
-
-         # Multilingual settings
-         self.enable_multilingual = self.config.get("enable_multilingual", True)
-         self.languages = self.config.get("languages", ["en", "zh", "es", "fr", "de"])
-
-         logger.info(f"Initialized EmbeddingEvaluator for task: {self.task_type}")
-
-     async def evaluate_sample(self,
-                               sample: Dict[str, Any],
-                               model_interface: Any) -> Dict[str, Any]:
-         """
-         Evaluate a single embedding sample.
-
-         Args:
-             sample: Embedding sample containing text and expected output
-             model_interface: Embedding model interface
-
-         Returns:
-             Evaluation result for the sample
-         """
-         try:
-             # Extract sample data
-             text_input = sample.get("text", "")
-             query = sample.get("query", "")
-             documents = sample.get("documents", [])
-             expected_output = sample.get("expected_output")
-             task_type = sample.get("task_type", self.task_type)
-
-             # Get embeddings based on task type
-             if task_type == "similarity":
-                 result = await self._evaluate_similarity_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-             elif task_type == "retrieval":
-                 result = await self._evaluate_retrieval_sample(
-                     model_interface, query, documents, expected_output, sample
-                 )
-             elif task_type == "reranking":
-                 result = await self._evaluate_reranking_sample(
-                     model_interface, query, documents, expected_output, sample
-                 )
-             elif task_type == "clustering":
-                 result = await self._evaluate_clustering_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-             else:
-                 # Generic embedding evaluation
-                 result = await self._evaluate_generic_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-
-             result["task_type"] = task_type
-             return result
-
-         except Exception as e:
-             logger.error(f"Error evaluating embedding sample: {e}")
-             raise
-
-     async def _evaluate_similarity_sample(self,
-                                           model_interface: Any,
-                                           text_input: str,
-                                           expected_output: Any,
-                                           sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate semantic similarity task."""
-         try:
-             # Extract text pairs
-             text1 = sample.get("text1", text_input)
-             text2 = sample.get("text2", "")
-             expected_similarity = float(expected_output) if expected_output is not None else 0.0
-
-             # Get embeddings
-             emb1 = await self._get_embedding(model_interface, text1)
-             emb2 = await self._get_embedding(model_interface, text2)
-
-             # Compute similarity
-             predicted_similarity = self._compute_similarity(emb1, emb2)
-
-             # Compute metrics
-             sample_metrics = {
-                 "predicted_similarity": predicted_similarity,
-                 "expected_similarity": expected_similarity,
-                 "similarity_error": abs(predicted_similarity - expected_similarity),
-                 "similarity_correlation": self._compute_correlation([predicted_similarity], [expected_similarity])
-             }
-
-             return {
-                 "prediction": predicted_similarity,
-                 "expected_output": expected_similarity,
-                 "sample_metrics": sample_metrics,
-                 "embeddings": {"text1": emb1.tolist(), "text2": emb2.tolist()}
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating similarity sample: {e}")
-             raise
-
-     async def _evaluate_retrieval_sample(self,
-                                          model_interface: Any,
-                                          query: str,
-                                          documents: List[str],
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate information retrieval task."""
-         try:
-             # Get query embedding
-             query_embedding = await self._get_embedding(model_interface, query)
-
-             # Get document embeddings
-             doc_embeddings = []
-             for doc in documents:
-                 doc_emb = await self._get_embedding(model_interface, doc)
-                 doc_embeddings.append(doc_emb)
-
-             if not doc_embeddings:
-                 raise ValueError("No documents provided for retrieval evaluation")
-
-             doc_embeddings = np.array(doc_embeddings)
-
-             # Compute similarities
-             similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-
-             # Rank documents
-             ranked_indices = np.argsort(similarities)[::-1] # Descending order
-
-             # Extract relevance labels
-             relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-             # Compute retrieval metrics
-             sample_metrics = self._compute_retrieval_metrics(ranked_indices, relevance_labels)
-
-             return {
-                 "prediction": ranked_indices.tolist(),
-                 "expected_output": relevance_labels,
-                 "sample_metrics": sample_metrics,
-                 "similarities": similarities.tolist(),
-                 "query_embedding": query_embedding.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating retrieval sample: {e}")
-             raise
-
-     async def _evaluate_reranking_sample(self,
-                                          model_interface: Any,
-                                          query: str,
-                                          documents: List[str],
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate reranking task."""
-         try:
-             # Get initial rankings (if provided)
-             initial_ranking = sample.get("initial_ranking", list(range(len(documents))))
-
-             # Rerank using embedding model
-             if hasattr(model_interface, 'rerank'):
-                 # ISA reranking service
-                 reranked_results = await model_interface.rerank(query, documents)
-                 if isinstance(reranked_results, list):
-                     reranked_indices = [r.get("index", i) for i, r in enumerate(reranked_results)]
-                 else:
-                     reranked_indices = list(range(len(documents)))
-             else:
-                 # Use embedding similarity for reranking
-                 query_embedding = await self._get_embedding(model_interface, query)
-                 doc_embeddings = []
-                 for doc in documents:
-                     doc_emb = await self._get_embedding(model_interface, doc)
-                     doc_embeddings.append(doc_emb)
-
-                 doc_embeddings = np.array(doc_embeddings)
-                 similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-                 reranked_indices = np.argsort(similarities)[::-1].tolist()
-
-             # Extract relevance labels
-             relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-             # Compute reranking metrics
-             initial_metrics = self._compute_retrieval_metrics(initial_ranking, relevance_labels)
-             reranked_metrics = self._compute_retrieval_metrics(reranked_indices, relevance_labels)
-
-             # Compute improvement
-             improvement_metrics = {}
-             for metric_name in ["precision_at_1", "precision_at_5", "ndcg_at_10"]:
-                 initial_score = initial_metrics.get(metric_name, 0.0)
-                 reranked_score = reranked_metrics.get(metric_name, 0.0)
-                 improvement_metrics[f"{metric_name}_improvement"] = reranked_score - initial_score
-
-             sample_metrics = {
-                 **reranked_metrics,
-                 **improvement_metrics,
-                 "reranking_effectiveness": np.mean(list(improvement_metrics.values()))
-             }
-
-             return {
-                 "prediction": reranked_indices,
-                 "expected_output": relevance_labels,
-                 "sample_metrics": sample_metrics,
-                 "initial_ranking": initial_ranking,
-                 "reranked_ranking": reranked_indices
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating reranking sample: {e}")
-             raise
-
-     async def _evaluate_clustering_sample(self,
-                                           model_interface: Any,
-                                           text_input: Union[str, List[str]],
-                                           expected_output: Any,
-                                           sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate clustering task."""
-         try:
-             # Extract texts for clustering
-             texts = text_input if isinstance(text_input, list) else sample.get("texts", [text_input])
-             expected_clusters = expected_output if isinstance(expected_output, list) else []
-
-             # Get embeddings
-             embeddings = []
-             for text in texts:
-                 emb = await self._get_embedding(model_interface, text)
-                 embeddings.append(emb)
-
-             embeddings = np.array(embeddings)
-
-             # Perform clustering (simple k-means)
-             from sklearn.cluster import KMeans
-
-             n_clusters = len(set(expected_clusters)) if expected_clusters else 2
-             kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-             predicted_clusters = kmeans.fit_predict(embeddings)
-
-             # Compute clustering metrics
-             sample_metrics = self._compute_clustering_metrics(predicted_clusters, expected_clusters)
-
-             return {
-                 "prediction": predicted_clusters.tolist(),
-                 "expected_output": expected_clusters,
-                 "sample_metrics": sample_metrics,
-                 "embeddings": embeddings.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating clustering sample: {e}")
-             raise
-
-     async def _evaluate_generic_sample(self,
-                                        model_interface: Any,
-                                        text_input: str,
-                                        expected_output: Any,
-                                        sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate generic embedding task."""
-         try:
-             # Get embedding
-             embedding = await self._get_embedding(model_interface, text_input)
-
-             # Basic embedding quality metrics
-             sample_metrics = {
-                 "embedding_norm": float(np.linalg.norm(embedding)),
-                 "embedding_mean": float(np.mean(embedding)),
-                 "embedding_std": float(np.std(embedding)),
-                 "embedding_dimension": len(embedding)
-             }
-
-             return {
-                 "prediction": embedding.tolist(),
-                 "expected_output": expected_output,
-                 "sample_metrics": sample_metrics,
-                 "embedding": embedding.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating generic sample: {e}")
-             raise
-
-     async def _get_embedding(self, model_interface: Any, text: str) -> np.ndarray:
-         """Get embedding from model interface."""
-         try:
-             if hasattr(model_interface, 'embed'):
-                 # ISA embedding service
-                 result = await model_interface.embed(text)
-                 if isinstance(result, dict):
-                     embedding = result.get("embedding", result.get("vector", []))
-                 else:
-                     embedding = result
-             elif hasattr(model_interface, 'encode'):
-                 # Standard embedding interface
-                 embedding = await model_interface.encode(text)
-             else:
-                 # Generic interface
-                 embedding = await model_interface.predict(text)
-
-             # Convert to numpy array
-             embedding = np.array(embedding, dtype=np.float32)
-
-             # Normalize if configured
-             if self.normalize_embeddings:
-                 norm = np.linalg.norm(embedding)
-                 if norm > 0:
-                     embedding = embedding / norm
-
-             return embedding
-
-         except Exception as e:
-             logger.error(f"Error getting embedding: {e}")
-             raise
-
-     def _compute_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
-         """Compute similarity between two embeddings."""
-         try:
-             if self.similarity_metric == "cosine":
-                 # Cosine similarity
-                 return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-             elif self.similarity_metric == "dot":
-                 # Dot product
-                 return float(np.dot(emb1, emb2))
-             elif self.similarity_metric == "euclidean":
-                 # Negative euclidean distance (higher = more similar)
-                 return float(-np.linalg.norm(emb1 - emb2))
-             else:
-                 # Default to cosine
-                 return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-
-         except Exception as e:
-             logger.error(f"Error computing similarity: {e}")
-             return 0.0
-
-     def _compute_similarity_matrix(self, query_emb: np.ndarray, doc_embs: np.ndarray) -> np.ndarray:
-         """Compute similarity matrix between query and documents."""
-         try:
-             if self.similarity_metric == "cosine":
-                 # Reshape for sklearn cosine_similarity
-                 query_emb = query_emb.reshape(1, -1)
-                 similarities = cosine_similarity(query_emb, doc_embs)[0]
-             elif self.similarity_metric == "dot":
-                 similarities = np.dot(doc_embs, query_emb)
-             elif self.similarity_metric == "euclidean":
-                 similarities = -np.linalg.norm(doc_embs - query_emb, axis=1)
-             else:
-                 # Default to cosine
-                 query_emb = query_emb.reshape(1, -1)
-                 similarities = cosine_similarity(query_emb, doc_embs)[0]
-
-             return similarities
-
-         except Exception as e:
-             logger.error(f"Error computing similarity matrix: {e}")
-             return np.zeros(len(doc_embs))
-
-     def _compute_retrieval_metrics(self,
-                                    ranked_indices: List[int],
-                                    relevance_labels: List[int]) -> Dict[str, float]:
-         """Compute information retrieval metrics."""
-         try:
-             if not relevance_labels:
-                 return {"retrieval_error": 1.0}
-
-             metrics = {}
-             n_docs = len(ranked_indices)
-
-             # Ensure relevance labels match document count
-             relevance_labels = relevance_labels[:n_docs] + [0] * max(0, n_docs - len(relevance_labels))
-
-             # Compute metrics for different K values
-             for k in self.k_values:
-                 if k > n_docs:
-                     continue
-
-                 # Get top-k predictions
-                 top_k_indices = ranked_indices[:k]
-                 top_k_relevance = [relevance_labels[i] for i in top_k_indices]
-
-                 # Precision@K
-                 precision_k = sum(top_k_relevance) / k if k > 0 else 0.0
-                 metrics[f"precision_at_{k}"] = precision_k
-
-                 # Recall@K
-                 total_relevant = sum(relevance_labels)
-                 recall_k = sum(top_k_relevance) / total_relevant if total_relevant > 0 else 0.0
-                 metrics[f"recall_at_{k}"] = recall_k
-
-                 # F1@K
-                 if precision_k + recall_k > 0:
-                     f1_k = 2 * precision_k * recall_k / (precision_k + recall_k)
-                 else:
-                     f1_k = 0.0
-                 metrics[f"f1_at_{k}"] = f1_k
-
-             # NDCG@K for different K values
-             for k in self.k_values:
-                 if k > n_docs:
-                     continue
-                 ndcg_k = self._compute_ndcg(ranked_indices, relevance_labels, k)
-                 metrics[f"ndcg_at_{k}"] = ndcg_k
-
-             # Mean Average Precision (MAP)
-             metrics["map"] = self._compute_map(ranked_indices, relevance_labels)
-
-             # Mean Reciprocal Rank (MRR)
-             metrics["mrr"] = self._compute_mrr(ranked_indices, relevance_labels)
-
-             return metrics
-
-         except Exception as e:
-             logger.error(f"Error computing retrieval metrics: {e}")
-             return {"retrieval_error": 1.0}
-
-     def _compute_ndcg(self, ranked_indices: List[int], relevance_labels: List[int], k: int) -> float:
-         """Compute Normalized Discounted Cumulative Gain@K."""
-         try:
-             # DCG@K
-             dcg = 0.0
-             for i, doc_idx in enumerate(ranked_indices[:k]):
-                 if doc_idx < len(relevance_labels):
-                     relevance = relevance_labels[doc_idx]
-                     dcg += relevance / np.log2(i + 2) # i+2 because log2(1) = 0
-
-             # IDCG@K (Ideal DCG)
-             sorted_relevance = sorted(relevance_labels, reverse=True)
-             idcg = 0.0
-             for i, relevance in enumerate(sorted_relevance[:k]):
-                 idcg += relevance / np.log2(i + 2)
-
-             # NDCG@K
-             ndcg = dcg / idcg if idcg > 0 else 0.0
-             return ndcg
-
-         except Exception as e:
-             logger.error(f"Error computing NDCG: {e}")
-             return 0.0
-
-     def _compute_map(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-         """Compute Mean Average Precision."""
-         try:
-             if not any(relevance_labels):
-                 return 0.0
-
-             precision_sum = 0.0
-             relevant_count = 0
-
-             for i, doc_idx in enumerate(ranked_indices):
-                 if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                     relevant_count += 1
-                     precision_at_i = relevant_count / (i + 1)
-                     precision_sum += precision_at_i
-
-             total_relevant = sum(1 for r in relevance_labels if r > 0)
-             map_score = precision_sum / total_relevant if total_relevant > 0 else 0.0
-
-             return map_score
-
-         except Exception as e:
-             logger.error(f"Error computing MAP: {e}")
-             return 0.0
-
-     def _compute_mrr(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-         """Compute Mean Reciprocal Rank."""
-         try:
-             for i, doc_idx in enumerate(ranked_indices):
-                 if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                     return 1.0 / (i + 1)
-             return 0.0
-
-         except Exception as e:
-             logger.error(f"Error computing MRR: {e}")
-             return 0.0
-
-     def _compute_clustering_metrics(self,
-                                     predicted_clusters: List[int],
-                                     expected_clusters: List[int]) -> Dict[str, float]:
-         """Compute clustering evaluation metrics."""
-         try:
-             if not expected_clusters:
-                 return {"clustering_error": 1.0}
-
-             # Ensure equal lengths
-             min_len = min(len(predicted_clusters), len(expected_clusters))
-             predicted_clusters = predicted_clusters[:min_len]
-             expected_clusters = expected_clusters[:min_len]
-
-             # Adjusted Rand Index (ARI)
-             from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
-
-             ari = adjusted_rand_score(expected_clusters, predicted_clusters)
-             nmi = normalized_mutual_info_score(expected_clusters, predicted_clusters)
-
-             # Silhouette score would require embeddings
-             return {
-                 "adjusted_rand_index": ari,
-                 "normalized_mutual_info": nmi,
-                 "clustering_accuracy": (ari + nmi) / 2
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing clustering metrics: {e}")
-             return {"clustering_error": 1.0}
-
-     def _compute_correlation(self, predictions: List[float], references: List[float]) -> float:
-         """Compute correlation between predictions and references."""
-         try:
-             if len(predictions) < 2 or len(references) < 2:
-                 return 0.0
-
-             from scipy.stats import pearsonr
-             correlation, _ = pearsonr(predictions, references)
-             return float(correlation) if not np.isnan(correlation) else 0.0
-
-         except Exception as e:
-             logger.error(f"Error computing correlation: {e}")
-             return 0.0
-
-     def compute_metrics(self,
-                         predictions: List[Any],
-                         references: List[Any],
-                         **kwargs) -> Dict[str, float]:
-         """
-         Compute aggregate embedding evaluation metrics.
-
-         Args:
-             predictions: List of model predictions
-             references: List of reference outputs
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary of computed metrics
-         """
-         try:
-             if not predictions or not references:
-                 logger.warning("Empty predictions or references provided")
-                 return {}
-
-             # Ensure equal lengths
-             min_len = min(len(predictions), len(references))
-             predictions = predictions[:min_len]
-             references = references[:min_len]
-
-             task_type = self.task_type
-
-             if task_type == "similarity":
-                 return self._compute_aggregate_similarity_metrics(predictions, references)
-             elif task_type == "retrieval":
-                 return self._compute_aggregate_retrieval_metrics(predictions, references)
-             elif task_type == "reranking":
-                 return self._compute_aggregate_reranking_metrics(predictions, references)
-             elif task_type == "clustering":
-                 return self._compute_aggregate_clustering_metrics(predictions, references)
-             else:
-                 # Generic metrics
-                 return {
-                     "total_samples": len(predictions),
-                     "task_type": task_type,
-                     "evaluation_success_rate": 1.0
-                 }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate metrics: {e}")
-             return {"error_rate": 1.0}
-
-     def _compute_aggregate_similarity_metrics(self,
-                                               predictions: List[float],
-                                               references: List[float]) -> Dict[str, float]:
-         """Compute aggregate similarity metrics."""
-         try:
-             # Convert to float if needed
-             pred_vals = [float(p) for p in predictions if p is not None]
-             ref_vals = [float(r) for r in references if r is not None]
-
-             if not pred_vals or not ref_vals:
-                 return {"similarity_error": 1.0}
-
-             # Correlation
-             correlation = self._compute_correlation(pred_vals, ref_vals)
-
-             # Mean absolute error
-             errors = [abs(p - r) for p, r in zip(pred_vals, ref_vals)]
-             mae = np.mean(errors) if errors else 1.0
-
-             # Mean squared error
-             mse = np.mean([(p - r)**2 for p, r in zip(pred_vals, ref_vals)]) if pred_vals else 1.0
-
-             return {
-                 "similarity_correlation": correlation,
-                 "similarity_mae": mae,
-                 "similarity_mse": mse,
-                 "similarity_rmse": np.sqrt(mse),
-                 "total_samples": len(pred_vals)
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate similarity metrics: {e}")
-             return {"similarity_error": 1.0}
-
-     def _compute_aggregate_retrieval_metrics(self,
-                                              predictions: List[List[int]],
-                                              references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate retrieval metrics."""
-         try:
-             all_metrics = {}
-             metric_names = [f"precision_at_{k}" for k in self.k_values] + \
-                            [f"recall_at_{k}" for k in self.k_values] + \
-                            [f"ndcg_at_{k}" for k in self.k_values] + \
-                            ["map", "mrr"]
-
-             # Initialize metric accumulators
-             for metric_name in metric_names:
-                 all_metrics[metric_name] = []
-
-             # Compute metrics for each sample
-             for pred, ref in zip(predictions, references):
-                 if isinstance(pred, list) and isinstance(ref, list):
-                     sample_metrics = self._compute_retrieval_metrics(pred, ref)
-                     for metric_name in metric_names:
-                         if metric_name in sample_metrics:
-                             all_metrics[metric_name].append(sample_metrics[metric_name])
-
-             # Compute averages
-             avg_metrics = {}
-             for metric_name, values in all_metrics.items():
-                 if values:
-                     avg_metrics[f"avg_{metric_name}"] = np.mean(values)
-
-             avg_metrics["total_samples"] = len(predictions)
-             return avg_metrics
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate retrieval metrics: {e}")
-             return {"retrieval_error": 1.0}
-
-     def _compute_aggregate_reranking_metrics(self,
-                                              predictions: List[List[int]],
-                                              references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate reranking metrics."""
-         # Similar to retrieval but focus on improvement metrics
-         return self._compute_aggregate_retrieval_metrics(predictions, references)
-
-     def _compute_aggregate_clustering_metrics(self,
-                                               predictions: List[List[int]],
-                                               references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate clustering metrics."""
-         try:
-             ari_scores = []
-             nmi_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 if isinstance(pred, list) and isinstance(ref, list):
-                     sample_metrics = self._compute_clustering_metrics(pred, ref)
-                     ari_scores.append(sample_metrics.get("adjusted_rand_index", 0.0))
-                     nmi_scores.append(sample_metrics.get("normalized_mutual_info", 0.0))
-
-             return {
-                 "avg_adjusted_rand_index": np.mean(ari_scores) if ari_scores else 0.0,
-                 "avg_normalized_mutual_info": np.mean(nmi_scores) if nmi_scores else 0.0,
-                 "avg_clustering_accuracy": np.mean(ari_scores + nmi_scores) / 2 if ari_scores or nmi_scores else 0.0,
-                 "total_samples": len(predictions)
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate clustering metrics: {e}")
-             return {"clustering_error": 1.0}
-
-     def get_supported_metrics(self) -> List[str]:
-         """Get list of metrics supported by this evaluator."""
-         base_metrics = ["total_samples", "evaluation_success_rate"]
-
-         task_specific_metrics = {
-             "similarity": ["similarity_correlation", "similarity_mae", "similarity_mse", "similarity_rmse"],
-             "retrieval": [f"precision_at_{k}" for k in self.k_values] +
-                          [f"recall_at_{k}" for k in self.k_values] +
-                          [f"ndcg_at_{k}" for k in self.k_values] +
-                          ["map", "mrr"],
-             "reranking": [f"precision_at_{k}_improvement" for k in self.k_values] +
-                          ["reranking_effectiveness"],
-             "clustering": ["adjusted_rand_index", "normalized_mutual_info", "clustering_accuracy"]
-         }
-
-         return base_metrics + task_specific_metrics.get(self.task_type, [])
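
For reference, the retrieval helpers removed above (_compute_ndcg, _compute_map, _compute_mrr) implement the standard single-query definitions, where rel_{d_i} is the relevance label of the document ranked at position i and R is the number of relevant documents:

\mathrm{DCG@}k = \sum_{i=1}^{k} \frac{\mathrm{rel}_{d_i}}{\log_2(i+1)},
\qquad
\mathrm{NDCG@}k = \frac{\mathrm{DCG@}k}{\mathrm{IDCG@}k}

\mathrm{AP} = \frac{1}{R} \sum_{i \,:\, \mathrm{rel}_{d_i} > 0} \mathrm{Precision@}i,
\qquad
\mathrm{RR} = \frac{1}{\min\{\, i : \mathrm{rel}_{d_i} > 0 \,\}}

The per-sample "map" and "mrr" values were then averaged across samples (as avg_map and avg_mrr) in _compute_aggregate_retrieval_metrics, giving the usual MAP and MRR over an evaluation set.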