isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
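One relocation worth calling out in the listing above: the Modal service modules moved from `isa_model/deployment/cloud/modal/` to `isa_model/deployment/modal/services/<modality>/` (entries 186-197). The sketch below shows the import-path update a downstream caller would presumably make; the module names are taken directly from the file listing, but whether each module re-exports the same public symbols under 0.4.3 is an assumption.

```python
# Hypothetical import update for the Modal service relocation (paths from the file listing).

# 0.4.0 layout (removed):
# from isa_model.deployment.cloud.modal import isa_vision_ocr_service

# 0.4.3 layout:
from isa_model.deployment.modal.services.vision import isa_vision_ocr_service
```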
isa_model/eval/evaluators/embedding_evaluator.py (deleted, entry 143 above)
@@ -1,742 +0,0 @@
- """
- Embedding Evaluator for ISA Model evaluation framework.
-
- Provides comprehensive evaluation capabilities for embedding and retrieval tasks including:
- - Semantic similarity evaluation
- - Information retrieval evaluation (Precision@K, Recall@K, NDCG)
- - Reranking effectiveness evaluation
- - Cross-lingual embedding evaluation
- - Document ranking evaluation
- - Clustering evaluation
-
- Supports ISA custom embedding services and standard embedding models.
- """
-
- import asyncio
- import logging
- import numpy as np
- from typing import Dict, List, Any, Optional, Union, Tuple
- from sklearn.metrics.pairwise import cosine_similarity
- from sklearn.metrics import precision_score, recall_score, f1_score
- import json
-
- from .base_evaluator import BaseEvaluator, EvaluationResult
-
- logger = logging.getLogger(__name__)
-
-
- class EmbeddingEvaluator(BaseEvaluator):
-     """
-     Comprehensive embedding model evaluator.
-
-     Supports evaluation of:
-     - Semantic similarity tasks (STS, semantic textual similarity)
-     - Information retrieval (IR) tasks with Precision@K, Recall@K, NDCG
-     - Reranking effectiveness (MAP, MRR, NDCG improvements)
-     - Cross-lingual embedding alignment
-     - Document clustering quality
-     - Zero-shot classification accuracy
-     """
-
-     def __init__(self,
-                  config: Optional[Dict[str, Any]] = None,
-                  experiment_tracker: Optional[Any] = None):
-         """
-         Initialize the embedding evaluator.
-
-         Args:
-             config: Evaluation configuration
-             experiment_tracker: Optional experiment tracking instance
-         """
-         super().__init__(
-             evaluator_name="embedding_evaluator",
-             config=config,
-             experiment_tracker=experiment_tracker
-         )
-
-         # Embedding-specific configuration
-         self.embedding_dim = self.config.get("embedding_dim", None)  # Auto-detect if None
-         self.normalize_embeddings = self.config.get("normalize_embeddings", True)
-         self.similarity_metric = self.config.get("similarity_metric", "cosine")  # cosine, dot, euclidean
-
-         # Evaluation task types
-         self.task_type = self.config.get("task_type", "similarity")  # similarity, retrieval, reranking, clustering
-
-         # Retrieval evaluation settings
-         self.k_values = self.config.get("k_values", [1, 5, 10, 20])  # For Precision@K, Recall@K
-         self.relevance_threshold = self.config.get("relevance_threshold", 0.5)
-
-         # Multilingual settings
-         self.enable_multilingual = self.config.get("enable_multilingual", True)
-         self.languages = self.config.get("languages", ["en", "zh", "es", "fr", "de"])
-
-         logger.info(f"Initialized EmbeddingEvaluator for task: {self.task_type}")
-
-     async def evaluate_sample(self,
-                               sample: Dict[str, Any],
-                               model_interface: Any) -> Dict[str, Any]:
-         """
-         Evaluate a single embedding sample.
-
-         Args:
-             sample: Embedding sample containing text and expected output
-             model_interface: Embedding model interface
-
-         Returns:
-             Evaluation result for the sample
-         """
-         try:
-             # Extract sample data
-             text_input = sample.get("text", "")
-             query = sample.get("query", "")
-             documents = sample.get("documents", [])
-             expected_output = sample.get("expected_output")
-             task_type = sample.get("task_type", self.task_type)
-
-             # Get embeddings based on task type
-             if task_type == "similarity":
-                 result = await self._evaluate_similarity_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-             elif task_type == "retrieval":
-                 result = await self._evaluate_retrieval_sample(
-                     model_interface, query, documents, expected_output, sample
-                 )
-             elif task_type == "reranking":
-                 result = await self._evaluate_reranking_sample(
-                     model_interface, query, documents, expected_output, sample
-                 )
-             elif task_type == "clustering":
-                 result = await self._evaluate_clustering_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-             else:
-                 # Generic embedding evaluation
-                 result = await self._evaluate_generic_sample(
-                     model_interface, text_input, expected_output, sample
-                 )
-
-             result["task_type"] = task_type
-             return result
-
-         except Exception as e:
-             logger.error(f"Error evaluating embedding sample: {e}")
-             raise
-
-     async def _evaluate_similarity_sample(self,
-                                           model_interface: Any,
-                                           text_input: str,
-                                           expected_output: Any,
-                                           sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate semantic similarity task."""
-         try:
-             # Extract text pairs
-             text1 = sample.get("text1", text_input)
-             text2 = sample.get("text2", "")
-             expected_similarity = float(expected_output) if expected_output is not None else 0.0
-
-             # Get embeddings
-             emb1 = await self._get_embedding(model_interface, text1)
-             emb2 = await self._get_embedding(model_interface, text2)
-
-             # Compute similarity
-             predicted_similarity = self._compute_similarity(emb1, emb2)
-
-             # Compute metrics
-             sample_metrics = {
-                 "predicted_similarity": predicted_similarity,
-                 "expected_similarity": expected_similarity,
-                 "similarity_error": abs(predicted_similarity - expected_similarity),
-                 "similarity_correlation": self._compute_correlation([predicted_similarity], [expected_similarity])
-             }
-
-             return {
-                 "prediction": predicted_similarity,
-                 "expected_output": expected_similarity,
-                 "sample_metrics": sample_metrics,
-                 "embeddings": {"text1": emb1.tolist(), "text2": emb2.tolist()}
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating similarity sample: {e}")
-             raise
-
-     async def _evaluate_retrieval_sample(self,
-                                          model_interface: Any,
-                                          query: str,
-                                          documents: List[str],
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate information retrieval task."""
-         try:
-             # Get query embedding
-             query_embedding = await self._get_embedding(model_interface, query)
-
-             # Get document embeddings
-             doc_embeddings = []
-             for doc in documents:
-                 doc_emb = await self._get_embedding(model_interface, doc)
-                 doc_embeddings.append(doc_emb)
-
-             if not doc_embeddings:
-                 raise ValueError("No documents provided for retrieval evaluation")
-
-             doc_embeddings = np.array(doc_embeddings)
-
-             # Compute similarities
-             similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-
-             # Rank documents
-             ranked_indices = np.argsort(similarities)[::-1]  # Descending order
-
-             # Extract relevance labels
-             relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-             # Compute retrieval metrics
-             sample_metrics = self._compute_retrieval_metrics(ranked_indices, relevance_labels)
-
-             return {
-                 "prediction": ranked_indices.tolist(),
-                 "expected_output": relevance_labels,
-                 "sample_metrics": sample_metrics,
-                 "similarities": similarities.tolist(),
-                 "query_embedding": query_embedding.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating retrieval sample: {e}")
-             raise
-
-     async def _evaluate_reranking_sample(self,
-                                          model_interface: Any,
-                                          query: str,
-                                          documents: List[str],
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate reranking task."""
-         try:
-             # Get initial rankings (if provided)
-             initial_ranking = sample.get("initial_ranking", list(range(len(documents))))
-
-             # Rerank using embedding model
-             if hasattr(model_interface, 'rerank'):
-                 # ISA reranking service
-                 reranked_results = await model_interface.rerank(query, documents)
-                 if isinstance(reranked_results, list):
-                     reranked_indices = [r.get("index", i) for i, r in enumerate(reranked_results)]
-                 else:
-                     reranked_indices = list(range(len(documents)))
-             else:
-                 # Use embedding similarity for reranking
-                 query_embedding = await self._get_embedding(model_interface, query)
-                 doc_embeddings = []
-                 for doc in documents:
-                     doc_emb = await self._get_embedding(model_interface, doc)
-                     doc_embeddings.append(doc_emb)
-
-                 doc_embeddings = np.array(doc_embeddings)
-                 similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-                 reranked_indices = np.argsort(similarities)[::-1].tolist()
-
-             # Extract relevance labels
-             relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-             # Compute reranking metrics
-             initial_metrics = self._compute_retrieval_metrics(initial_ranking, relevance_labels)
-             reranked_metrics = self._compute_retrieval_metrics(reranked_indices, relevance_labels)
-
-             # Compute improvement
-             improvement_metrics = {}
-             for metric_name in ["precision_at_1", "precision_at_5", "ndcg_at_10"]:
-                 initial_score = initial_metrics.get(metric_name, 0.0)
-                 reranked_score = reranked_metrics.get(metric_name, 0.0)
-                 improvement_metrics[f"{metric_name}_improvement"] = reranked_score - initial_score
-
-             sample_metrics = {
-                 **reranked_metrics,
-                 **improvement_metrics,
-                 "reranking_effectiveness": np.mean(list(improvement_metrics.values()))
-             }
-
-             return {
-                 "prediction": reranked_indices,
-                 "expected_output": relevance_labels,
-                 "sample_metrics": sample_metrics,
-                 "initial_ranking": initial_ranking,
-                 "reranked_ranking": reranked_indices
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating reranking sample: {e}")
-             raise
-
-     async def _evaluate_clustering_sample(self,
-                                           model_interface: Any,
-                                           text_input: Union[str, List[str]],
-                                           expected_output: Any,
-                                           sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate clustering task."""
-         try:
-             # Extract texts for clustering
-             texts = text_input if isinstance(text_input, list) else sample.get("texts", [text_input])
-             expected_clusters = expected_output if isinstance(expected_output, list) else []
-
-             # Get embeddings
-             embeddings = []
-             for text in texts:
-                 emb = await self._get_embedding(model_interface, text)
-                 embeddings.append(emb)
-
-             embeddings = np.array(embeddings)
-
-             # Perform clustering (simple k-means)
-             from sklearn.cluster import KMeans
-
-             n_clusters = len(set(expected_clusters)) if expected_clusters else 2
-             kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-             predicted_clusters = kmeans.fit_predict(embeddings)
-
-             # Compute clustering metrics
-             sample_metrics = self._compute_clustering_metrics(predicted_clusters, expected_clusters)
-
-             return {
-                 "prediction": predicted_clusters.tolist(),
-                 "expected_output": expected_clusters,
-                 "sample_metrics": sample_metrics,
-                 "embeddings": embeddings.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating clustering sample: {e}")
-             raise
-
-     async def _evaluate_generic_sample(self,
-                                        model_interface: Any,
-                                        text_input: str,
-                                        expected_output: Any,
-                                        sample: Dict[str, Any]) -> Dict[str, Any]:
-         """Evaluate generic embedding task."""
-         try:
-             # Get embedding
-             embedding = await self._get_embedding(model_interface, text_input)
-
-             # Basic embedding quality metrics
-             sample_metrics = {
-                 "embedding_norm": float(np.linalg.norm(embedding)),
-                 "embedding_mean": float(np.mean(embedding)),
-                 "embedding_std": float(np.std(embedding)),
-                 "embedding_dimension": len(embedding)
-             }
-
-             return {
-                 "prediction": embedding.tolist(),
-                 "expected_output": expected_output,
-                 "sample_metrics": sample_metrics,
-                 "embedding": embedding.tolist()
-             }
-
-         except Exception as e:
-             logger.error(f"Error evaluating generic sample: {e}")
-             raise
-
-     async def _get_embedding(self, model_interface: Any, text: str) -> np.ndarray:
-         """Get embedding from model interface."""
-         try:
-             if hasattr(model_interface, 'embed'):
-                 # ISA embedding service
-                 result = await model_interface.embed(text)
-                 if isinstance(result, dict):
-                     embedding = result.get("embedding", result.get("vector", []))
-                 else:
-                     embedding = result
-             elif hasattr(model_interface, 'encode'):
-                 # Standard embedding interface
-                 embedding = await model_interface.encode(text)
-             else:
-                 # Generic interface
-                 embedding = await model_interface.predict(text)
-
-             # Convert to numpy array
-             embedding = np.array(embedding, dtype=np.float32)
-
-             # Normalize if configured
-             if self.normalize_embeddings:
-                 norm = np.linalg.norm(embedding)
-                 if norm > 0:
-                     embedding = embedding / norm
-
-             return embedding
-
-         except Exception as e:
-             logger.error(f"Error getting embedding: {e}")
-             raise
-
-     def _compute_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
-         """Compute similarity between two embeddings."""
-         try:
-             if self.similarity_metric == "cosine":
-                 # Cosine similarity
-                 return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-             elif self.similarity_metric == "dot":
-                 # Dot product
-                 return float(np.dot(emb1, emb2))
-             elif self.similarity_metric == "euclidean":
-                 # Negative euclidean distance (higher = more similar)
-                 return float(-np.linalg.norm(emb1 - emb2))
-             else:
-                 # Default to cosine
-                 return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-
-         except Exception as e:
-             logger.error(f"Error computing similarity: {e}")
-             return 0.0
-
-     def _compute_similarity_matrix(self, query_emb: np.ndarray, doc_embs: np.ndarray) -> np.ndarray:
-         """Compute similarity matrix between query and documents."""
-         try:
-             if self.similarity_metric == "cosine":
-                 # Reshape for sklearn cosine_similarity
-                 query_emb = query_emb.reshape(1, -1)
-                 similarities = cosine_similarity(query_emb, doc_embs)[0]
-             elif self.similarity_metric == "dot":
-                 similarities = np.dot(doc_embs, query_emb)
-             elif self.similarity_metric == "euclidean":
-                 similarities = -np.linalg.norm(doc_embs - query_emb, axis=1)
-             else:
-                 # Default to cosine
-                 query_emb = query_emb.reshape(1, -1)
-                 similarities = cosine_similarity(query_emb, doc_embs)[0]
-
-             return similarities
-
-         except Exception as e:
-             logger.error(f"Error computing similarity matrix: {e}")
-             return np.zeros(len(doc_embs))
-
-     def _compute_retrieval_metrics(self,
-                                    ranked_indices: List[int],
-                                    relevance_labels: List[int]) -> Dict[str, float]:
-         """Compute information retrieval metrics."""
-         try:
-             if not relevance_labels:
-                 return {"retrieval_error": 1.0}
-
-             metrics = {}
-             n_docs = len(ranked_indices)
-
-             # Ensure relevance labels match document count
-             relevance_labels = relevance_labels[:n_docs] + [0] * max(0, n_docs - len(relevance_labels))
-
-             # Compute metrics for different K values
-             for k in self.k_values:
-                 if k > n_docs:
-                     continue
-
-                 # Get top-k predictions
-                 top_k_indices = ranked_indices[:k]
-                 top_k_relevance = [relevance_labels[i] for i in top_k_indices]
-
-                 # Precision@K
-                 precision_k = sum(top_k_relevance) / k if k > 0 else 0.0
-                 metrics[f"precision_at_{k}"] = precision_k
-
-                 # Recall@K
-                 total_relevant = sum(relevance_labels)
-                 recall_k = sum(top_k_relevance) / total_relevant if total_relevant > 0 else 0.0
-                 metrics[f"recall_at_{k}"] = recall_k
-
-                 # F1@K
-                 if precision_k + recall_k > 0:
-                     f1_k = 2 * precision_k * recall_k / (precision_k + recall_k)
-                 else:
-                     f1_k = 0.0
-                 metrics[f"f1_at_{k}"] = f1_k
-
-             # NDCG@K for different K values
-             for k in self.k_values:
-                 if k > n_docs:
-                     continue
-                 ndcg_k = self._compute_ndcg(ranked_indices, relevance_labels, k)
-                 metrics[f"ndcg_at_{k}"] = ndcg_k
-
-             # Mean Average Precision (MAP)
-             metrics["map"] = self._compute_map(ranked_indices, relevance_labels)
-
-             # Mean Reciprocal Rank (MRR)
-             metrics["mrr"] = self._compute_mrr(ranked_indices, relevance_labels)
-
-             return metrics
-
-         except Exception as e:
-             logger.error(f"Error computing retrieval metrics: {e}")
-             return {"retrieval_error": 1.0}
-
-     def _compute_ndcg(self, ranked_indices: List[int], relevance_labels: List[int], k: int) -> float:
-         """Compute Normalized Discounted Cumulative Gain@K."""
-         try:
-             # DCG@K
-             dcg = 0.0
-             for i, doc_idx in enumerate(ranked_indices[:k]):
-                 if doc_idx < len(relevance_labels):
-                     relevance = relevance_labels[doc_idx]
-                     dcg += relevance / np.log2(i + 2)  # i+2 because log2(1) = 0
-
-             # IDCG@K (Ideal DCG)
-             sorted_relevance = sorted(relevance_labels, reverse=True)
-             idcg = 0.0
-             for i, relevance in enumerate(sorted_relevance[:k]):
-                 idcg += relevance / np.log2(i + 2)
-
-             # NDCG@K
-             ndcg = dcg / idcg if idcg > 0 else 0.0
-             return ndcg
-
-         except Exception as e:
-             logger.error(f"Error computing NDCG: {e}")
-             return 0.0
-
-     def _compute_map(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-         """Compute Mean Average Precision."""
-         try:
-             if not any(relevance_labels):
-                 return 0.0
-
-             precision_sum = 0.0
-             relevant_count = 0
-
-             for i, doc_idx in enumerate(ranked_indices):
-                 if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                     relevant_count += 1
-                     precision_at_i = relevant_count / (i + 1)
-                     precision_sum += precision_at_i
-
-             total_relevant = sum(1 for r in relevance_labels if r > 0)
-             map_score = precision_sum / total_relevant if total_relevant > 0 else 0.0
-
-             return map_score
-
-         except Exception as e:
-             logger.error(f"Error computing MAP: {e}")
-             return 0.0
-
-     def _compute_mrr(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-         """Compute Mean Reciprocal Rank."""
-         try:
-             for i, doc_idx in enumerate(ranked_indices):
-                 if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                     return 1.0 / (i + 1)
-             return 0.0
-
-         except Exception as e:
-             logger.error(f"Error computing MRR: {e}")
-             return 0.0
-
-     def _compute_clustering_metrics(self,
-                                     predicted_clusters: List[int],
-                                     expected_clusters: List[int]) -> Dict[str, float]:
-         """Compute clustering evaluation metrics."""
-         try:
-             if not expected_clusters:
-                 return {"clustering_error": 1.0}
-
-             # Ensure equal lengths
-             min_len = min(len(predicted_clusters), len(expected_clusters))
-             predicted_clusters = predicted_clusters[:min_len]
-             expected_clusters = expected_clusters[:min_len]
-
-             # Adjusted Rand Index (ARI)
-             from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
-
-             ari = adjusted_rand_score(expected_clusters, predicted_clusters)
-             nmi = normalized_mutual_info_score(expected_clusters, predicted_clusters)
-
-             # Silhouette score would require embeddings
-             return {
-                 "adjusted_rand_index": ari,
-                 "normalized_mutual_info": nmi,
-                 "clustering_accuracy": (ari + nmi) / 2
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing clustering metrics: {e}")
-             return {"clustering_error": 1.0}
-
-     def _compute_correlation(self, predictions: List[float], references: List[float]) -> float:
-         """Compute correlation between predictions and references."""
-         try:
-             if len(predictions) < 2 or len(references) < 2:
-                 return 0.0
-
-             from scipy.stats import pearsonr
-             correlation, _ = pearsonr(predictions, references)
-             return float(correlation) if not np.isnan(correlation) else 0.0
-
-         except Exception as e:
-             logger.error(f"Error computing correlation: {e}")
-             return 0.0
-
-     def compute_metrics(self,
-                         predictions: List[Any],
-                         references: List[Any],
-                         **kwargs) -> Dict[str, float]:
-         """
-         Compute aggregate embedding evaluation metrics.
-
-         Args:
-             predictions: List of model predictions
-             references: List of reference outputs
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary of computed metrics
-         """
-         try:
-             if not predictions or not references:
-                 logger.warning("Empty predictions or references provided")
-                 return {}
-
-             # Ensure equal lengths
-             min_len = min(len(predictions), len(references))
-             predictions = predictions[:min_len]
-             references = references[:min_len]
-
-             task_type = self.task_type
-
-             if task_type == "similarity":
-                 return self._compute_aggregate_similarity_metrics(predictions, references)
-             elif task_type == "retrieval":
-                 return self._compute_aggregate_retrieval_metrics(predictions, references)
-             elif task_type == "reranking":
-                 return self._compute_aggregate_reranking_metrics(predictions, references)
-             elif task_type == "clustering":
-                 return self._compute_aggregate_clustering_metrics(predictions, references)
-             else:
-                 # Generic metrics
-                 return {
-                     "total_samples": len(predictions),
-                     "task_type": task_type,
-                     "evaluation_success_rate": 1.0
-                 }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate metrics: {e}")
-             return {"error_rate": 1.0}
-
-     def _compute_aggregate_similarity_metrics(self,
-                                               predictions: List[float],
-                                               references: List[float]) -> Dict[str, float]:
-         """Compute aggregate similarity metrics."""
-         try:
-             # Convert to float if needed
-             pred_vals = [float(p) for p in predictions if p is not None]
-             ref_vals = [float(r) for r in references if r is not None]
-
-             if not pred_vals or not ref_vals:
-                 return {"similarity_error": 1.0}
-
-             # Correlation
-             correlation = self._compute_correlation(pred_vals, ref_vals)
-
-             # Mean absolute error
-             errors = [abs(p - r) for p, r in zip(pred_vals, ref_vals)]
-             mae = np.mean(errors) if errors else 1.0
-
-             # Mean squared error
-             mse = np.mean([(p - r)**2 for p, r in zip(pred_vals, ref_vals)]) if pred_vals else 1.0
-
-             return {
-                 "similarity_correlation": correlation,
-                 "similarity_mae": mae,
-                 "similarity_mse": mse,
-                 "similarity_rmse": np.sqrt(mse),
-                 "total_samples": len(pred_vals)
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate similarity metrics: {e}")
-             return {"similarity_error": 1.0}
-
-     def _compute_aggregate_retrieval_metrics(self,
-                                              predictions: List[List[int]],
-                                              references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate retrieval metrics."""
-         try:
-             all_metrics = {}
-             metric_names = [f"precision_at_{k}" for k in self.k_values] + \
-                            [f"recall_at_{k}" for k in self.k_values] + \
-                            [f"ndcg_at_{k}" for k in self.k_values] + \
-                            ["map", "mrr"]
-
-             # Initialize metric accumulators
-             for metric_name in metric_names:
-                 all_metrics[metric_name] = []
-
-             # Compute metrics for each sample
-             for pred, ref in zip(predictions, references):
-                 if isinstance(pred, list) and isinstance(ref, list):
-                     sample_metrics = self._compute_retrieval_metrics(pred, ref)
-                     for metric_name in metric_names:
-                         if metric_name in sample_metrics:
-                             all_metrics[metric_name].append(sample_metrics[metric_name])
-
-             # Compute averages
-             avg_metrics = {}
-             for metric_name, values in all_metrics.items():
-                 if values:
-                     avg_metrics[f"avg_{metric_name}"] = np.mean(values)
-
-             avg_metrics["total_samples"] = len(predictions)
-             return avg_metrics
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate retrieval metrics: {e}")
-             return {"retrieval_error": 1.0}
-
-     def _compute_aggregate_reranking_metrics(self,
-                                              predictions: List[List[int]],
-                                              references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate reranking metrics."""
-         # Similar to retrieval but focus on improvement metrics
-         return self._compute_aggregate_retrieval_metrics(predictions, references)
-
-     def _compute_aggregate_clustering_metrics(self,
-                                               predictions: List[List[int]],
-                                               references: List[List[int]]) -> Dict[str, float]:
-         """Compute aggregate clustering metrics."""
-         try:
-             ari_scores = []
-             nmi_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 if isinstance(pred, list) and isinstance(ref, list):
-                     sample_metrics = self._compute_clustering_metrics(pred, ref)
-                     ari_scores.append(sample_metrics.get("adjusted_rand_index", 0.0))
-                     nmi_scores.append(sample_metrics.get("normalized_mutual_info", 0.0))
-
-             return {
-                 "avg_adjusted_rand_index": np.mean(ari_scores) if ari_scores else 0.0,
-                 "avg_normalized_mutual_info": np.mean(nmi_scores) if nmi_scores else 0.0,
-                 "avg_clustering_accuracy": np.mean(ari_scores + nmi_scores) / 2 if ari_scores or nmi_scores else 0.0,
-                 "total_samples": len(predictions)
-             }
-
-         except Exception as e:
-             logger.error(f"Error computing aggregate clustering metrics: {e}")
-             return {"clustering_error": 1.0}
-
-     def get_supported_metrics(self) -> List[str]:
-         """Get list of metrics supported by this evaluator."""
-         base_metrics = ["total_samples", "evaluation_success_rate"]
-
-         task_specific_metrics = {
-             "similarity": ["similarity_correlation", "similarity_mae", "similarity_mse", "similarity_rmse"],
-             "retrieval": [f"precision_at_{k}" for k in self.k_values] +
-                          [f"recall_at_{k}" for k in self.k_values] +
-                          [f"ndcg_at_{k}" for k in self.k_values] +
-                          ["map", "mrr"],
-             "reranking": [f"precision_at_{k}_improvement" for k in self.k_values] +
-                          ["reranking_effectiveness"],
-             "clustering": ["adjusted_rand_index", "normalized_mutual_info", "clustering_accuracy"]
-         }
-
-         return base_metrics + task_specific_metrics.get(self.task_type, [])
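For readers assessing what the removal of this module costs downstream code, here is a minimal usage sketch of the deleted 0.4.0 evaluator, based only on the constructor, `evaluate_sample`, and `compute_metrics` signatures visible in the diff above. The stub embedding service, its toy vectors, and the sample values are hypothetical; the sketch targets the 0.4.0 layout and no longer works against 0.4.3.

```python
# Hypothetical driver for the removed EmbeddingEvaluator (0.4.0 layout; gone in 0.4.3).
import asyncio
import numpy as np

from isa_model.eval.evaluators.embedding_evaluator import EmbeddingEvaluator


class FakeEmbedService:
    """Stand-in for an ISA embedding service; exposes the `embed` hook the evaluator probes for."""

    async def embed(self, text: str):
        # Deterministic toy vector so the example is reproducible.
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return {"embedding": rng.normal(size=8).tolist()}


async def main():
    evaluator = EmbeddingEvaluator(config={
        "task_type": "similarity",       # similarity | retrieval | reranking | clustering
        "similarity_metric": "cosine",
        "normalize_embeddings": True,
        "k_values": [1, 5, 10],
    })

    sample = {
        "text1": "A cat sits on the mat.",
        "text2": "A kitten is resting on a rug.",
        "expected_output": 0.8,          # reference similarity score
    }
    result = await evaluator.evaluate_sample(sample, FakeEmbedService())
    print(result["sample_metrics"])

    # Aggregate metrics over predicted vs. reference similarity scores.
    print(evaluator.compute_metrics([result["prediction"]], [0.8]))


if __name__ == "__main__":
    asyncio.run(main())
```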