isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
--- a/isa_model/eval/evaluators/embedding_evaluator.py
+++ /dev/null
@@ -1,742 +0,0 @@
-"""
-Embedding Evaluator for ISA Model evaluation framework.
-
-Provides comprehensive evaluation capabilities for embedding and retrieval tasks including:
-- Semantic similarity evaluation
-- Information retrieval evaluation (Precision@K, Recall@K, NDCG)
-- Reranking effectiveness evaluation
-- Cross-lingual embedding evaluation
-- Document ranking evaluation
-- Clustering evaluation
-
-Supports ISA custom embedding services and standard embedding models.
-"""
-
-import asyncio
-import logging
-import numpy as np
-from typing import Dict, List, Any, Optional, Union, Tuple
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.metrics import precision_score, recall_score, f1_score
-import json
-
-from .base_evaluator import BaseEvaluator, EvaluationResult
-
-logger = logging.getLogger(__name__)
-
-
-class EmbeddingEvaluator(BaseEvaluator):
-    """
-    Comprehensive embedding model evaluator.
-
-    Supports evaluation of:
-    - Semantic similarity tasks (STS, semantic textual similarity)
-    - Information retrieval (IR) tasks with Precision@K, Recall@K, NDCG
-    - Reranking effectiveness (MAP, MRR, NDCG improvements)
-    - Cross-lingual embedding alignment
-    - Document clustering quality
-    - Zero-shot classification accuracy
-    """
-
-    def __init__(self,
-                 config: Optional[Dict[str, Any]] = None,
-                 experiment_tracker: Optional[Any] = None):
-        """
-        Initialize the embedding evaluator.
-
-        Args:
-            config: Evaluation configuration
-            experiment_tracker: Optional experiment tracking instance
-        """
-        super().__init__(
-            evaluator_name="embedding_evaluator",
-            config=config,
-            experiment_tracker=experiment_tracker
-        )
-
-        # Embedding-specific configuration
-        self.embedding_dim = self.config.get("embedding_dim", None)  # Auto-detect if None
-        self.normalize_embeddings = self.config.get("normalize_embeddings", True)
-        self.similarity_metric = self.config.get("similarity_metric", "cosine")  # cosine, dot, euclidean
-
-        # Evaluation task types
-        self.task_type = self.config.get("task_type", "similarity")  # similarity, retrieval, reranking, clustering
-
-        # Retrieval evaluation settings
-        self.k_values = self.config.get("k_values", [1, 5, 10, 20])  # For Precision@K, Recall@K
-        self.relevance_threshold = self.config.get("relevance_threshold", 0.5)
-
-        # Multilingual settings
-        self.enable_multilingual = self.config.get("enable_multilingual", True)
-        self.languages = self.config.get("languages", ["en", "zh", "es", "fr", "de"])
-
-        logger.info(f"Initialized EmbeddingEvaluator for task: {self.task_type}")
-
-    async def evaluate_sample(self,
-                              sample: Dict[str, Any],
-                              model_interface: Any) -> Dict[str, Any]:
-        """
-        Evaluate a single embedding sample.
-
-        Args:
-            sample: Embedding sample containing text and expected output
-            model_interface: Embedding model interface
-
-        Returns:
-            Evaluation result for the sample
-        """
-        try:
-            # Extract sample data
-            text_input = sample.get("text", "")
-            query = sample.get("query", "")
-            documents = sample.get("documents", [])
-            expected_output = sample.get("expected_output")
-            task_type = sample.get("task_type", self.task_type)
-
-            # Get embeddings based on task type
-            if task_type == "similarity":
-                result = await self._evaluate_similarity_sample(
-                    model_interface, text_input, expected_output, sample
-                )
-            elif task_type == "retrieval":
-                result = await self._evaluate_retrieval_sample(
-                    model_interface, query, documents, expected_output, sample
-                )
-            elif task_type == "reranking":
-                result = await self._evaluate_reranking_sample(
-                    model_interface, query, documents, expected_output, sample
-                )
-            elif task_type == "clustering":
-                result = await self._evaluate_clustering_sample(
-                    model_interface, text_input, expected_output, sample
-                )
-            else:
-                # Generic embedding evaluation
-                result = await self._evaluate_generic_sample(
-                    model_interface, text_input, expected_output, sample
-                )
-
-            result["task_type"] = task_type
-            return result
-
-        except Exception as e:
-            logger.error(f"Error evaluating embedding sample: {e}")
-            raise
-
-    async def _evaluate_similarity_sample(self,
-                                          model_interface: Any,
-                                          text_input: str,
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-        """Evaluate semantic similarity task."""
-        try:
-            # Extract text pairs
-            text1 = sample.get("text1", text_input)
-            text2 = sample.get("text2", "")
-            expected_similarity = float(expected_output) if expected_output is not None else 0.0
-
-            # Get embeddings
-            emb1 = await self._get_embedding(model_interface, text1)
-            emb2 = await self._get_embedding(model_interface, text2)
-
-            # Compute similarity
-            predicted_similarity = self._compute_similarity(emb1, emb2)
-
-            # Compute metrics
-            sample_metrics = {
-                "predicted_similarity": predicted_similarity,
-                "expected_similarity": expected_similarity,
-                "similarity_error": abs(predicted_similarity - expected_similarity),
-                "similarity_correlation": self._compute_correlation([predicted_similarity], [expected_similarity])
-            }
-
-            return {
-                "prediction": predicted_similarity,
-                "expected_output": expected_similarity,
-                "sample_metrics": sample_metrics,
-                "embeddings": {"text1": emb1.tolist(), "text2": emb2.tolist()}
-            }
-
-        except Exception as e:
-            logger.error(f"Error evaluating similarity sample: {e}")
-            raise
-
-    async def _evaluate_retrieval_sample(self,
-                                         model_interface: Any,
-                                         query: str,
-                                         documents: List[str],
-                                         expected_output: Any,
-                                         sample: Dict[str, Any]) -> Dict[str, Any]:
-        """Evaluate information retrieval task."""
-        try:
-            # Get query embedding
-            query_embedding = await self._get_embedding(model_interface, query)
-
-            # Get document embeddings
-            doc_embeddings = []
-            for doc in documents:
-                doc_emb = await self._get_embedding(model_interface, doc)
-                doc_embeddings.append(doc_emb)
-
-            if not doc_embeddings:
-                raise ValueError("No documents provided for retrieval evaluation")
-
-            doc_embeddings = np.array(doc_embeddings)
-
-            # Compute similarities
-            similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-
-            # Rank documents
-            ranked_indices = np.argsort(similarities)[::-1]  # Descending order
-
-            # Extract relevance labels
-            relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-            # Compute retrieval metrics
-            sample_metrics = self._compute_retrieval_metrics(ranked_indices, relevance_labels)
-
-            return {
-                "prediction": ranked_indices.tolist(),
-                "expected_output": relevance_labels,
-                "sample_metrics": sample_metrics,
-                "similarities": similarities.tolist(),
-                "query_embedding": query_embedding.tolist()
-            }
-
-        except Exception as e:
-            logger.error(f"Error evaluating retrieval sample: {e}")
-            raise
-
-    async def _evaluate_reranking_sample(self,
-                                         model_interface: Any,
-                                         query: str,
-                                         documents: List[str],
-                                         expected_output: Any,
-                                         sample: Dict[str, Any]) -> Dict[str, Any]:
-        """Evaluate reranking task."""
-        try:
-            # Get initial rankings (if provided)
-            initial_ranking = sample.get("initial_ranking", list(range(len(documents))))
-
-            # Rerank using embedding model
-            if hasattr(model_interface, 'rerank'):
-                # ISA reranking service
-                reranked_results = await model_interface.rerank(query, documents)
-                if isinstance(reranked_results, list):
-                    reranked_indices = [r.get("index", i) for i, r in enumerate(reranked_results)]
-                else:
-                    reranked_indices = list(range(len(documents)))
-            else:
-                # Use embedding similarity for reranking
-                query_embedding = await self._get_embedding(model_interface, query)
-                doc_embeddings = []
-                for doc in documents:
-                    doc_emb = await self._get_embedding(model_interface, doc)
-                    doc_embeddings.append(doc_emb)
-
-                doc_embeddings = np.array(doc_embeddings)
-                similarities = self._compute_similarity_matrix(query_embedding, doc_embeddings)
-                reranked_indices = np.argsort(similarities)[::-1].tolist()
-
-            # Extract relevance labels
-            relevance_labels = expected_output if isinstance(expected_output, list) else []
-
-            # Compute reranking metrics
-            initial_metrics = self._compute_retrieval_metrics(initial_ranking, relevance_labels)
-            reranked_metrics = self._compute_retrieval_metrics(reranked_indices, relevance_labels)
-
-            # Compute improvement
-            improvement_metrics = {}
-            for metric_name in ["precision_at_1", "precision_at_5", "ndcg_at_10"]:
-                initial_score = initial_metrics.get(metric_name, 0.0)
-                reranked_score = reranked_metrics.get(metric_name, 0.0)
-                improvement_metrics[f"{metric_name}_improvement"] = reranked_score - initial_score
-
-            sample_metrics = {
-                **reranked_metrics,
-                **improvement_metrics,
-                "reranking_effectiveness": np.mean(list(improvement_metrics.values()))
-            }
-
-            return {
-                "prediction": reranked_indices,
-                "expected_output": relevance_labels,
-                "sample_metrics": sample_metrics,
-                "initial_ranking": initial_ranking,
-                "reranked_ranking": reranked_indices
-            }
-
-        except Exception as e:
-            logger.error(f"Error evaluating reranking sample: {e}")
-            raise
-
-    async def _evaluate_clustering_sample(self,
-                                          model_interface: Any,
-                                          text_input: Union[str, List[str]],
-                                          expected_output: Any,
-                                          sample: Dict[str, Any]) -> Dict[str, Any]:
-        """Evaluate clustering task."""
-        try:
-            # Extract texts for clustering
-            texts = text_input if isinstance(text_input, list) else sample.get("texts", [text_input])
-            expected_clusters = expected_output if isinstance(expected_output, list) else []
-
-            # Get embeddings
-            embeddings = []
-            for text in texts:
-                emb = await self._get_embedding(model_interface, text)
-                embeddings.append(emb)
-
-            embeddings = np.array(embeddings)
-
-            # Perform clustering (simple k-means)
-            from sklearn.cluster import KMeans
-
-            n_clusters = len(set(expected_clusters)) if expected_clusters else 2
-            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-            predicted_clusters = kmeans.fit_predict(embeddings)
-
-            # Compute clustering metrics
-            sample_metrics = self._compute_clustering_metrics(predicted_clusters, expected_clusters)
-
-            return {
-                "prediction": predicted_clusters.tolist(),
-                "expected_output": expected_clusters,
-                "sample_metrics": sample_metrics,
-                "embeddings": embeddings.tolist()
-            }
-
-        except Exception as e:
-            logger.error(f"Error evaluating clustering sample: {e}")
-            raise
-
-    async def _evaluate_generic_sample(self,
-                                       model_interface: Any,
-                                       text_input: str,
-                                       expected_output: Any,
-                                       sample: Dict[str, Any]) -> Dict[str, Any]:
-        """Evaluate generic embedding task."""
-        try:
-            # Get embedding
-            embedding = await self._get_embedding(model_interface, text_input)
-
-            # Basic embedding quality metrics
-            sample_metrics = {
-                "embedding_norm": float(np.linalg.norm(embedding)),
-                "embedding_mean": float(np.mean(embedding)),
-                "embedding_std": float(np.std(embedding)),
-                "embedding_dimension": len(embedding)
-            }
-
-            return {
-                "prediction": embedding.tolist(),
-                "expected_output": expected_output,
-                "sample_metrics": sample_metrics,
-                "embedding": embedding.tolist()
-            }
-
-        except Exception as e:
-            logger.error(f"Error evaluating generic sample: {e}")
-            raise
-
-    async def _get_embedding(self, model_interface: Any, text: str) -> np.ndarray:
-        """Get embedding from model interface."""
-        try:
-            if hasattr(model_interface, 'embed'):
-                # ISA embedding service
-                result = await model_interface.embed(text)
-                if isinstance(result, dict):
-                    embedding = result.get("embedding", result.get("vector", []))
-                else:
-                    embedding = result
-            elif hasattr(model_interface, 'encode'):
-                # Standard embedding interface
-                embedding = await model_interface.encode(text)
-            else:
-                # Generic interface
-                embedding = await model_interface.predict(text)
-
-            # Convert to numpy array
-            embedding = np.array(embedding, dtype=np.float32)
-
-            # Normalize if configured
-            if self.normalize_embeddings:
-                norm = np.linalg.norm(embedding)
-                if norm > 0:
-                    embedding = embedding / norm
-
-            return embedding
-
-        except Exception as e:
-            logger.error(f"Error getting embedding: {e}")
-            raise
-
-    def _compute_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
-        """Compute similarity between two embeddings."""
-        try:
-            if self.similarity_metric == "cosine":
-                # Cosine similarity
-                return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-            elif self.similarity_metric == "dot":
-                # Dot product
-                return float(np.dot(emb1, emb2))
-            elif self.similarity_metric == "euclidean":
-                # Negative euclidean distance (higher = more similar)
-                return float(-np.linalg.norm(emb1 - emb2))
-            else:
-                # Default to cosine
-                return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
-
-        except Exception as e:
-            logger.error(f"Error computing similarity: {e}")
-            return 0.0
-
-    def _compute_similarity_matrix(self, query_emb: np.ndarray, doc_embs: np.ndarray) -> np.ndarray:
-        """Compute similarity matrix between query and documents."""
-        try:
-            if self.similarity_metric == "cosine":
-                # Reshape for sklearn cosine_similarity
-                query_emb = query_emb.reshape(1, -1)
-                similarities = cosine_similarity(query_emb, doc_embs)[0]
-            elif self.similarity_metric == "dot":
-                similarities = np.dot(doc_embs, query_emb)
-            elif self.similarity_metric == "euclidean":
-                similarities = -np.linalg.norm(doc_embs - query_emb, axis=1)
-            else:
-                # Default to cosine
-                query_emb = query_emb.reshape(1, -1)
-                similarities = cosine_similarity(query_emb, doc_embs)[0]
-
-            return similarities
-
-        except Exception as e:
-            logger.error(f"Error computing similarity matrix: {e}")
-            return np.zeros(len(doc_embs))
-
-    def _compute_retrieval_metrics(self,
-                                   ranked_indices: List[int],
-                                   relevance_labels: List[int]) -> Dict[str, float]:
-        """Compute information retrieval metrics."""
-        try:
-            if not relevance_labels:
-                return {"retrieval_error": 1.0}
-
-            metrics = {}
-            n_docs = len(ranked_indices)
-
-            # Ensure relevance labels match document count
-            relevance_labels = relevance_labels[:n_docs] + [0] * max(0, n_docs - len(relevance_labels))
-
-            # Compute metrics for different K values
-            for k in self.k_values:
-                if k > n_docs:
-                    continue
-
-                # Get top-k predictions
-                top_k_indices = ranked_indices[:k]
-                top_k_relevance = [relevance_labels[i] for i in top_k_indices]
-
-                # Precision@K
-                precision_k = sum(top_k_relevance) / k if k > 0 else 0.0
-                metrics[f"precision_at_{k}"] = precision_k
-
-                # Recall@K
-                total_relevant = sum(relevance_labels)
-                recall_k = sum(top_k_relevance) / total_relevant if total_relevant > 0 else 0.0
-                metrics[f"recall_at_{k}"] = recall_k
-
-                # F1@K
-                if precision_k + recall_k > 0:
-                    f1_k = 2 * precision_k * recall_k / (precision_k + recall_k)
-                else:
-                    f1_k = 0.0
-                metrics[f"f1_at_{k}"] = f1_k
-
-            # NDCG@K for different K values
-            for k in self.k_values:
-                if k > n_docs:
-                    continue
-                ndcg_k = self._compute_ndcg(ranked_indices, relevance_labels, k)
-                metrics[f"ndcg_at_{k}"] = ndcg_k
-
-            # Mean Average Precision (MAP)
-            metrics["map"] = self._compute_map(ranked_indices, relevance_labels)
-
-            # Mean Reciprocal Rank (MRR)
-            metrics["mrr"] = self._compute_mrr(ranked_indices, relevance_labels)
-
-            return metrics
-
-        except Exception as e:
-            logger.error(f"Error computing retrieval metrics: {e}")
-            return {"retrieval_error": 1.0}
-
-    def _compute_ndcg(self, ranked_indices: List[int], relevance_labels: List[int], k: int) -> float:
-        """Compute Normalized Discounted Cumulative Gain@K."""
-        try:
-            # DCG@K
-            dcg = 0.0
-            for i, doc_idx in enumerate(ranked_indices[:k]):
-                if doc_idx < len(relevance_labels):
-                    relevance = relevance_labels[doc_idx]
-                    dcg += relevance / np.log2(i + 2)  # i+2 because log2(1) = 0
-
-            # IDCG@K (Ideal DCG)
-            sorted_relevance = sorted(relevance_labels, reverse=True)
-            idcg = 0.0
-            for i, relevance in enumerate(sorted_relevance[:k]):
-                idcg += relevance / np.log2(i + 2)
-
-            # NDCG@K
-            ndcg = dcg / idcg if idcg > 0 else 0.0
-            return ndcg
-
-        except Exception as e:
-            logger.error(f"Error computing NDCG: {e}")
-            return 0.0
-
-    def _compute_map(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-        """Compute Mean Average Precision."""
-        try:
-            if not any(relevance_labels):
-                return 0.0
-
-            precision_sum = 0.0
-            relevant_count = 0
-
-            for i, doc_idx in enumerate(ranked_indices):
-                if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                    relevant_count += 1
-                    precision_at_i = relevant_count / (i + 1)
-                    precision_sum += precision_at_i
-
-            total_relevant = sum(1 for r in relevance_labels if r > 0)
-            map_score = precision_sum / total_relevant if total_relevant > 0 else 0.0
-
-            return map_score
-
-        except Exception as e:
-            logger.error(f"Error computing MAP: {e}")
-            return 0.0
-
-    def _compute_mrr(self, ranked_indices: List[int], relevance_labels: List[int]) -> float:
-        """Compute Mean Reciprocal Rank."""
-        try:
-            for i, doc_idx in enumerate(ranked_indices):
-                if doc_idx < len(relevance_labels) and relevance_labels[doc_idx] > 0:
-                    return 1.0 / (i + 1)
-            return 0.0
-
-        except Exception as e:
-            logger.error(f"Error computing MRR: {e}")
-            return 0.0
-
-    def _compute_clustering_metrics(self,
-                                    predicted_clusters: List[int],
-                                    expected_clusters: List[int]) -> Dict[str, float]:
-        """Compute clustering evaluation metrics."""
-        try:
-            if not expected_clusters:
-                return {"clustering_error": 1.0}
-
-            # Ensure equal lengths
-            min_len = min(len(predicted_clusters), len(expected_clusters))
-            predicted_clusters = predicted_clusters[:min_len]
-            expected_clusters = expected_clusters[:min_len]
-
-            # Adjusted Rand Index (ARI)
-            from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
-
-            ari = adjusted_rand_score(expected_clusters, predicted_clusters)
-            nmi = normalized_mutual_info_score(expected_clusters, predicted_clusters)
-
-            # Silhouette score would require embeddings
-            return {
-                "adjusted_rand_index": ari,
-                "normalized_mutual_info": nmi,
-                "clustering_accuracy": (ari + nmi) / 2
-            }
-
-        except Exception as e:
-            logger.error(f"Error computing clustering metrics: {e}")
-            return {"clustering_error": 1.0}
-
-    def _compute_correlation(self, predictions: List[float], references: List[float]) -> float:
-        """Compute correlation between predictions and references."""
-        try:
-            if len(predictions) < 2 or len(references) < 2:
-                return 0.0
-
-            from scipy.stats import pearsonr
-            correlation, _ = pearsonr(predictions, references)
-            return float(correlation) if not np.isnan(correlation) else 0.0
-
-        except Exception as e:
-            logger.error(f"Error computing correlation: {e}")
-            return 0.0
-
-    def compute_metrics(self,
-                        predictions: List[Any],
-                        references: List[Any],
-                        **kwargs) -> Dict[str, float]:
-        """
-        Compute aggregate embedding evaluation metrics.
-
-        Args:
-            predictions: List of model predictions
-            references: List of reference outputs
-            **kwargs: Additional parameters
-
-        Returns:
-            Dictionary of computed metrics
-        """
-        try:
-            if not predictions or not references:
-                logger.warning("Empty predictions or references provided")
-                return {}
-
-            # Ensure equal lengths
-            min_len = min(len(predictions), len(references))
-            predictions = predictions[:min_len]
-            references = references[:min_len]
-
-            task_type = self.task_type
-
-            if task_type == "similarity":
-                return self._compute_aggregate_similarity_metrics(predictions, references)
-            elif task_type == "retrieval":
-                return self._compute_aggregate_retrieval_metrics(predictions, references)
-            elif task_type == "reranking":
-                return self._compute_aggregate_reranking_metrics(predictions, references)
-            elif task_type == "clustering":
-                return self._compute_aggregate_clustering_metrics(predictions, references)
-            else:
-                # Generic metrics
-                return {
-                    "total_samples": len(predictions),
-                    "task_type": task_type,
-                    "evaluation_success_rate": 1.0
-                }
-
-        except Exception as e:
-            logger.error(f"Error computing aggregate metrics: {e}")
-            return {"error_rate": 1.0}
-
-    def _compute_aggregate_similarity_metrics(self,
-                                              predictions: List[float],
-                                              references: List[float]) -> Dict[str, float]:
-        """Compute aggregate similarity metrics."""
-        try:
-            # Convert to float if needed
-            pred_vals = [float(p) for p in predictions if p is not None]
-            ref_vals = [float(r) for r in references if r is not None]
-
-            if not pred_vals or not ref_vals:
-                return {"similarity_error": 1.0}
-
-            # Correlation
-            correlation = self._compute_correlation(pred_vals, ref_vals)
-
-            # Mean absolute error
-            errors = [abs(p - r) for p, r in zip(pred_vals, ref_vals)]
-            mae = np.mean(errors) if errors else 1.0
-
-            # Mean squared error
-            mse = np.mean([(p - r)**2 for p, r in zip(pred_vals, ref_vals)]) if pred_vals else 1.0
-
-            return {
-                "similarity_correlation": correlation,
-                "similarity_mae": mae,
-                "similarity_mse": mse,
-                "similarity_rmse": np.sqrt(mse),
-                "total_samples": len(pred_vals)
-            }
-
-        except Exception as e:
-            logger.error(f"Error computing aggregate similarity metrics: {e}")
-            return {"similarity_error": 1.0}
-
-    def _compute_aggregate_retrieval_metrics(self,
-                                             predictions: List[List[int]],
-                                             references: List[List[int]]) -> Dict[str, float]:
-        """Compute aggregate retrieval metrics."""
-        try:
-            all_metrics = {}
-            metric_names = [f"precision_at_{k}" for k in self.k_values] + \
-                           [f"recall_at_{k}" for k in self.k_values] + \
-                           [f"ndcg_at_{k}" for k in self.k_values] + \
-                           ["map", "mrr"]
-
-            # Initialize metric accumulators
-            for metric_name in metric_names:
-                all_metrics[metric_name] = []
-
-            # Compute metrics for each sample
-            for pred, ref in zip(predictions, references):
-                if isinstance(pred, list) and isinstance(ref, list):
-                    sample_metrics = self._compute_retrieval_metrics(pred, ref)
-                    for metric_name in metric_names:
-                        if metric_name in sample_metrics:
-                            all_metrics[metric_name].append(sample_metrics[metric_name])
-
-            # Compute averages
-            avg_metrics = {}
-            for metric_name, values in all_metrics.items():
-                if values:
-                    avg_metrics[f"avg_{metric_name}"] = np.mean(values)
-
-            avg_metrics["total_samples"] = len(predictions)
-            return avg_metrics
-
-        except Exception as e:
-            logger.error(f"Error computing aggregate retrieval metrics: {e}")
-            return {"retrieval_error": 1.0}
-
-    def _compute_aggregate_reranking_metrics(self,
-                                             predictions: List[List[int]],
-                                             references: List[List[int]]) -> Dict[str, float]:
-        """Compute aggregate reranking metrics."""
-        # Similar to retrieval but focus on improvement metrics
-        return self._compute_aggregate_retrieval_metrics(predictions, references)
-
-    def _compute_aggregate_clustering_metrics(self,
-                                              predictions: List[List[int]],
-                                              references: List[List[int]]) -> Dict[str, float]:
-        """Compute aggregate clustering metrics."""
-        try:
-            ari_scores = []
-            nmi_scores = []
-
-            for pred, ref in zip(predictions, references):
-                if isinstance(pred, list) and isinstance(ref, list):
-                    sample_metrics = self._compute_clustering_metrics(pred, ref)
-                    ari_scores.append(sample_metrics.get("adjusted_rand_index", 0.0))
-                    nmi_scores.append(sample_metrics.get("normalized_mutual_info", 0.0))
-
-            return {
-                "avg_adjusted_rand_index": np.mean(ari_scores) if ari_scores else 0.0,
-                "avg_normalized_mutual_info": np.mean(nmi_scores) if nmi_scores else 0.0,
-                "avg_clustering_accuracy": np.mean(ari_scores + nmi_scores) / 2 if ari_scores or nmi_scores else 0.0,
-                "total_samples": len(predictions)
-            }
-
-        except Exception as e:
-            logger.error(f"Error computing aggregate clustering metrics: {e}")
-            return {"clustering_error": 1.0}
-
-    def get_supported_metrics(self) -> List[str]:
-        """Get list of metrics supported by this evaluator."""
-        base_metrics = ["total_samples", "evaluation_success_rate"]
-
-        task_specific_metrics = {
-            "similarity": ["similarity_correlation", "similarity_mae", "similarity_mse", "similarity_rmse"],
-            "retrieval": [f"precision_at_{k}" for k in self.k_values] +
-                         [f"recall_at_{k}" for k in self.k_values] +
-                         [f"ndcg_at_{k}" for k in self.k_values] +
-                         ["map", "mrr"],
-            "reranking": [f"precision_at_{k}_improvement" for k in self.k_values] +
-                         ["reranking_effectiveness"],
-            "clustering": ["adjusted_rand_index", "normalized_mutual_info", "clustering_accuracy"]
-        }
-
-        return base_metrics + task_specific_metrics.get(self.task_type, [])