isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/eval/metrics.py
DELETED
@@ -1,951 +0,0 @@
-"""
-Evaluation Metrics for ISA Model Framework
-
-This module provides various metrics for evaluating AI models:
-- LLM metrics: perplexity, BLEU, ROUGE, accuracy, etc.
-- Image metrics: FID, IS, LPIPS, etc.
-- Custom metrics and benchmark runners
-"""
-
-import os
-import json
-import logging
-import numpy as np
-from typing import Dict, List, Any, Optional, Union
-from enum import Enum
-from abc import ABC, abstractmethod
-
-try:
-    from ..inference.ai_factory import AIFactory
-    AI_FACTORY_AVAILABLE = True
-except ImportError:
-    AI_FACTORY_AVAILABLE = False
-
-logger = logging.getLogger(__name__)
-
-
-class MetricType(str, Enum):
-    """Types of evaluation metrics."""
-    PERPLEXITY = "perplexity"
-    BLEU = "bleu"
-    ROUGE = "rouge"
-    ACCURACY = "accuracy"
-    F1_SCORE = "f1"
-    DIVERSITY = "diversity"
-    COHERENCE = "coherence"
-    FLUENCY = "fluency"
-    FID = "fid"
-    IS = "is"
-    LPIPS = "lpips"
-
-
-class BaseMetric(ABC):
-    """Base class for all metrics."""
-
-    @abstractmethod
-    def compute(self, predictions: List[str], references: List[str] = None, **kwargs) -> Dict[str, float]:
-        """Compute the metric."""
-        pass
-
-
-class LLMMetrics:
-    """
-    Metrics calculator for Language Models.
-
-    Supports various metrics including:
-    - Perplexity
-    - BLEU score
-    - ROUGE score
-    - Accuracy
-    - F1 score
-    - Generation quality metrics
-    """
-
-    def __init__(self):
-        self.available_metrics = [
-            MetricType.PERPLEXITY,
-            MetricType.BLEU,
-            MetricType.ROUGE,
-            MetricType.ACCURACY,
-            MetricType.F1_SCORE,
-            MetricType.DIVERSITY,
-            MetricType.COHERENCE,
-            MetricType.FLUENCY
-        ]
-
-        # Initialize AI factory if available
-        if AI_FACTORY_AVAILABLE:
-            try:
-                self.ai_factory = AIFactory()
-            except Exception as e:
-                logger.warning(f"Failed to initialize AIFactory: {e}")
-                self.ai_factory = None
-        else:
-            self.ai_factory = None
-
-    async def evaluate(
-        self,
-        model_path: str,
-        dataset: List[Dict[str, Any]],
-        metrics: List[str],
-        batch_size: int = 8,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate LLM on dataset with specified metrics.
-
-        Args:
-            model_path: Path to the model
-            dataset: Evaluation dataset
-            metrics: List of metrics to compute
-            batch_size: Batch size for evaluation
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Dictionary with metric results
-        """
-        results = {
-            "model_path": model_path,
-            "num_samples": len(dataset),
-            "metrics": {}
-        }
-
-        # Generate predictions
-        predictions, references = await self._generate_predictions(
-            model_path, dataset, batch_size, provider, **kwargs
-        )
-
-        # Compute each metric
-        for metric in metrics:
-            try:
-                if metric == MetricType.PERPLEXITY:
-                    score = self._compute_perplexity(predictions, references)
-                elif metric == MetricType.BLEU:
-                    score = self._compute_bleu(predictions, references)
-                elif metric == MetricType.ROUGE:
-                    score = self._compute_rouge(predictions, references)
-                elif metric == MetricType.ACCURACY:
-                    score = self._compute_accuracy(predictions, references)
-                elif metric == MetricType.F1_SCORE:
-                    score = self._compute_f1(predictions, references)
-                elif metric == MetricType.DIVERSITY:
-                    score = self._compute_diversity(predictions)
-                elif metric == MetricType.COHERENCE:
-                    score = self._compute_coherence(predictions)
-                elif metric == MetricType.FLUENCY:
-                    score = self._compute_fluency(predictions)
-                else:
-                    logger.warning(f"Unknown metric: {metric}")
-                    continue
-
-                results["metrics"][metric] = score
-                logger.info(f"Computed {metric}: {score}")
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    async def evaluate_generation(
-        self,
-        model_path: str,
-        prompts: List[str],
-        reference_texts: List[str] = None,
-        metrics: List[str] = None,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate text generation quality.
-
-        Args:
-            model_path: Path to the model
-            prompts: Input prompts
-            reference_texts: Reference texts (optional)
-            metrics: Metrics to compute
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Generation evaluation results
-        """
-        if metrics is None:
-            metrics = [MetricType.DIVERSITY, MetricType.COHERENCE, MetricType.FLUENCY]
-
-        # Generate texts
-        generated_texts = self._generate_texts(model_path, prompts, provider, **kwargs)
-
-        results = {
-            "model_path": model_path,
-            "num_prompts": len(prompts),
-            "metrics": {}
-        }
-
-        # Compute metrics
-        for metric in metrics:
-            try:
-                if metric == MetricType.DIVERSITY:
-                    score = self._compute_diversity(generated_texts)
-                elif metric == MetricType.COHERENCE:
-                    score = self._compute_coherence(generated_texts)
-                elif metric == MetricType.FLUENCY:
-                    score = self._compute_fluency(generated_texts)
-                elif metric == MetricType.BLEU and reference_texts:
-                    score = self._compute_bleu(generated_texts, reference_texts)
-                elif metric == MetricType.ROUGE and reference_texts:
-                    score = self._compute_rouge(generated_texts, reference_texts)
-                else:
-                    continue
-
-                results["metrics"][metric] = score
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    async def _generate_predictions(
-        self,
-        model_path: str,
-        dataset: List[Dict[str, Any]],
-        batch_size: int,
-        provider: str,
-        **kwargs
-    ) -> tuple:
-        """Generate predictions from model using actual inference."""
-        predictions = []
-        references = []
-
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder predictions")
-            # Fallback to placeholder predictions
-            for item in dataset:
-                if isinstance(item, dict):
-                    if "input" in item and "output" in item:
-                        predictions.append(f"Generated response for: {item['input']}")
-                        references.append(item["output"])
-                    elif "prompt" in item and "response" in item:
-                        predictions.append(f"Generated response for: {item['prompt']}")
-                        references.append(item["response"])
-            return predictions, references
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            # Process dataset in batches
-            for i in range(0, len(dataset), batch_size):
-                batch = dataset[i:i + batch_size]
-                batch_predictions = []
-                batch_references = []
-
-                for item in batch:
-                    if isinstance(item, dict):
-                        prompt = None
-                        reference = None
-
-                        # Extract prompt and reference based on data format
-                        if "input" in item and "output" in item:
-                            prompt = item["input"]
-                            reference = item["output"]
-                        elif "prompt" in item and "response" in item:
-                            prompt = item["prompt"]
-                            reference = item["response"]
-                        elif "question" in item and "answer" in item:
-                            prompt = item["question"]
-                            reference = item["answer"]
-                        elif "text" in item and "label" in item:
-                            prompt = item["text"]
-                            reference = str(item["label"])
-
-                        if prompt and reference:
-                            try:
-                                # Generate prediction using actual model
-                                response = await llm_service.ainvoke(prompt)
-
-                                # Extract text from response
-                                if hasattr(response, 'text'):
-                                    prediction = response.text
-                                elif isinstance(response, dict) and 'text' in response:
-                                    prediction = response['text']
-                                elif isinstance(response, str):
-                                    prediction = response
-                                else:
-                                    prediction = str(response)
-
-                                batch_predictions.append(prediction.strip())
-                                batch_references.append(reference)
-
-                            except Exception as e:
-                                logger.error(f"Failed to generate prediction for item: {e}")
-                                # Use fallback prediction
-                                batch_predictions.append(f"Error generating prediction: {str(e)}")
-                                batch_references.append(reference)
-
-                predictions.extend(batch_predictions)
-                references.extend(batch_references)
-
-                logger.info(f"Processed batch {i//batch_size + 1}/{(len(dataset) + batch_size - 1)//batch_size}")
-
-        except Exception as e:
-            logger.error(f"Failed to use AIFactory for predictions: {e}")
-            # Fallback to placeholder predictions
-            for item in dataset:
-                if isinstance(item, dict):
-                    if "input" in item and "output" in item:
-                        predictions.append(f"Generated response for: {item['input']}")
-                        references.append(item["output"])
-                    elif "prompt" in item and "response" in item:
-                        predictions.append(f"Generated response for: {item['prompt']}")
-                        references.append(item["response"])
-
-        logger.info(f"Generated {len(predictions)} predictions")
-        return predictions, references
-
-    async def _generate_texts(
-        self,
-        model_path: str,
-        prompts: List[str],
-        provider: str,
-        **kwargs
-    ) -> List[str]:
-        """Generate texts from prompts using actual model inference."""
-        generated_texts = []
-
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder text generation")
-            # Fallback to placeholder generation
-            for prompt in prompts:
-                generated_texts.append(f"Generated response for: {prompt}")
-            return generated_texts
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            for prompt in prompts:
-                try:
-                    # Generate text using actual model
-                    response = await llm_service.ainvoke(prompt)
-
-                    # Extract text from response
-                    if hasattr(response, 'text'):
-                        generated_text = response.text
-                    elif isinstance(response, dict) and 'text' in response:
-                        generated_text = response['text']
-                    elif isinstance(response, str):
-                        generated_text = response
-                    else:
-                        generated_text = str(response)
-
-                    generated_texts.append(generated_text.strip())
-
-                except Exception as e:
-                    logger.error(f"Failed to generate text for prompt: {e}")
-                    # Use fallback generation
-                    generated_texts.append(f"Error generating text: {str(e)}")
-
-        except Exception as e:
-            logger.error(f"Failed to use AIFactory for text generation: {e}")
-            # Fallback to placeholder generation
-            for prompt in prompts:
-                generated_texts.append(f"Generated response for: {prompt}")
-
-        return generated_texts
-
-    def _compute_perplexity(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute perplexity score (simplified implementation)."""
-        # This is a placeholder - actual perplexity requires model probabilities
-        return {
-            "perplexity": np.random.uniform(10, 100),  # Placeholder
-            "log_perplexity": np.random.uniform(2, 5)
-        }
-
-    def _compute_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute BLEU score (simplified implementation)."""
-        try:
-            # Placeholder implementation - use actual BLEU calculation
-            # from nltk.translate.bleu_score import sentence_bleu
-            scores = []
-            for pred, ref in zip(predictions, references):
-                # Simplified BLEU calculation
-                pred_words = pred.lower().split()
-                ref_words = ref.lower().split()
-
-                # Simple overlap calculation (not actual BLEU)
-                overlap = len(set(pred_words) & set(ref_words))
-                total = len(set(pred_words) | set(ref_words))
-
-                if total > 0:
-                    scores.append(overlap / total)
-                else:
-                    scores.append(0.0)
-
-            return {
-                "bleu": np.mean(scores),
-                "bleu_std": np.std(scores)
-            }
-        except Exception as e:
-            logger.error(f"BLEU computation failed: {e}")
-            return {"bleu": 0.0, "error": str(e)}
-
-    def _compute_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute ROUGE score (simplified implementation)."""
-        try:
-            rouge_1_scores = []
-            rouge_l_scores = []
-
-            for pred, ref in zip(predictions, references):
-                pred_words = set(pred.lower().split())
-                ref_words = set(ref.lower().split())
-
-                # ROUGE-1 (unigram overlap)
-                if len(ref_words) > 0:
-                    rouge_1 = len(pred_words & ref_words) / len(ref_words)
-                    rouge_1_scores.append(rouge_1)
-
-                # Simplified ROUGE-L (longest common subsequence)
-                rouge_l = len(pred_words & ref_words) / max(len(pred_words), len(ref_words), 1)
-                rouge_l_scores.append(rouge_l)
-
-            return {
-                "rouge_1": np.mean(rouge_1_scores),
-                "rouge_l": np.mean(rouge_l_scores),
-                "rouge_1_std": np.std(rouge_1_scores),
-                "rouge_l_std": np.std(rouge_l_scores)
-            }
-        except Exception as e:
-            logger.error(f"ROUGE computation failed: {e}")
-            return {"rouge_1": 0.0, "rouge_l": 0.0, "error": str(e)}
-
-    def _compute_accuracy(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute accuracy score."""
-        try:
-            correct = 0
-            total = len(predictions)
-
-            for pred, ref in zip(predictions, references):
-                if pred.strip().lower() == ref.strip().lower():
-                    correct += 1
-
-            accuracy = correct / total if total > 0 else 0.0
-
-            return {
-                "accuracy": accuracy,
-                "correct": correct,
-                "total": total
-            }
-        except Exception as e:
-            logger.error(f"Accuracy computation failed: {e}")
-            return {"accuracy": 0.0, "error": str(e)}
-
-    def _compute_f1(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute F1 score (simplified implementation)."""
-        try:
-            f1_scores = []
-
-            for pred, ref in zip(predictions, references):
-                pred_words = set(pred.lower().split())
-                ref_words = set(ref.lower().split())
-
-                if len(pred_words) == 0 and len(ref_words) == 0:
-                    f1_scores.append(1.0)
-                elif len(pred_words) == 0 or len(ref_words) == 0:
-                    f1_scores.append(0.0)
-                else:
-                    intersection = len(pred_words & ref_words)
-                    precision = intersection / len(pred_words)
-                    recall = intersection / len(ref_words)
-
-                    if precision + recall > 0:
-                        f1 = 2 * (precision * recall) / (precision + recall)
-                        f1_scores.append(f1)
-                    else:
-                        f1_scores.append(0.0)
-
-            return {
-                "f1": np.mean(f1_scores),
-                "f1_std": np.std(f1_scores)
-            }
-        except Exception as e:
-            logger.error(f"F1 computation failed: {e}")
-            return {"f1": 0.0, "error": str(e)}
-
-    def _compute_diversity(self, texts: List[str]) -> Dict[str, float]:
-        """Compute diversity metrics."""
-        try:
-            # Distinct-1 and Distinct-2
-            all_unigrams = []
-            all_bigrams = []
-
-            for text in texts:
-                words = text.lower().split()
-                all_unigrams.extend(words)
-
-                # Create bigrams
-                for i in range(len(words) - 1):
-                    all_bigrams.append((words[i], words[i + 1]))
-
-            distinct_1 = len(set(all_unigrams)) / len(all_unigrams) if all_unigrams else 0
-            distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
-
-            return {
-                "distinct_1": distinct_1,
-                "distinct_2": distinct_2,
-                "vocab_size": len(set(all_unigrams))
-            }
-        except Exception as e:
-            logger.error(f"Diversity computation failed: {e}")
-            return {"distinct_1": 0.0, "distinct_2": 0.0, "error": str(e)}
-
-    def _compute_coherence(self, texts: List[str]) -> Dict[str, float]:
-        """Compute coherence score (simplified implementation)."""
-        try:
-            # Simplified coherence based on sentence length consistency
-            coherence_scores = []
-
-            for text in texts:
-                sentences = text.split('.')
-                if len(sentences) > 1:
-                    lengths = [len(s.split()) for s in sentences if s.strip()]
-                    if lengths:
-                        # Coherence as inverse of length variance
-                        coherence = 1.0 / (1.0 + np.var(lengths))
-                        coherence_scores.append(coherence)
-                    else:
-                        coherence_scores.append(0.5)
-                else:
-                    coherence_scores.append(0.5)
-
-            return {
-                "coherence": np.mean(coherence_scores),
-                "coherence_std": np.std(coherence_scores)
-            }
-        except Exception as e:
-            logger.error(f"Coherence computation failed: {e}")
-            return {"coherence": 0.5, "error": str(e)}
-
-    def _compute_fluency(self, texts: List[str]) -> Dict[str, float]:
-        """Compute fluency score (simplified implementation)."""
-        try:
-            fluency_scores = []
-
-            for text in texts:
-                # Simplified fluency based on word count and sentence structure
-                words = text.split()
-                sentences = text.split('.')
-
-                if len(words) > 0 and len(sentences) > 0:
-                    avg_words_per_sentence = len(words) / len(sentences)
-                    # Fluency based on reasonable sentence length (5-20 words)
-                    if 5 <= avg_words_per_sentence <= 20:
-                        fluency = 1.0
-                    else:
-                        fluency = max(0.0, 1.0 - abs(avg_words_per_sentence - 12.5) / 12.5)
-
-                    fluency_scores.append(fluency)
-                else:
-                    fluency_scores.append(0.0)
-
-            return {
-                "fluency": np.mean(fluency_scores),
-                "fluency_std": np.std(fluency_scores)
-            }
-        except Exception as e:
-            logger.error(f"Fluency computation failed: {e}")
-            return {"fluency": 0.0, "error": str(e)}
-
-
-class ImageMetrics:
-    """
-    Metrics calculator for Image Generation Models.
-
-    Supports metrics including:
-    - FID (Fréchet Inception Distance)
-    - IS (Inception Score)
-    - LPIPS (Learned Perceptual Image Patch Similarity)
-    """
-
-    def __init__(self):
-        self.available_metrics = [
-            MetricType.FID,
-            MetricType.IS,
-            MetricType.LPIPS
-        ]
-
-    def evaluate(
-        self,
-        model_path: str,
-        test_images_dir: str,
-        reference_images_dir: Optional[str] = None,
-        metrics: List[str] = None,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate image generation model.
-
-        Args:
-            model_path: Path to the image model
-            test_images_dir: Directory with test images
-            reference_images_dir: Directory with reference images
-            metrics: Metrics to compute
-            **kwargs: Additional parameters
-
-        Returns:
-            Image evaluation results
-        """
-        if metrics is None:
-            metrics = [MetricType.FID, MetricType.IS]
-
-        results = {
-            "model_path": model_path,
-            "test_images_dir": test_images_dir,
-            "reference_images_dir": reference_images_dir,
-            "metrics": {}
-        }
-
-        for metric in metrics:
-            try:
-                if metric == MetricType.FID:
-                    score = self._compute_fid(test_images_dir, reference_images_dir)
-                elif metric == MetricType.IS:
-                    score = self._compute_is(test_images_dir)
-                elif metric == MetricType.LPIPS:
-                    score = self._compute_lpips(test_images_dir, reference_images_dir)
-                else:
-                    logger.warning(f"Unknown image metric: {metric}")
-                    continue
-
-                results["metrics"][metric] = score
-                logger.info(f"Computed {metric}: {score}")
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    def _compute_fid(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-        """Compute FID score (placeholder implementation)."""
-        # This is a placeholder - actual FID requires complex neural network computations
-        logger.warning("FID computation not fully implemented - returning placeholder")
-        return {
-            "fid": np.random.uniform(20, 100),  # Placeholder
-            "note": "Placeholder implementation"
-        }
-
-    def _compute_is(self, images_dir: str) -> Dict[str, float]:
-        """Compute Inception Score (placeholder implementation)."""
-        # This is a placeholder - actual IS requires Inception network
-        logger.warning("IS computation not fully implemented - returning placeholder")
-        return {
-            "is_mean": np.random.uniform(2, 10),  # Placeholder
-            "is_std": np.random.uniform(0.1, 1.0),
-            "note": "Placeholder implementation"
-        }
-
-    def _compute_lpips(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-        """Compute LPIPS score (placeholder implementation)."""
-        # This is a placeholder - actual LPIPS requires perceptual loss networks
-        logger.warning("LPIPS computation not fully implemented - returning placeholder")
-        return {
-            "lpips": np.random.uniform(0.1, 0.8),  # Placeholder
-            "note": "Placeholder implementation"
-        }
-
-
-class BenchmarkRunner:
-    """
-    Runner for standard AI benchmarks.
-
-    Supports running various benchmarks and collecting results.
-    """
-
-    def __init__(self):
-        self.supported_benchmarks = ["mmlu", "hellaswag", "arc", "gsm8k"]
-
-        # Initialize AI factory if available
-        if AI_FACTORY_AVAILABLE:
-            try:
-                self.ai_factory = AIFactory()
-            except Exception as e:
-                logger.warning(f"Failed to initialize AIFactory: {e}")
-                self.ai_factory = None
-        else:
-            self.ai_factory = None
-
-    def run(
-        self,
-        benchmark,
-        model_path: str,
-        num_shots: int = 0,
-        max_samples: Optional[int] = None,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Run a benchmark evaluation.
-
-        Args:
-            benchmark: Benchmark instance
-            model_path: Path to the model
-            num_shots: Number of few-shot examples
-            max_samples: Maximum samples to evaluate
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Benchmark results
-        """
-        logger.info(f"Running benchmark {benchmark.name} on {model_path}")
-
-        # Load benchmark data
-        test_data = benchmark.load_data(max_samples=max_samples)
-
-        # Run evaluation
-        results = {
-            "benchmark": benchmark.name,
-            "model_path": model_path,
-            "num_shots": num_shots,
-            "num_samples": len(test_data),
-            "results": {}
-        }
-
-        # Process each sample
-        correct = 0
-        total = 0
-
-        for sample in test_data:
-            try:
-                # Format prompt using benchmark's method
-                prompt = benchmark.format_prompt(sample)
-
-                # Generate prediction using actual model
-                prediction = self._generate_prediction(
-                    model_path, {"prompt": prompt}, num_shots, provider, **kwargs
-                )
-
-                # Check if correct
-                is_correct = benchmark.evaluate_sample(sample, prediction)
-                if is_correct:
-                    correct += 1
-                total += 1
-
-            except Exception as e:
-                logger.error(f"Failed to process sample: {e}")
-                continue
-
-        # Calculate final score
-        accuracy = correct / total if total > 0 else 0.0
-
-        results["results"] = {
-            "accuracy": accuracy,
-            "correct": correct,
-            "total": total
-        }
-
-        logger.info(f"Benchmark completed: {accuracy:.3f} accuracy ({correct}/{total})")
-        return results
-
-    def _generate_prediction(
-        self,
-        model_path: str,
-        sample: Dict[str, Any],
-        num_shots: int,
-        provider: str,
-        **kwargs
-    ) -> str:
-        """Generate prediction for a sample using actual model inference."""
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder prediction")
-            return "A"  # Placeholder answer
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            # Format the prompt (this should be done by the benchmark)
-            if hasattr(sample, 'get'):
-                prompt = sample.get('prompt', str(sample))
-            else:
-                prompt = str(sample)
-
-            # Generate prediction using actual model
-            response = llm_service.generate(
-                prompt=prompt,
-                max_tokens=kwargs.get("max_tokens", 50),
-                temperature=kwargs.get("temperature", 0.0)  # Low temperature for consistency
-            )
-
-            # Extract text from response
-            if hasattr(response, 'text'):
-                prediction = response.text
-            elif isinstance(response, dict) and 'text' in response:
-                prediction = response['text']
-            elif isinstance(response, str):
-                prediction = response
-            else:
-                prediction = str(response)
-
-            return prediction.strip()
-
-        except Exception as e:
-            logger.error(f"Failed to generate prediction: {e}")
-            return "A"  # Fallback answer
-
-
-# Utility functions for evaluators
-def compute_text_metrics(predictions: Union[str, List[str]],
-                         references: Union[str, List[str]],
-                         aggregate: bool = False) -> Dict[str, float]:
-    """
-    Compute standard text evaluation metrics.
-
-    Args:
-        predictions: Single prediction or list of predictions
-        references: Single reference or list of references
-        aggregate: Whether to compute aggregate metrics for lists
-
-    Returns:
-        Dictionary of computed metrics
-    """
-    try:
-        # Handle single string inputs
-        if isinstance(predictions, str) and isinstance(references, str):
-            pred_list = [predictions]
-            ref_list = [references]
-        else:
-            pred_list = predictions if isinstance(predictions, list) else [str(predictions)]
-            ref_list = references if isinstance(references, list) else [str(references)]
-
-        # Ensure equal lengths
-        min_len = min(len(pred_list), len(ref_list))
-        pred_list = pred_list[:min_len]
-        ref_list = ref_list[:min_len]
-
-        metrics = {}
-
-        # Exact match
-        exact_matches = sum(1 for p, r in zip(pred_list, ref_list) if p.strip().lower() == r.strip().lower())
-        metrics["exact_match"] = exact_matches / len(pred_list) if pred_list else 0.0
-
-        # F1 Score (token-level)
-        f1_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_tokens = set(pred.lower().split())
-            ref_tokens = set(ref.lower().split())
-
-            if not ref_tokens and not pred_tokens:
-                f1_scores.append(1.0)
-            elif not ref_tokens or not pred_tokens:
-                f1_scores.append(0.0)
-            else:
-                intersection = len(pred_tokens & ref_tokens)
-                precision = intersection / len(pred_tokens)
-                recall = intersection / len(ref_tokens)
-
-                if precision + recall > 0:
-                    f1 = 2 * (precision * recall) / (precision + recall)
-                    f1_scores.append(f1)
-                else:
-                    f1_scores.append(0.0)
-
-        metrics["f1_score"] = np.mean(f1_scores) if f1_scores else 0.0
-
-        # BLEU Score (simplified)
-        bleu_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_words = pred.lower().split()
-            ref_words = ref.lower().split()
-
-            # Simple n-gram overlap
-            overlap = len(set(pred_words) & set(ref_words))
-            total = len(set(pred_words) | set(ref_words))
-
-            bleu_scores.append(overlap / total if total > 0 else 0.0)
-
-        metrics["bleu_score"] = np.mean(bleu_scores) if bleu_scores else 0.0
-
-        # ROUGE-L (simplified)
-        rouge_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_words = set(pred.lower().split())
-            ref_words = set(ref.lower().split())
-
-            if len(ref_words) > 0:
-                rouge_l = len(pred_words & ref_words) / len(ref_words)
-                rouge_scores.append(rouge_l)
-            else:
-                rouge_scores.append(0.0)
-
-        metrics["rouge_l"] = np.mean(rouge_scores) if rouge_scores else 0.0
-
-        # Response length metrics
-        pred_lengths = [len(p.split()) for p in pred_list]
-        ref_lengths = [len(r.split()) for r in ref_list]
-
-        metrics["avg_prediction_length"] = np.mean(pred_lengths) if pred_lengths else 0.0
-        metrics["avg_reference_length"] = np.mean(ref_lengths) if ref_lengths else 0.0
-        metrics["length_ratio"] = (np.mean(pred_lengths) / np.mean(ref_lengths)) if np.mean(ref_lengths) > 0 else 0.0
-
-        # Diversity metrics for predictions
-        if len(pred_list) > 1:
-            all_words = []
-            for pred in pred_list:
-                all_words.extend(pred.lower().split())
-
-            unique_words = len(set(all_words))
-            total_words = len(all_words)
-
-            metrics["vocabulary_diversity"] = unique_words / total_words if total_words > 0 else 0.0
-
-        return metrics
-
-    except Exception as e:
-        logger.error(f"Error computing text metrics: {e}")
-        return {"text_metrics_error": 1.0}
-
-
-def compute_vision_metrics(predictions: List[Any],
-                           references: List[Any],
-                           task_type: str = "general") -> Dict[str, float]:
-    """
-    Compute vision-specific evaluation metrics.
-
-    Args:
-        predictions: List of vision model predictions
-        references: List of reference outputs
-        task_type: Type of vision task (ocr, detection, etc.)
-
-    Returns:
-        Dictionary of computed metrics
-    """
-    try:
-        metrics = {}
-
-        # Basic success rate
-        successful_predictions = sum(1 for p in predictions if p is not None)
-        metrics["prediction_success_rate"] = successful_predictions / len(predictions) if predictions else 0.0
-
-        # Task-specific metrics would be computed by individual evaluators
-        # This is a placeholder for common vision metrics
-
-        if task_type == "ocr":
-            # OCR-specific metrics would be computed in VisionEvaluator
-            pass
-        elif task_type == "detection":
-            # Object detection metrics (IoU, mAP, etc.)
-            pass
-        elif task_type == "classification":
-            # Image classification metrics
-            pass
-
-        return metrics
-
-    except Exception as e:
-        logger.error(f"Error computing vision metrics: {e}")
-        return {"vision_metrics_error": 1.0}