isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
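Entries 176-191 show the Modal service modules moving from `isa_model/deployment/cloud/modal/` (and `deployment/services/`) into the new `isa_model/deployment/modal/services/<modality>/` packages. Assuming importable module paths track this file layout (an inference from the moves above, not something the diff states), code pinned to the 0.4.0 paths would migrate roughly as sketched here, using the OCR service from entry 182 as the example:

# Hypothetical import-path migration implied by the file moves above;
# the module names come from the diff, but whether these are supported
# public import paths is an assumption.
try:
    # 0.4.4 layout: deployment/modal/services/vision/
    from isa_model.deployment.modal.services.vision import isa_vision_ocr_service
except ImportError:
    # 0.4.0 layout: deployment/cloud/modal/
    from isa_model.deployment.cloud.modal import isa_vision_ocr_service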
isa_model/eval/evaluators/base_evaluator.py (deleted; entry 132)
@@ -1,503 +0,0 @@
- """
- Base evaluator class implementing industry best practices for AI model evaluation.
-
- Features:
- - Async/await support for concurrent evaluation
- - Comprehensive error handling and retry logic
- - Experiment tracking integration (W&B, MLflow)
- - Distributed evaluation support
- - Memory-efficient batch processing
- - Comprehensive logging and metrics
- """
-
- import asyncio
- import logging
- import time
- import traceback
- from abc import ABC, abstractmethod
- from dataclasses import dataclass, field
- from typing import Dict, List, Any, Optional, Union, Callable, AsyncGenerator
- from datetime import datetime
- from pathlib import Path
- import json
-
- try:
-     import wandb
-     WANDB_AVAILABLE = True
- except ImportError:
-     WANDB_AVAILABLE = False
-
- try:
-     import mlflow
-     MLFLOW_AVAILABLE = True
- except ImportError:
-     MLFLOW_AVAILABLE = False
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class EvaluationResult:
-     """
-     Standardized evaluation result container.
-
-     Follows MLOps best practices for result tracking and reproducibility.
-     """
-
-     # Core results
-     metrics: Dict[str, float] = field(default_factory=dict)
-     predictions: List[Any] = field(default_factory=list)
-     references: List[Any] = field(default_factory=list)
-
-     # Metadata
-     model_name: str = ""
-     dataset_name: str = ""
-     evaluation_type: str = ""
-     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
-
-     # Performance metrics
-     total_samples: int = 0
-     successful_samples: int = 0
-     failed_samples: int = 0
-     evaluation_time_seconds: float = 0.0
-     throughput_samples_per_second: float = 0.0
-
-     # Cost and resource tracking
-     total_tokens_used: int = 0
-     estimated_cost_usd: float = 0.0
-     memory_peak_mb: float = 0.0
-
-     # Configuration
-     config: Dict[str, Any] = field(default_factory=dict)
-     environment_info: Dict[str, Any] = field(default_factory=dict)
-
-     # Error tracking
-     errors: List[Dict[str, Any]] = field(default_factory=list)
-     warnings: List[str] = field(default_factory=list)
-
-     # Detailed results
-     sample_results: List[Dict[str, Any]] = field(default_factory=list)
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert to dictionary for serialization."""
-         return {
-             "metrics": self.metrics,
-             "predictions": self.predictions,
-             "references": self.references,
-             "model_name": self.model_name,
-             "dataset_name": self.dataset_name,
-             "evaluation_type": self.evaluation_type,
-             "timestamp": self.timestamp,
-             "total_samples": self.total_samples,
-             "successful_samples": self.successful_samples,
-             "failed_samples": self.failed_samples,
-             "evaluation_time_seconds": self.evaluation_time_seconds,
-             "throughput_samples_per_second": self.throughput_samples_per_second,
-             "total_tokens_used": self.total_tokens_used,
-             "estimated_cost_usd": self.estimated_cost_usd,
-             "memory_peak_mb": self.memory_peak_mb,
-             "config": self.config,
-             "environment_info": self.environment_info,
-             "errors": self.errors,
-             "warnings": self.warnings,
-             "sample_results": self.sample_results
-         }
-
-     def save_to_file(self, file_path: Union[str, Path]) -> None:
-         """Save results to JSON file."""
-         with open(file_path, 'w', encoding='utf-8') as f:
-             json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
-
-     @classmethod
-     def load_from_file(cls, file_path: Union[str, Path]) -> 'EvaluationResult':
-         """Load results from JSON file."""
-         with open(file_path, 'r', encoding='utf-8') as f:
-             data = json.load(f)
-
-         result = cls()
-         for key, value in data.items():
-             if hasattr(result, key):
-                 setattr(result, key, value)
-
-         return result
-
-     def get_summary(self) -> Dict[str, Any]:
-         """Get evaluation summary."""
-         success_rate = self.successful_samples / self.total_samples if self.total_samples > 0 else 0.0
-
-         return {
-             "model_name": self.model_name,
-             "dataset_name": self.dataset_name,
-             "evaluation_type": self.evaluation_type,
-             "timestamp": self.timestamp,
-             "success_rate": success_rate,
-             "total_samples": self.total_samples,
-             "evaluation_time_seconds": self.evaluation_time_seconds,
-             "throughput_samples_per_second": self.throughput_samples_per_second,
-             "estimated_cost_usd": self.estimated_cost_usd,
-             "key_metrics": self.metrics,
-             "error_count": len(self.errors),
-             "warning_count": len(self.warnings)
-         }
-
-
- class BaseEvaluator(ABC):
-     """
-     Abstract base evaluator implementing industry best practices.
-
-     Features:
-     - Async evaluation with concurrency control
-     - Comprehensive error handling and retry logic
-     - Experiment tracking integration
-     - Memory-efficient batch processing
-     - Progress monitoring and cancellation support
-     """
-
-     def __init__(self,
-                  evaluator_name: str,
-                  config: Optional[Dict[str, Any]] = None,
-                  experiment_tracker: Optional[Any] = None):
-         """
-         Initialize the base evaluator.
-
-         Args:
-             evaluator_name: Name identifier for this evaluator
-             config: Evaluation configuration
-             experiment_tracker: Optional experiment tracking instance
-         """
-         self.evaluator_name = evaluator_name
-         self.config = config or {}
-         self.experiment_tracker = experiment_tracker
-
-         # State management
-         self._is_running = False
-         self._should_stop = False
-         self._current_result: Optional[EvaluationResult] = None
-
-         # Performance monitoring
-         self._start_time: Optional[float] = None
-         self._peak_memory_mb: float = 0.0
-
-         # Concurrency control
-         self.max_concurrent_requests = self.config.get("max_concurrent_requests", 10)
-         self.semaphore = asyncio.Semaphore(self.max_concurrent_requests)
-
-         # Retry configuration
-         self.max_retries = self.config.get("max_retries", 3)
-         self.retry_delay = self.config.get("retry_delay_seconds", 1.0)
-
-         logger.info(f"Initialized {evaluator_name} evaluator with config: {self.config}")
-
-     @abstractmethod
-     async def evaluate_sample(self,
-                               sample: Dict[str, Any],
-                               model_interface: Any) -> Dict[str, Any]:
-         """
-         Evaluate a single sample.
-
-         Args:
-             sample: Data sample to evaluate
-             model_interface: Model interface for inference
-
-         Returns:
-             Evaluation result for the sample
-         """
-         pass
-
-     @abstractmethod
-     def compute_metrics(self,
-                         predictions: List[Any],
-                         references: List[Any],
-                         **kwargs) -> Dict[str, float]:
-         """
-         Compute evaluation metrics.
-
-         Args:
-             predictions: Model predictions
-             references: Ground truth references
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary of computed metrics
-         """
-         pass
-
-     async def evaluate(self,
-                        model_interface: Any,
-                        dataset: List[Dict[str, Any]],
-                        dataset_name: str = "unknown",
-                        model_name: str = "unknown",
-                        batch_size: Optional[int] = None,
-                        save_predictions: bool = True,
-                        progress_callback: Optional[Callable] = None) -> EvaluationResult:
-         """
-         Perform comprehensive evaluation with industry best practices.
-
-         Args:
-             model_interface: Model interface for inference
-             dataset: Dataset to evaluate on
-             dataset_name: Name of the dataset
-             model_name: Name of the model
-             batch_size: Batch size for processing
-             save_predictions: Whether to save individual predictions
-             progress_callback: Optional callback for progress updates
-
-         Returns:
-             Comprehensive evaluation results
-         """
-
-         # Initialize evaluation
-         self._start_evaluation()
-         result = EvaluationResult(
-             model_name=model_name,
-             dataset_name=dataset_name,
-             evaluation_type=self.evaluator_name,
-             config=self.config.copy(),
-             environment_info=self._get_environment_info()
-         )
-
-         try:
-             # Start experiment tracking
-             await self._start_experiment_tracking(model_name, dataset_name)
-
-             # Process dataset in batches
-             batch_size = batch_size or self.config.get("batch_size", 32)
-             total_batches = (len(dataset) + batch_size - 1) // batch_size
-
-             all_predictions = []
-             all_references = []
-             all_sample_results = []
-
-             for batch_idx in range(total_batches):
-                 if self._should_stop:
-                     logger.info("Evaluation stopped by user request")
-                     break
-
-                 # Get batch
-                 start_idx = batch_idx * batch_size
-                 end_idx = min(start_idx + batch_size, len(dataset))
-                 batch = dataset[start_idx:end_idx]
-
-                 # Process batch
-                 batch_results = await self._process_batch(batch, model_interface)
-
-                 # Collect results
-                 for sample_result in batch_results:
-                     if sample_result.get("success", False):
-                         all_predictions.append(sample_result.get("prediction"))
-                         all_references.append(sample_result.get("reference"))
-                         result.successful_samples += 1
-                     else:
-                         result.failed_samples += 1
-                         result.errors.append({
-                             "sample_id": sample_result.get("sample_id"),
-                             "error": sample_result.get("error"),
-                             "timestamp": datetime.now().isoformat()
-                         })
-
-                     if save_predictions:
-                         all_sample_results.append(sample_result)
-
-                 # Update progress
-                 progress = (batch_idx + 1) / total_batches
-                 if progress_callback:
-                     await progress_callback(progress, batch_idx + 1, total_batches)
-
-                 # Log progress
-                 if (batch_idx + 1) % 10 == 0 or batch_idx == total_batches - 1:
-                     logger.info(f"Processed {batch_idx + 1}/{total_batches} batches "
-                                 f"({result.successful_samples} successful, {result.failed_samples} failed)")
-
-             # Compute final metrics
-             if all_predictions and all_references:
-                 result.metrics = self.compute_metrics(all_predictions, all_references)
-                 logger.info(f"Computed metrics: {result.metrics}")
-             else:
-                 logger.warning("No valid predictions available for metric computation")
-                 result.warnings.append("No valid predictions available for metric computation")
-
-             # Finalize results
-             result.predictions = all_predictions
-             result.references = all_references
-             result.sample_results = all_sample_results
-             result.total_samples = len(dataset)
-
-             # Log experiment results
-             await self._log_experiment_results(result)
-
-         except Exception as e:
-             logger.error(f"Evaluation failed: {e}")
-             logger.error(traceback.format_exc())
-             result.errors.append({
-                 "error": str(e),
-                 "error_type": type(e).__name__,
-                 "traceback": traceback.format_exc(),
-                 "timestamp": datetime.now().isoformat()
-             })
-
-         finally:
-             # Finalize evaluation
-             self._end_evaluation(result)
-             await self._end_experiment_tracking()
-             self._current_result = result
-
-         return result
-
-     async def _process_batch(self,
-                              batch: List[Dict[str, Any]],
-                              model_interface: Any) -> List[Dict[str, Any]]:
-         """Process a batch of samples with concurrency control."""
-         tasks = []
-
-         for sample in batch:
-             task = asyncio.create_task(
-                 self._process_sample_with_retry(sample, model_interface)
-             )
-             tasks.append(task)
-
-         # Wait for all tasks in batch to complete
-         results = await asyncio.gather(*tasks, return_exceptions=True)
-
-         # Process results and handle exceptions
-         processed_results = []
-         for i, result in enumerate(results):
-             if isinstance(result, Exception):
-                 processed_results.append({
-                     "sample_id": batch[i].get("id", f"sample_{i}"),
-                     "success": False,
-                     "error": str(result),
-                     "prediction": None,
-                     "reference": batch[i].get("reference")
-                 })
-             else:
-                 processed_results.append(result)
-
-         return processed_results
-
-     async def _process_sample_with_retry(self,
-                                          sample: Dict[str, Any],
-                                          model_interface: Any) -> Dict[str, Any]:
-         """Process a single sample with retry logic and concurrency control."""
-         async with self.semaphore:  # Limit concurrent requests
-             for attempt in range(self.max_retries + 1):
-                 try:
-                     result = await self.evaluate_sample(sample, model_interface)
-                     result["success"] = True
-                     result["sample_id"] = sample.get("id", "unknown")
-                     result["reference"] = sample.get("reference")
-                     return result
-
-                 except Exception as e:
-                     if attempt == self.max_retries:
-                         # Final attempt failed
-                         logger.error(f"Sample evaluation failed after {self.max_retries + 1} attempts: {e}")
-                         return {
-                             "sample_id": sample.get("id", "unknown"),
-                             "success": False,
-                             "error": str(e),
-                             "prediction": None,
-                             "reference": sample.get("reference")
-                         }
-                     else:
-                         # Retry with exponential backoff
-                         delay = self.retry_delay * (2 ** attempt)
-                         logger.warning(f"Sample evaluation failed (attempt {attempt + 1}), retrying in {delay}s: {e}")
-                         await asyncio.sleep(delay)
-
-     def _start_evaluation(self) -> None:
-         """Mark the start of evaluation."""
-         self._is_running = True
-         self._should_stop = False
-         self._start_time = time.time()
-
-         # Monitor memory usage
-         try:
-             import psutil
-             process = psutil.Process()
-             self._peak_memory_mb = process.memory_info().rss / 1024 / 1024
-         except ImportError:
-             pass
-
-     def _end_evaluation(self, result: EvaluationResult) -> None:
-         """Finalize evaluation with performance metrics."""
-         self._is_running = False
-         end_time = time.time()
-
-         if self._start_time:
-             result.evaluation_time_seconds = end_time - self._start_time
-             if result.total_samples > 0:
-                 result.throughput_samples_per_second = result.total_samples / result.evaluation_time_seconds
-
-         result.memory_peak_mb = self._peak_memory_mb
-
-         logger.info(f"Evaluation completed in {result.evaluation_time_seconds:.2f}s "
-                     f"({result.throughput_samples_per_second:.2f} samples/sec)")
-
-     def _get_environment_info(self) -> Dict[str, Any]:
-         """Get environment information for reproducibility."""
-         import platform
-         import sys
-
-         env_info = {
-             "python_version": sys.version,
-             "platform": platform.platform(),
-             "hostname": platform.node(),
-             "timestamp": datetime.now().isoformat()
-         }
-
-         try:
-             import torch
-             env_info["torch_version"] = torch.__version__
-             env_info["cuda_available"] = torch.cuda.is_available()
-             if torch.cuda.is_available():
-                 env_info["cuda_device_count"] = torch.cuda.device_count()
-                 env_info["cuda_device_name"] = torch.cuda.get_device_name()
-         except ImportError:
-             pass
-
-         return env_info
-
-     async def _start_experiment_tracking(self, model_name: str, dataset_name: str) -> None:
-         """Start experiment tracking if available."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.start_run(
-                     name=f"{self.evaluator_name}_{model_name}_{dataset_name}",
-                     config=self.config
-                 )
-             except Exception as e:
-                 logger.warning(f"Failed to start experiment tracking: {e}")
-
-     async def _log_experiment_results(self, result: EvaluationResult) -> None:
-         """Log results to experiment tracker."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.log_metrics(result.metrics)
-                 await self.experiment_tracker.log_params(result.config)
-             except Exception as e:
-                 logger.warning(f"Failed to log experiment results: {e}")
-
-     async def _end_experiment_tracking(self) -> None:
-         """End experiment tracking."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.end_run()
-             except Exception as e:
-                 logger.warning(f"Failed to end experiment tracking: {e}")
-
-     def stop_evaluation(self) -> None:
-         """Request evaluation to stop gracefully."""
-         self._should_stop = True
-         logger.info("Evaluation stop requested")
-
-     def is_running(self) -> bool:
-         """Check if evaluation is currently running."""
-         return self._is_running
-
-     def get_current_result(self) -> Optional[EvaluationResult]:
-         """Get the current/latest evaluation result."""
-         return self._current_result
-
-     def get_supported_metrics(self) -> List[str]:
-         """Get list of metrics supported by this evaluator."""
-         return []  # To be overridden by subclasses
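
The hunk above removes the evaluator framework in its entirety (the whole `isa_model/eval/` tree is deleted in this release, entries 124-144). For reference, the removed `BaseEvaluator` was driven by subclassing its two abstract methods; the sketch below reconstructs that usage from the deleted code alone. `ExactMatchEvaluator` and `EchoModel` are hypothetical stand-ins, and the import path only exists in 0.4.0:

import asyncio
from typing import Any, Dict, List

# 0.4.0-only import; this module is the one deleted in the hunk above.
from isa_model.eval.evaluators.base_evaluator import BaseEvaluator

class ExactMatchEvaluator(BaseEvaluator):
    """Hypothetical subclass illustrating the removed API."""

    async def evaluate_sample(self, sample: Dict[str, Any], model_interface: Any) -> Dict[str, Any]:
        # model_interface is assumed to expose an async generate() call;
        # the base class fills in "success", "sample_id", and "reference".
        prediction = await model_interface.generate(sample["input"])
        return {"prediction": prediction}

    def compute_metrics(self, predictions: List[Any], references: List[Any], **kwargs) -> Dict[str, float]:
        matches = sum(p == r for p, r in zip(predictions, references))
        return {"exact_match": matches / len(predictions) if predictions else 0.0}

class EchoModel:
    """Hypothetical stub model that returns its prompt unchanged."""
    async def generate(self, prompt: str) -> str:
        return prompt

async def main() -> None:
    evaluator = ExactMatchEvaluator("exact_match", config={"batch_size": 2})
    dataset = [
        {"id": "s1", "input": "hello", "reference": "hello"},
        {"id": "s2", "input": "world", "reference": "world!"},
    ]
    result = await evaluator.evaluate(EchoModel(), dataset, dataset_name="toy", model_name="echo")
    print(result.get_summary())  # expected exact_match: 0.5

asyncio.run(main())

Note the failure handling in the deleted class: each sample is retried with exponential backoff (`retry_delay_seconds * 2 ** attempt`) under a shared semaphore, and samples that exhaust their retries are recorded in `result.errors` rather than aborting the run.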