isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
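
The rename entries above (items 186–197) move the Modal service modules from isa_model/deployment/cloud/modal to isa_model/deployment/modal/services/<modality> with no content changes (+0 -0). For downstream code that imported these modules directly, the import-path update would presumably look like the following sketch; the module paths are taken from the file list above, but nothing here is verified against the 0.4.3 package:

    # Hedged sketch: import-path migration inferred only from the renames above.
    try:
        # 0.4.3 layout: isa_model/deployment/modal/services/vision/
        from isa_model.deployment.modal.services.vision import isa_vision_ocr_service
    except ImportError:
        # 0.4.0 layout: isa_model/deployment/cloud/modal/
        from isa_model.deployment.cloud.modal import isa_vision_ocr_service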
--- a/isa_model/eval/evaluators/base_evaluator.py
+++ /dev/null
@@ -1,503 +0,0 @@
- """
- Base evaluator class implementing industry best practices for AI model evaluation.
-
- Features:
- - Async/await support for concurrent evaluation
- - Comprehensive error handling and retry logic
- - Experiment tracking integration (W&B, MLflow)
- - Distributed evaluation support
- - Memory-efficient batch processing
- - Comprehensive logging and metrics
- """
-
- import asyncio
- import logging
- import time
- import traceback
- from abc import ABC, abstractmethod
- from dataclasses import dataclass, field
- from typing import Dict, List, Any, Optional, Union, Callable, AsyncGenerator
- from datetime import datetime
- from pathlib import Path
- import json
-
- try:
-     import wandb
-     WANDB_AVAILABLE = True
- except ImportError:
-     WANDB_AVAILABLE = False
-
- try:
-     import mlflow
-     MLFLOW_AVAILABLE = True
- except ImportError:
-     MLFLOW_AVAILABLE = False
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class EvaluationResult:
-     """
-     Standardized evaluation result container.
-
-     Follows MLOps best practices for result tracking and reproducibility.
-     """
-
-     # Core results
-     metrics: Dict[str, float] = field(default_factory=dict)
-     predictions: List[Any] = field(default_factory=list)
-     references: List[Any] = field(default_factory=list)
-
-     # Metadata
-     model_name: str = ""
-     dataset_name: str = ""
-     evaluation_type: str = ""
-     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
-
-     # Performance metrics
-     total_samples: int = 0
-     successful_samples: int = 0
-     failed_samples: int = 0
-     evaluation_time_seconds: float = 0.0
-     throughput_samples_per_second: float = 0.0
-
-     # Cost and resource tracking
-     total_tokens_used: int = 0
-     estimated_cost_usd: float = 0.0
-     memory_peak_mb: float = 0.0
-
-     # Configuration
-     config: Dict[str, Any] = field(default_factory=dict)
-     environment_info: Dict[str, Any] = field(default_factory=dict)
-
-     # Error tracking
-     errors: List[Dict[str, Any]] = field(default_factory=list)
-     warnings: List[str] = field(default_factory=list)
-
-     # Detailed results
-     sample_results: List[Dict[str, Any]] = field(default_factory=list)
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert to dictionary for serialization."""
-         return {
-             "metrics": self.metrics,
-             "predictions": self.predictions,
-             "references": self.references,
-             "model_name": self.model_name,
-             "dataset_name": self.dataset_name,
-             "evaluation_type": self.evaluation_type,
-             "timestamp": self.timestamp,
-             "total_samples": self.total_samples,
-             "successful_samples": self.successful_samples,
-             "failed_samples": self.failed_samples,
-             "evaluation_time_seconds": self.evaluation_time_seconds,
-             "throughput_samples_per_second": self.throughput_samples_per_second,
-             "total_tokens_used": self.total_tokens_used,
-             "estimated_cost_usd": self.estimated_cost_usd,
-             "memory_peak_mb": self.memory_peak_mb,
-             "config": self.config,
-             "environment_info": self.environment_info,
-             "errors": self.errors,
-             "warnings": self.warnings,
-             "sample_results": self.sample_results
-         }
-
-     def save_to_file(self, file_path: Union[str, Path]) -> None:
-         """Save results to JSON file."""
-         with open(file_path, 'w', encoding='utf-8') as f:
-             json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
-
-     @classmethod
-     def load_from_file(cls, file_path: Union[str, Path]) -> 'EvaluationResult':
-         """Load results from JSON file."""
-         with open(file_path, 'r', encoding='utf-8') as f:
-             data = json.load(f)
-
-         result = cls()
-         for key, value in data.items():
-             if hasattr(result, key):
-                 setattr(result, key, value)
-
-         return result
-
-     def get_summary(self) -> Dict[str, Any]:
-         """Get evaluation summary."""
-         success_rate = self.successful_samples / self.total_samples if self.total_samples > 0 else 0.0
-
-         return {
-             "model_name": self.model_name,
-             "dataset_name": self.dataset_name,
-             "evaluation_type": self.evaluation_type,
-             "timestamp": self.timestamp,
-             "success_rate": success_rate,
-             "total_samples": self.total_samples,
-             "evaluation_time_seconds": self.evaluation_time_seconds,
-             "throughput_samples_per_second": self.throughput_samples_per_second,
-             "estimated_cost_usd": self.estimated_cost_usd,
-             "key_metrics": self.metrics,
-             "error_count": len(self.errors),
-             "warning_count": len(self.warnings)
-         }
-
-
- class BaseEvaluator(ABC):
-     """
-     Abstract base evaluator implementing industry best practices.
-
-     Features:
-     - Async evaluation with concurrency control
-     - Comprehensive error handling and retry logic
-     - Experiment tracking integration
-     - Memory-efficient batch processing
-     - Progress monitoring and cancellation support
-     """
-
-     def __init__(self,
-                  evaluator_name: str,
-                  config: Optional[Dict[str, Any]] = None,
-                  experiment_tracker: Optional[Any] = None):
-         """
-         Initialize the base evaluator.
-
-         Args:
-             evaluator_name: Name identifier for this evaluator
-             config: Evaluation configuration
-             experiment_tracker: Optional experiment tracking instance
-         """
-         self.evaluator_name = evaluator_name
-         self.config = config or {}
-         self.experiment_tracker = experiment_tracker
-
-         # State management
-         self._is_running = False
-         self._should_stop = False
-         self._current_result: Optional[EvaluationResult] = None
-
-         # Performance monitoring
-         self._start_time: Optional[float] = None
-         self._peak_memory_mb: float = 0.0
-
-         # Concurrency control
-         self.max_concurrent_requests = self.config.get("max_concurrent_requests", 10)
-         self.semaphore = asyncio.Semaphore(self.max_concurrent_requests)
-
-         # Retry configuration
-         self.max_retries = self.config.get("max_retries", 3)
-         self.retry_delay = self.config.get("retry_delay_seconds", 1.0)
-
-         logger.info(f"Initialized {evaluator_name} evaluator with config: {self.config}")
-
-     @abstractmethod
-     async def evaluate_sample(self,
-                               sample: Dict[str, Any],
-                               model_interface: Any) -> Dict[str, Any]:
-         """
-         Evaluate a single sample.
-
-         Args:
-             sample: Data sample to evaluate
-             model_interface: Model interface for inference
-
-         Returns:
-             Evaluation result for the sample
-         """
-         pass
-
-     @abstractmethod
-     def compute_metrics(self,
-                         predictions: List[Any],
-                         references: List[Any],
-                         **kwargs) -> Dict[str, float]:
-         """
-         Compute evaluation metrics.
-
-         Args:
-             predictions: Model predictions
-             references: Ground truth references
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary of computed metrics
-         """
-         pass
-
-     async def evaluate(self,
-                        model_interface: Any,
-                        dataset: List[Dict[str, Any]],
-                        dataset_name: str = "unknown",
-                        model_name: str = "unknown",
-                        batch_size: Optional[int] = None,
-                        save_predictions: bool = True,
-                        progress_callback: Optional[Callable] = None) -> EvaluationResult:
-         """
-         Perform comprehensive evaluation with industry best practices.
-
-         Args:
-             model_interface: Model interface for inference
-             dataset: Dataset to evaluate on
-             dataset_name: Name of the dataset
-             model_name: Name of the model
-             batch_size: Batch size for processing
-             save_predictions: Whether to save individual predictions
-             progress_callback: Optional callback for progress updates
-
-         Returns:
-             Comprehensive evaluation results
-         """
-
-         # Initialize evaluation
-         self._start_evaluation()
-         result = EvaluationResult(
-             model_name=model_name,
-             dataset_name=dataset_name,
-             evaluation_type=self.evaluator_name,
-             config=self.config.copy(),
-             environment_info=self._get_environment_info()
-         )
-
-         try:
-             # Start experiment tracking
-             await self._start_experiment_tracking(model_name, dataset_name)
-
-             # Process dataset in batches
-             batch_size = batch_size or self.config.get("batch_size", 32)
-             total_batches = (len(dataset) + batch_size - 1) // batch_size
-
-             all_predictions = []
-             all_references = []
-             all_sample_results = []
-
-             for batch_idx in range(total_batches):
-                 if self._should_stop:
-                     logger.info("Evaluation stopped by user request")
-                     break
-
-                 # Get batch
-                 start_idx = batch_idx * batch_size
-                 end_idx = min(start_idx + batch_size, len(dataset))
-                 batch = dataset[start_idx:end_idx]
-
-                 # Process batch
-                 batch_results = await self._process_batch(batch, model_interface)
-
-                 # Collect results
-                 for sample_result in batch_results:
-                     if sample_result.get("success", False):
-                         all_predictions.append(sample_result.get("prediction"))
-                         all_references.append(sample_result.get("reference"))
-                         result.successful_samples += 1
-                     else:
-                         result.failed_samples += 1
-                         result.errors.append({
-                             "sample_id": sample_result.get("sample_id"),
-                             "error": sample_result.get("error"),
-                             "timestamp": datetime.now().isoformat()
-                         })
-
-                     if save_predictions:
-                         all_sample_results.append(sample_result)
-
-                 # Update progress
-                 progress = (batch_idx + 1) / total_batches
-                 if progress_callback:
-                     await progress_callback(progress, batch_idx + 1, total_batches)
-
-                 # Log progress
-                 if (batch_idx + 1) % 10 == 0 or batch_idx == total_batches - 1:
-                     logger.info(f"Processed {batch_idx + 1}/{total_batches} batches "
-                                 f"({result.successful_samples} successful, {result.failed_samples} failed)")
-
-             # Compute final metrics
-             if all_predictions and all_references:
-                 result.metrics = self.compute_metrics(all_predictions, all_references)
-                 logger.info(f"Computed metrics: {result.metrics}")
-             else:
-                 logger.warning("No valid predictions available for metric computation")
-                 result.warnings.append("No valid predictions available for metric computation")
-
-             # Finalize results
-             result.predictions = all_predictions
-             result.references = all_references
-             result.sample_results = all_sample_results
-             result.total_samples = len(dataset)
-
-             # Log experiment results
-             await self._log_experiment_results(result)
-
-         except Exception as e:
-             logger.error(f"Evaluation failed: {e}")
-             logger.error(traceback.format_exc())
-             result.errors.append({
-                 "error": str(e),
-                 "error_type": type(e).__name__,
-                 "traceback": traceback.format_exc(),
-                 "timestamp": datetime.now().isoformat()
-             })
-
-         finally:
-             # Finalize evaluation
-             self._end_evaluation(result)
-             await self._end_experiment_tracking()
-             self._current_result = result
-
-         return result
-
-     async def _process_batch(self,
-                              batch: List[Dict[str, Any]],
-                              model_interface: Any) -> List[Dict[str, Any]]:
-         """Process a batch of samples with concurrency control."""
-         tasks = []
-
-         for sample in batch:
-             task = asyncio.create_task(
-                 self._process_sample_with_retry(sample, model_interface)
-             )
-             tasks.append(task)
-
-         # Wait for all tasks in batch to complete
-         results = await asyncio.gather(*tasks, return_exceptions=True)
-
-         # Process results and handle exceptions
-         processed_results = []
-         for i, result in enumerate(results):
-             if isinstance(result, Exception):
-                 processed_results.append({
-                     "sample_id": batch[i].get("id", f"sample_{i}"),
-                     "success": False,
-                     "error": str(result),
-                     "prediction": None,
-                     "reference": batch[i].get("reference")
-                 })
-             else:
-                 processed_results.append(result)
-
-         return processed_results
-
-     async def _process_sample_with_retry(self,
-                                          sample: Dict[str, Any],
-                                          model_interface: Any) -> Dict[str, Any]:
-         """Process a single sample with retry logic and concurrency control."""
-         async with self.semaphore:  # Limit concurrent requests
-             for attempt in range(self.max_retries + 1):
-                 try:
-                     result = await self.evaluate_sample(sample, model_interface)
-                     result["success"] = True
-                     result["sample_id"] = sample.get("id", "unknown")
-                     result["reference"] = sample.get("reference")
-                     return result
-
-                 except Exception as e:
-                     if attempt == self.max_retries:
-                         # Final attempt failed
-                         logger.error(f"Sample evaluation failed after {self.max_retries + 1} attempts: {e}")
-                         return {
-                             "sample_id": sample.get("id", "unknown"),
-                             "success": False,
-                             "error": str(e),
-                             "prediction": None,
-                             "reference": sample.get("reference")
-                         }
-                     else:
-                         # Retry with exponential backoff
-                         delay = self.retry_delay * (2 ** attempt)
-                         logger.warning(f"Sample evaluation failed (attempt {attempt + 1}), retrying in {delay}s: {e}")
-                         await asyncio.sleep(delay)
-
-     def _start_evaluation(self) -> None:
-         """Mark the start of evaluation."""
-         self._is_running = True
-         self._should_stop = False
-         self._start_time = time.time()
-
-         # Monitor memory usage
-         try:
-             import psutil
-             process = psutil.Process()
-             self._peak_memory_mb = process.memory_info().rss / 1024 / 1024
-         except ImportError:
-             pass
-
-     def _end_evaluation(self, result: EvaluationResult) -> None:
-         """Finalize evaluation with performance metrics."""
-         self._is_running = False
-         end_time = time.time()
-
-         if self._start_time:
-             result.evaluation_time_seconds = end_time - self._start_time
-             if result.total_samples > 0:
-                 result.throughput_samples_per_second = result.total_samples / result.evaluation_time_seconds
-
-         result.memory_peak_mb = self._peak_memory_mb
-
-         logger.info(f"Evaluation completed in {result.evaluation_time_seconds:.2f}s "
-                     f"({result.throughput_samples_per_second:.2f} samples/sec)")
-
-     def _get_environment_info(self) -> Dict[str, Any]:
-         """Get environment information for reproducibility."""
-         import platform
-         import sys
-
-         env_info = {
-             "python_version": sys.version,
-             "platform": platform.platform(),
-             "hostname": platform.node(),
-             "timestamp": datetime.now().isoformat()
-         }
-
-         try:
-             import torch
-             env_info["torch_version"] = torch.__version__
-             env_info["cuda_available"] = torch.cuda.is_available()
-             if torch.cuda.is_available():
-                 env_info["cuda_device_count"] = torch.cuda.device_count()
-                 env_info["cuda_device_name"] = torch.cuda.get_device_name()
-         except ImportError:
-             pass
-
-         return env_info
-
-     async def _start_experiment_tracking(self, model_name: str, dataset_name: str) -> None:
-         """Start experiment tracking if available."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.start_run(
-                     name=f"{self.evaluator_name}_{model_name}_{dataset_name}",
-                     config=self.config
-                 )
-             except Exception as e:
-                 logger.warning(f"Failed to start experiment tracking: {e}")
-
-     async def _log_experiment_results(self, result: EvaluationResult) -> None:
-         """Log results to experiment tracker."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.log_metrics(result.metrics)
-                 await self.experiment_tracker.log_params(result.config)
-             except Exception as e:
-                 logger.warning(f"Failed to log experiment results: {e}")
-
-     async def _end_experiment_tracking(self) -> None:
-         """End experiment tracking."""
-         if self.experiment_tracker:
-             try:
-                 await self.experiment_tracker.end_run()
-             except Exception as e:
-                 logger.warning(f"Failed to end experiment tracking: {e}")
-
-     def stop_evaluation(self) -> None:
-         """Request evaluation to stop gracefully."""
-         self._should_stop = True
-         logger.info("Evaluation stop requested")
-
-     def is_running(self) -> bool:
-         """Check if evaluation is currently running."""
-         return self._is_running
-
-     def get_current_result(self) -> Optional[EvaluationResult]:
-         """Get the current/latest evaluation result."""
-         return self._current_result
-
-     def get_supported_metrics(self) -> List[str]:
-         """Get list of metrics supported by this evaluator."""
-         return []  # To be overridden by subclasses
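
The removed base_evaluator.py defined the eval contract as two abstract methods, evaluate_sample and compute_metrics, with batching, retries, concurrency limits, and experiment tracking handled by BaseEvaluator.evaluate. For reference, a minimal concrete subclass against this removed 0.4.0 API would have looked roughly like the sketch below; ExactMatchEvaluator and model_interface.generate are illustrative names, not part of the package:

    from typing import Any, Dict, List

    class ExactMatchEvaluator(BaseEvaluator):
        """Illustrative subclass of the removed BaseEvaluator."""

        async def evaluate_sample(self, sample: Dict[str, Any], model_interface: Any) -> Dict[str, Any]:
            # The base class left the model interface unspecified; `generate`
            # is an assumed coroutine, not a documented isa-model API.
            prediction = await model_interface.generate(sample["input"])
            return {"prediction": prediction}

        def compute_metrics(self, predictions: List[Any], references: List[Any], **kwargs) -> Dict[str, float]:
            # Exact-match accuracy over paired predictions/references.
            correct = sum(p == r for p, r in zip(predictions, references))
            return {"exact_match": correct / len(predictions) if predictions else 0.0}

Note that _process_sample_with_retry injected "success", "sample_id", and "reference" into each per-sample result, so subclasses only returned their own fields.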