isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
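The largest change in this release is the removal of the entire isa_model/eval and isa_model/training packages and the relocation of the Modal services from isa_model/deployment/cloud/modal/ to isa_model/deployment/modal/services/. The sketch below is illustrative only and not part of the package: a guarded import that callers of the removed evaluation API can use to fail fast on 0.4.3, assuming they previously imported EvaluationFactory from isa_model.eval as shown in the deleted module's own docstring.

```python
# Illustrative compatibility guard for downstream code (not shipped with isa-model).
try:
    # Present in isa-model <= 0.4.0; removed from the 0.4.3 wheel.
    from isa_model.eval import EvaluationFactory
    HAS_EVAL = True
except ImportError:
    EvaluationFactory = None
    HAS_EVAL = False

if not HAS_EVAL:
    raise RuntimeError(
        "isa_model.eval was removed in isa-model 0.4.3; "
        "pin 'isa-model==0.4.0' or migrate off the evaluation API."
    )
```

Pinning isa-model==0.4.0 remains the simplest option for code that still depends on the removed evaluation or training modules.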
isa_model/eval/factory.py DELETED
@@ -1,798 +0,0 @@
- """
- Enterprise-Grade Evaluation Factory for ISA Model Framework
-
- Implements industry best practices for AI model evaluation at scale:
- - Async evaluation with concurrency control
- - Comprehensive experiment tracking (W&B, MLflow)
- - Distributed evaluation support
- - Production-ready monitoring and alerting
- - Cost tracking and optimization
- - Reproducible evaluation pipelines
- """
-
- import asyncio
- import logging
- from typing import Optional, Dict, Any, List, Union, Callable
- from pathlib import Path
- import json
-
- from .evaluators import LLMEvaluator, VisionEvaluator, AudioEvaluator, EmbeddingEvaluator, EvaluationResult
- from .isa_integration import ISAModelInterface
- try:
-     from .infrastructure import ExperimentTracker, create_experiment_tracker
-     EXPERIMENT_TRACKING_AVAILABLE = True
- except ImportError:
-     EXPERIMENT_TRACKING_AVAILABLE = False
-     logger.warning("Experiment tracking not available")
-
- try:
-     from .config import EvaluationConfig
-     CONFIG_AVAILABLE = True
- except ImportError:
-     CONFIG_AVAILABLE = False
-     # Create a simple config class
-     class EvaluationConfig:
-         def __init__(self):
-             self.batch_size = 16
-             self.output_dir = "./evaluation_results"
-             self.default_temperature = 0.7
-             self.default_max_tokens = 512
-             self.max_concurrent_evaluations = 3
-
-         def to_dict(self):
-             return {
-                 "batch_size": self.batch_size,
-                 "output_dir": self.output_dir,
-                 "default_temperature": self.default_temperature,
-                 "default_max_tokens": self.default_max_tokens,
-                 "max_concurrent_evaluations": self.max_concurrent_evaluations
-             }
-
-         @classmethod
-         def from_dict(cls, config_dict):
-             config = cls()
-             for key, value in config_dict.items():
-                 if hasattr(config, key):
-                     setattr(config, key, value)
-             return config
-
- logger = logging.getLogger(__name__)
-
-
- class EvaluationFactory:
-     """
-     Enterprise-grade evaluation factory implementing MLOps best practices.
-
-     Features:
-     - Multi-modal evaluation support (LLM, Vision, Multimodal)
-     - Async evaluation with smart concurrency management
-     - Comprehensive experiment tracking and visualization
-     - Cost optimization and resource monitoring
-     - Distributed evaluation across multiple GPUs/nodes
-     - Production-ready error handling and retry logic
-     - Automated result storage and comparison
-
-     Example usage:
-     ```python
-     from isa_model.eval import EvaluationFactory
-
-     # Initialize with experiment tracking
-     factory = EvaluationFactory(
-         experiment_tracking={
-             "type": "wandb",
-             "project": "model-evaluation",
-             "entity": "my-team"
-         }
-     )
-
-     # Evaluate LLM on dataset
-     result = await factory.evaluate_llm(
-         model_name="gpt-4.1-mini",
-         provider="openai",
-         dataset_path="path/to/evaluation_data.json",
-         metrics=["accuracy", "f1_score", "bleu_score"],
-         save_results=True
-     )
-
-     # Run benchmark evaluation
-     benchmark_result = await factory.run_benchmark(
-         model_name="claude-sonnet-4",
-         provider="yyds",
-         benchmark_name="mmlu",
-         subjects=["math", "physics", "chemistry"]
-     )
-
-     # Compare multiple models
-     comparison = await factory.compare_models(
-         models=[
-             {"name": "gpt-4.1-mini", "provider": "openai"},
-             {"name": "claude-sonnet-4", "provider": "yyds"}
-         ],
-         dataset_path="comparison_dataset.json"
-     )
-     ```
-     """
-
-     def __init__(self,
-                  config: Optional[Union[Dict[str, Any], EvaluationConfig]] = None,
-                  experiment_tracking: Optional[Dict[str, Any]] = None,
-                  output_dir: Optional[str] = None):
-         """
-         Initialize the enterprise evaluation factory.
-
-         Args:
-             config: Evaluation configuration (dict or EvaluationConfig object)
-             experiment_tracking: Experiment tracking configuration
-             output_dir: Output directory for results
-         """
-         # Initialize configuration
-         if isinstance(config, dict):
-             self.config = EvaluationConfig.from_dict(config)
-         elif isinstance(config, EvaluationConfig):
-             self.config = config
-         else:
-             self.config = EvaluationConfig()
-
-         # Override output directory if provided
-         if output_dir:
-             self.config.output_dir = output_dir
-
-         # Initialize experiment tracker
-         self.experiment_tracker = None
-         if experiment_tracking and EXPERIMENT_TRACKING_AVAILABLE:
-             try:
-                 self.experiment_tracker = create_experiment_tracker(**experiment_tracking)
-                 logger.info(f"Initialized experiment tracking: {experiment_tracking['type']}")
-             except Exception as e:
-                 logger.warning(f"Failed to initialize experiment tracking: {e}")
-
-         # Initialize ISA Model interface
-         self.isa_interface = ISAModelInterface()
-
-         # Initialize evaluators
-         self.llm_evaluator = LLMEvaluator(
-             config=self.config.to_dict(),
-             experiment_tracker=self.experiment_tracker
-         )
-
-         self.vision_evaluator = VisionEvaluator(
-             config=self.config.to_dict(),
-             experiment_tracker=self.experiment_tracker
-         )
-
-         self.audio_evaluator = AudioEvaluator(
-             config=self.config.to_dict(),
-             experiment_tracker=self.experiment_tracker
-         )
-
-         self.embedding_evaluator = EmbeddingEvaluator(
-             config=self.config.to_dict(),
-             experiment_tracker=self.experiment_tracker
-         )
-
-         # State tracking
-         self._active_evaluations: Dict[str, asyncio.Task] = {}
-
-         logger.info(f"EvaluationFactory initialized with output dir: {self.config.output_dir}")
-
-     async def evaluate_llm(self,
-                            model_name: str,
-                            provider: str = "openai",
-                            dataset_path: Optional[str] = None,
-                            dataset: Optional[List[Dict[str, Any]]] = None,
-                            metrics: Optional[List[str]] = None,
-                            batch_size: Optional[int] = None,
-                            save_results: bool = True,
-                            experiment_name: Optional[str] = None,
-                            progress_callback: Optional[Callable] = None) -> EvaluationResult:
-         """
-         Evaluate LLM with comprehensive metrics and tracking.
-
-         Args:
-             model_name: Name of the model to evaluate
-             provider: Model provider (openai, yyds, ollama, etc.)
-             dataset_path: Path to evaluation dataset JSON file
-             dataset: Direct dataset input (alternative to dataset_path)
-             metrics: List of metrics to compute
-             batch_size: Batch size for evaluation
-             save_results: Whether to save results to disk
-             experiment_name: Custom experiment name
-             progress_callback: Optional progress callback function
-
-         Returns:
-             Comprehensive evaluation results
-         """
-         # Load dataset
-         if dataset is None:
-             if dataset_path is None:
-                 raise ValueError("Either dataset_path or dataset must be provided")
-             dataset = self._load_dataset(dataset_path)
-
-         # Configure LLM evaluator
-         llm_config = {
-             "provider": provider,
-             "model_name": model_name,
-             "batch_size": batch_size or self.config.batch_size,
-             "temperature": self.config.default_temperature,
-             "max_tokens": self.config.default_max_tokens
-         }
-
-         self.llm_evaluator.config.update(llm_config)
-
-         # Generate experiment name
-         dataset_name = Path(dataset_path).stem if dataset_path else "custom_dataset"
-         experiment_name = experiment_name or f"llm_eval_{model_name}_{dataset_name}"
-
-         # Run evaluation
-         result = await self.llm_evaluator.evaluate(
-             model_interface=self.isa_interface,
-             dataset=dataset,
-             dataset_name=dataset_name,
-             model_name=f"{provider}:{model_name}",
-             batch_size=batch_size,
-             progress_callback=progress_callback
-         )
-
-         # Save results if requested
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def run_benchmark(self,
-                             model_name: str,
-                             provider: str,
-                             benchmark_name: str,
-                             subjects: Optional[List[str]] = None,
-                             max_samples: Optional[int] = None,
-                             few_shot: bool = True,
-                             num_shots: int = 5,
-                             save_results: bool = True,
-                             experiment_name: Optional[str] = None) -> EvaluationResult:
-         """
-         Run standardized benchmark evaluation.
-
-         Args:
-             model_name: Name of the model to evaluate
-             provider: Model provider
-             benchmark_name: Name of benchmark (mmlu, hellaswag, arc, gsm8k, etc.)
-             subjects: List of subjects to evaluate (for MMLU)
-             max_samples: Maximum number of samples to evaluate
-             few_shot: Whether to use few-shot examples
-             num_shots: Number of few-shot examples
-             save_results: Whether to save results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Benchmark evaluation results
-         """
-         # Load benchmark dataset
-         benchmark_dataset = await self._load_benchmark(
-             benchmark_name,
-             subjects=subjects,
-             max_samples=max_samples,
-             few_shot=few_shot,
-             num_shots=num_shots
-         )
-
-         # Configure for benchmark evaluation
-         benchmark_config = {
-             "provider": provider,
-             "model_name": model_name,
-             "temperature": 0.0, # Deterministic for benchmarks
-             "max_tokens": 50, # Short answers for most benchmarks
-             "task_type": "benchmark",
-             "benchmark_name": benchmark_name
-         }
-
-         self.llm_evaluator.config.update(benchmark_config)
-
-         # Generate experiment name
-         experiment_name = experiment_name or f"benchmark_{benchmark_name}_{model_name}"
-
-         # Run evaluation
-         result = await self.llm_evaluator.evaluate(
-             model_interface=None,
-             dataset=benchmark_dataset,
-             dataset_name=benchmark_name,
-             model_name=f"{provider}:{model_name}",
-             batch_size=self.config.batch_size
-         )
-
-         # Add benchmark-specific metadata
-         result.config.update({
-             "benchmark_name": benchmark_name,
-             "subjects": subjects,
-             "few_shot": few_shot,
-             "num_shots": num_shots
-         })
-
-         # Save results if requested
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def compare_models(self,
-                              models: List[Dict[str, str]],
-                              dataset_path: Optional[str] = None,
-                              dataset: Optional[List[Dict[str, Any]]] = None,
-                              benchmark_name: Optional[str] = None,
-                              metrics: Optional[List[str]] = None,
-                              save_results: bool = True,
-                              experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
-         """
-         Compare multiple models on the same evaluation task.
-
-         Args:
-             models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
-             dataset_path: Path to evaluation dataset
-             dataset: Direct dataset input
-             benchmark_name: Benchmark name (alternative to dataset)
-             metrics: Metrics to compute
-             save_results: Whether to save comparison results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Dictionary mapping model names to evaluation results
-         """
-         results = {}
-
-         # Run evaluations concurrently (with concurrency limits)
-         semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)
-
-         async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
-             async with semaphore:
-                 model_name = model_config["name"]
-                 provider = model_config["provider"]
-
-                 if benchmark_name:
-                     result = await self.run_benchmark(
-                         model_name=model_name,
-                         provider=provider,
-                         benchmark_name=benchmark_name,
-                         save_results=False # Save comparison results together
-                     )
-                 else:
-                     result = await self.evaluate_llm(
-                         model_name=model_name,
-                         provider=provider,
-                         dataset_path=dataset_path,
-                         dataset=dataset,
-                         metrics=metrics,
-                         save_results=False
-                     )
-
-                 return f"{provider}:{model_name}", result
-
-         # Execute all evaluations
-         tasks = [evaluate_single_model(model) for model in models]
-         evaluation_results = await asyncio.gather(*tasks)
-
-         # Collect results
-         for model_id, result in evaluation_results:
-             results[model_id] = result
-
-         # Generate comparison report
-         comparison_report = self._generate_comparison_report(results)
-
-         # Save results if requested
-         if save_results:
-             experiment_name = experiment_name or f"model_comparison_{len(models)}_models"
-             await self._save_comparison_results(results, comparison_report, experiment_name)
-
-         return results
-
-     async def evaluate_vision(self,
-                               dataset: List[Dict[str, Any]],
-                               task_type: str = "ocr",
-                               model_name: str = "gpt-4.1-mini",
-                               save_results: bool = True,
-                               experiment_name: Optional[str] = None) -> EvaluationResult:
-         """
-         Evaluate vision model on image tasks.
-
-         Args:
-             dataset: Vision dataset with images and expected outputs
-             task_type: Vision task type (ocr, table, ui, vqa, caption)
-             model_name: Vision model name
-             save_results: Whether to save results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Vision evaluation results
-         """
-         # Configure vision evaluator
-         self.vision_evaluator.config.update({
-             "task_type": task_type,
-             "model_name": model_name
-         })
-
-         experiment_name = experiment_name or f"vision_{task_type}_{model_name}"
-
-         result = await self.vision_evaluator.evaluate(
-             model_interface=self.isa_interface,
-             dataset=dataset,
-             dataset_name=f"vision_{task_type}",
-             model_name=model_name
-         )
-
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def evaluate_audio(self,
-                              dataset: List[Dict[str, Any]],
-                              task_type: str = "stt",
-                              model_name: str = "isa_audio_sota_service",
-                              save_results: bool = True,
-                              experiment_name: Optional[str] = None) -> EvaluationResult:
-         """
-         Evaluate audio model on speech tasks.
-
-         Args:
-             dataset: Audio dataset with audio files and expected outputs
-             task_type: Audio task type (stt, emotion, diarization)
-             model_name: Audio model name
-             save_results: Whether to save results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Audio evaluation results
-         """
-         # Configure audio evaluator
-         self.audio_evaluator.config.update({
-             "task_type": task_type,
-             "model_name": model_name
-         })
-
-         experiment_name = experiment_name or f"audio_{task_type}_{model_name}"
-
-         result = await self.audio_evaluator.evaluate(
-             model_interface=self.isa_interface,
-             dataset=dataset,
-             dataset_name=f"audio_{task_type}",
-             model_name=model_name
-         )
-
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def evaluate_embedding(self,
-                                  dataset: List[Dict[str, Any]],
-                                  task_type: str = "similarity",
-                                  model_name: str = "text-embedding-3-small",
-                                  save_results: bool = True,
-                                  experiment_name: Optional[str] = None) -> EvaluationResult:
-         """
-         Evaluate embedding model on semantic tasks.
-
-         Args:
-             dataset: Embedding dataset with text and expected outputs
-             task_type: Embedding task type (similarity, retrieval, reranking)
-             model_name: Embedding model name
-             save_results: Whether to save results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Embedding evaluation results
-         """
-         # Configure embedding evaluator
-         self.embedding_evaluator.config.update({
-             "task_type": task_type,
-             "model_name": model_name
-         })
-
-         experiment_name = experiment_name or f"embedding_{task_type}_{model_name}"
-
-         result = await self.embedding_evaluator.evaluate(
-             model_interface=self.isa_interface,
-             dataset=dataset,
-             dataset_name=f"embedding_{task_type}",
-             model_name=model_name
-         )
-
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def compare_models(self,
-                              models: List[Dict[str, str]],
-                              dataset_path: Optional[str] = None,
-                              dataset: Optional[List[Dict[str, Any]]] = None,
-                              evaluator_type: str = "llm",
-                              benchmark_name: Optional[str] = None,
-                              metrics: Optional[List[str]] = None,
-                              save_results: bool = True,
-                              experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
-         """
-         Compare multiple models on the same evaluation task.
-
-         Args:
-             models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
-             dataset_path: Path to evaluation dataset
-             dataset: Direct dataset input
-             evaluator_type: Type of evaluator (llm, vision, audio, embedding)
-             benchmark_name: Benchmark name (alternative to dataset)
-             metrics: Metrics to compute
-             save_results: Whether to save comparison results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Dictionary mapping model names to evaluation results
-         """
-         results = {}
-
-         # Load dataset if needed
-         if dataset is None and dataset_path:
-             dataset = self._load_dataset(dataset_path)
-
-         # Run evaluations concurrently (with concurrency limits)
-         semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)
-
-         async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
-             async with semaphore:
-                 model_name = model_config["name"]
-                 provider = model_config.get("provider", "openai")
-
-                 if evaluator_type == "llm":
-                     if benchmark_name:
-                         result = await self.run_benchmark(
-                             model_name=model_name,
-                             provider=provider,
-                             benchmark_name=benchmark_name,
-                             save_results=False
-                         )
-                     else:
-                         result = await self.evaluate_llm(
-                             model_name=model_name,
-                             provider=provider,
-                             dataset=dataset,
-                             metrics=metrics,
-                             save_results=False
-                         )
-                 elif evaluator_type == "vision":
-                     result = await self.evaluate_vision(
-                         dataset=dataset,
-                         model_name=model_name,
-                         save_results=False
-                     )
-                 elif evaluator_type == "audio":
-                     result = await self.evaluate_audio(
-                         dataset=dataset,
-                         model_name=model_name,
-                         save_results=False
-                     )
-                 elif evaluator_type == "embedding":
-                     result = await self.evaluate_embedding(
-                         dataset=dataset,
-                         model_name=model_name,
-                         save_results=False
-                     )
-                 else:
-                     raise ValueError(f"Unknown evaluator type: {evaluator_type}")
-
-                 return f"{provider}:{model_name}", result
-
-         # Execute all evaluations
-         tasks = [evaluate_single_model(model) for model in models]
-         evaluation_results = await asyncio.gather(*tasks)
-
-         # Collect results
-         for model_id, result in evaluation_results:
-             results[model_id] = result
-
-         # Generate comparison report
-         comparison_report = self._generate_comparison_report(results)
-
-         # Save results if requested
-         if save_results:
-             experiment_name = experiment_name or f"model_comparison_{evaluator_type}_{len(models)}_models"
-             await self._save_comparison_results(results, comparison_report, experiment_name)
-
-         return results
-
-     def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
-         """Load dataset from file."""
-         with open(dataset_path, 'r', encoding='utf-8') as f:
-             if dataset_path.endswith('.json'):
-                 dataset = json.load(f)
-             elif dataset_path.endswith('.jsonl'):
-                 dataset = [json.loads(line) for line in f]
-             else:
-                 raise ValueError(f"Unsupported dataset format: {dataset_path}")
-
-         logger.info(f"Loaded dataset with {len(dataset)} samples from {dataset_path}")
-         return dataset
-
-     async def _load_benchmark(self,
-                               benchmark_name: str,
-                               subjects: Optional[List[str]] = None,
-                               max_samples: Optional[int] = None,
-                               few_shot: bool = True,
-                               num_shots: int = 5) -> List[Dict[str, Any]]:
-         """Load benchmark dataset."""
-         # This would integrate with the benchmark loaders
-         # For now, return a placeholder
-         logger.warning(f"Benchmark {benchmark_name} loading not yet implemented")
-
-         # Placeholder benchmark data
-         return [
-             {
-                 "id": f"sample_{i}",
-                 "prompt": f"Sample question {i} for {benchmark_name}",
-                 "reference": "A",
-                 "choices": ["A", "B", "C", "D"] if benchmark_name != "gsm8k" else None
-             }
-             for i in range(min(max_samples or 10, 10))
-         ]
-
-     async def _save_results(self, result: EvaluationResult, experiment_name: str) -> None:
-         """Save evaluation results to disk."""
-         # Create output directory
-         output_dir = Path(self.config.output_dir) / experiment_name
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         # Save main results
-         results_path = output_dir / "results.json"
-         result.save_to_file(results_path)
-
-         # Save detailed predictions if available
-         if result.sample_results:
-             predictions_path = output_dir / "predictions.json"
-             with open(predictions_path, 'w', encoding='utf-8') as f:
-                 json.dump(result.sample_results, f, indent=2, ensure_ascii=False)
-
-         # Save summary
-         summary_path = output_dir / "summary.json"
-         with open(summary_path, 'w', encoding='utf-8') as f:
-             json.dump(result.get_summary(), f, indent=2, ensure_ascii=False)
-
-         logger.info(f"Saved evaluation results to {output_dir}")
-
-     async def _save_comparison_results(self,
-                                        results: Dict[str, EvaluationResult],
-                                        comparison_report: Dict[str, Any],
-                                        experiment_name: str) -> None:
-         """Save model comparison results."""
-         output_dir = Path(self.config.output_dir) / experiment_name
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         # Save individual results
-         for model_id, result in results.items():
-             model_dir = output_dir / model_id.replace(":", "_")
-             model_dir.mkdir(exist_ok=True)
-             result.save_to_file(model_dir / "results.json")
-
-         # Save comparison report
-         comparison_path = output_dir / "comparison_report.json"
-         with open(comparison_path, 'w', encoding='utf-8') as f:
-             json.dump(comparison_report, f, indent=2, ensure_ascii=False)
-
-         logger.info(f"Saved comparison results to {output_dir}")
-
-     def _generate_comparison_report(self, results: Dict[str, EvaluationResult]) -> Dict[str, Any]:
-         """Generate comparison report from multiple model results."""
-         report = {
-             "models_compared": list(results.keys()),
-             "comparison_timestamp": results[list(results.keys())[0]].timestamp,
-             "metric_comparison": {},
-             "rankings": {},
-             "best_model_per_metric": {}
-         }
-
-         # Extract all metrics
-         all_metrics = set()
-         for result in results.values():
-             all_metrics.update(result.metrics.keys())
-
-         # Compare each metric
-         for metric in all_metrics:
-             metric_values = {}
-             for model_id, result in results.items():
-                 if metric in result.metrics:
-                     metric_values[model_id] = result.metrics[metric]
-
-             if metric_values:
-                 # Determine if higher is better
-                 higher_is_better = metric not in ["perplexity", "loss", "error_rate"]
-
-                 # Find best model
-                 best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
-
-                 # Create ranking
-                 sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
-
-                 report["metric_comparison"][metric] = metric_values
-                 report["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
-                 report["best_model_per_metric"][metric] = {"model": best_model[0], "value": best_model[1]}
-
-         return report
-
-     def get_configuration(self) -> Dict[str, Any]:
-         """Get current factory configuration."""
-         return self.config.to_dict()
-
-     def get_active_evaluations(self) -> List[str]:
-         """Get list of currently running evaluations."""
-         return list(self._active_evaluations.keys())
-
-     async def stop_evaluation(self, evaluation_id: str) -> bool:
-         """Stop a running evaluation."""
-         if evaluation_id in self._active_evaluations:
-             task = self._active_evaluations[evaluation_id]
-             task.cancel()
-             del self._active_evaluations[evaluation_id]
-             logger.info(f"Stopped evaluation: {evaluation_id}")
-             return True
-         return False
-
-     async def cleanup(self) -> None:
-         """Cleanup resources and stop all running evaluations."""
-         # Cancel all active evaluations
-         for evaluation_id in list(self._active_evaluations.keys()):
-             await self.stop_evaluation(evaluation_id)
-
-         # Close experiment tracker
-         if self.experiment_tracker and self.experiment_tracker.is_running:
-             await self.experiment_tracker.end_run()
-
-         logger.info("EvaluationFactory cleanup completed")
-
-
- # Convenience functions for quick evaluation
- async def evaluate_llm_quick(model_name: str,
-                              provider: str,
-                              dataset_path: str,
-                              metrics: Optional[List[str]] = None) -> EvaluationResult:
-     """
-     Quick LLM evaluation function.
-
-     Args:
-         model_name: Name of the model
-         provider: Model provider
-         dataset_path: Path to dataset
-         metrics: Metrics to compute
-
-     Returns:
-         Evaluation results
-     """
-     factory = EvaluationFactory()
-     try:
-         return await factory.evaluate_llm(
-             model_name=model_name,
-             provider=provider,
-             dataset_path=dataset_path,
-             metrics=metrics
-         )
-     finally:
-         await factory.cleanup()
-
-
- async def run_benchmark_quick(model_name: str,
-                               provider: str,
-                               benchmark_name: str) -> EvaluationResult:
-     """
-     Quick benchmark evaluation function.
-
-     Args:
-         model_name: Name of the model
-         provider: Model provider
-         benchmark_name: Benchmark name
-
-     Returns:
-         Benchmark results
-     """
-     factory = EvaluationFactory()
-     try:
-         return await factory.run_benchmark(
-             model_name=model_name,
-             provider=provider,
-             benchmark_name=benchmark_name
-         )
-     finally:
-         await factory.cleanup()
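For reference, here is a minimal sketch of how the removed convenience helpers were driven from a script, assuming an asyncio entry point. The dataset path, metric names, and benchmark choice are placeholders, and get_summary() is used the same way the module itself calls it on EvaluationResult; this is illustrative only and applies to isa-model 0.4.0, not 0.4.3.

```python
# Illustrative driver for the removed 0.4.0 helpers (gone in 0.4.3).
import asyncio

from isa_model.eval.factory import evaluate_llm_quick, run_benchmark_quick


async def main() -> None:
    # Placeholder dataset path and metrics; these are not shipped fixtures.
    result = await evaluate_llm_quick(
        model_name="gpt-4.1-mini",
        provider="openai",
        dataset_path="evaluation_data.json",
        metrics=["accuracy", "f1_score"],
    )
    print(result.get_summary())

    # Benchmark loading was still a placeholder in 0.4.0 (see _load_benchmark above).
    benchmark = await run_benchmark_quick(
        model_name="gpt-4.1-mini",
        provider="openai",
        benchmark_name="mmlu",
    )
    print(benchmark.get_summary())


if __name__ == "__main__":
    asyncio.run(main())
```

Each helper builds its own EvaluationFactory and awaits cleanup() in a finally block, so callers only need to supply the event loop.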