isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
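Among the changes above, the Modal services move from `isa_model/deployment/cloud/modal/` to `isa_model/deployment/modal/services/`, and the `isa_model/eval/` and `isa_model/training/` trees are removed outright. The sketch below ships in neither wheel; it is a minimal illustration, assuming downstream code imported these modules by their full dotted paths, of resolving a relocated module across both layouts.

```python
# Illustrative sketch, not part of either wheel: resolve the vision table
# service module under whichever package layout (0.3.91 or 0.4.3) is installed.
import importlib
from types import ModuleType
from typing import Optional


def load_vision_table_service() -> Optional[ModuleType]:
    candidates = [
        # 0.4.3 layout (isa_model/deployment/modal/services/vision/...)
        "isa_model.deployment.modal.services.vision.isa_vision_table_service",
        # 0.3.91 layout (isa_model/deployment/cloud/modal/..., deleted in 0.4.3)
        "isa_model.deployment.cloud.modal.isa_vision_table_service",
    ]
    for name in candidates:
        try:
            return importlib.import_module(name)
        except ImportError:
            continue
    return None
```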
isa_model/eval/factory.py DELETED
@@ -1,531 +0,0 @@
- """
- Enterprise-Grade Evaluation Factory for ISA Model Framework
-
- Implements industry best practices for AI model evaluation at scale:
- - Async evaluation with concurrency control
- - Comprehensive experiment tracking (W&B, MLflow)
- - Distributed evaluation support
- - Production-ready monitoring and alerting
- - Cost tracking and optimization
- - Reproducible evaluation pipelines
- """
-
- import asyncio
- import logging
- from typing import Optional, Dict, Any, List, Union, Callable
- from pathlib import Path
- import json
-
- from .evaluators import LLMEvaluator, VisionEvaluator, MultimodalEvaluator, EvaluationResult
- from .infrastructure import ExperimentTracker, create_experiment_tracker
- from .config import EvaluationConfig
-
- logger = logging.getLogger(__name__)
-
-
- class EvaluationFactory:
-     """
-     Enterprise-grade evaluation factory implementing MLOps best practices.
-
-     Features:
-     - Multi-modal evaluation support (LLM, Vision, Multimodal)
-     - Async evaluation with smart concurrency management
-     - Comprehensive experiment tracking and visualization
-     - Cost optimization and resource monitoring
-     - Distributed evaluation across multiple GPUs/nodes
-     - Production-ready error handling and retry logic
-     - Automated result storage and comparison
-
-     Example usage:
-     ```python
-     from isa_model.eval import EvaluationFactory
-
-     # Initialize with experiment tracking
-     factory = EvaluationFactory(
-         experiment_tracking={
-             "type": "wandb",
-             "project": "model-evaluation",
-             "entity": "my-team"
-         }
-     )
-
-     # Evaluate LLM on dataset
-     result = await factory.evaluate_llm(
-         model_name="gpt-4.1-mini",
-         provider="openai",
-         dataset_path="path/to/evaluation_data.json",
-         metrics=["accuracy", "f1_score", "bleu_score"],
-         save_results=True
-     )
-
-     # Run benchmark evaluation
-     benchmark_result = await factory.run_benchmark(
-         model_name="claude-sonnet-4",
-         provider="yyds",
-         benchmark_name="mmlu",
-         subjects=["math", "physics", "chemistry"]
-     )
-
-     # Compare multiple models
-     comparison = await factory.compare_models(
-         models=[
-             {"name": "gpt-4.1-mini", "provider": "openai"},
-             {"name": "claude-sonnet-4", "provider": "yyds"}
-         ],
-         dataset_path="comparison_dataset.json"
-     )
-     ```
-     """
-
-     def __init__(self,
-                  config: Optional[Union[Dict[str, Any], EvaluationConfig]] = None,
-                  experiment_tracking: Optional[Dict[str, Any]] = None,
-                  output_dir: Optional[str] = None):
-         """
-         Initialize the enterprise evaluation factory.
-
-         Args:
-             config: Evaluation configuration (dict or EvaluationConfig object)
-             experiment_tracking: Experiment tracking configuration
-             output_dir: Output directory for results
-         """
-         # Initialize configuration
-         if isinstance(config, dict):
-             self.config = EvaluationConfig.from_dict(config)
-         elif isinstance(config, EvaluationConfig):
-             self.config = config
-         else:
-             self.config = EvaluationConfig()
-
-         # Override output directory if provided
-         if output_dir:
-             self.config.output_dir = output_dir
-
-         # Initialize experiment tracker
-         self.experiment_tracker = None
-         if experiment_tracking:
-             try:
-                 self.experiment_tracker = create_experiment_tracker(**experiment_tracking)
-                 logger.info(f"Initialized experiment tracking: {experiment_tracking['type']}")
-             except Exception as e:
-                 logger.warning(f"Failed to initialize experiment tracking: {e}")
-
-         # Initialize evaluators
-         self.llm_evaluator = LLMEvaluator(
-             config=self.config.to_dict(),
-             experiment_tracker=self.experiment_tracker
-         )
-
-         # State tracking
-         self._active_evaluations: Dict[str, asyncio.Task] = {}
-
-         logger.info(f"EvaluationFactory initialized with output dir: {self.config.output_dir}")
-
-     async def evaluate_llm(self,
-                            model_name: str,
-                            provider: str = "openai",
-                            dataset_path: Optional[str] = None,
-                            dataset: Optional[List[Dict[str, Any]]] = None,
-                            metrics: Optional[List[str]] = None,
-                            batch_size: Optional[int] = None,
-                            save_results: bool = True,
-                            experiment_name: Optional[str] = None,
-                            progress_callback: Optional[Callable] = None) -> EvaluationResult:
-         """
-         Evaluate LLM with comprehensive metrics and tracking.
-
-         Args:
-             model_name: Name of the model to evaluate
-             provider: Model provider (openai, yyds, ollama, etc.)
-             dataset_path: Path to evaluation dataset JSON file
-             dataset: Direct dataset input (alternative to dataset_path)
-             metrics: List of metrics to compute
-             batch_size: Batch size for evaluation
-             save_results: Whether to save results to disk
-             experiment_name: Custom experiment name
-             progress_callback: Optional progress callback function
-
-         Returns:
-             Comprehensive evaluation results
-         """
-         # Load dataset
-         if dataset is None:
-             if dataset_path is None:
-                 raise ValueError("Either dataset_path or dataset must be provided")
-             dataset = self._load_dataset(dataset_path)
-
-         # Configure LLM evaluator
-         llm_config = {
-             "provider": provider,
-             "model_name": model_name,
-             "batch_size": batch_size or self.config.batch_size,
-             "temperature": self.config.default_temperature,
-             "max_tokens": self.config.default_max_tokens
-         }
-
-         self.llm_evaluator.config.update(llm_config)
-
-         # Generate experiment name
-         dataset_name = Path(dataset_path).stem if dataset_path else "custom_dataset"
-         experiment_name = experiment_name or f"llm_eval_{model_name}_{dataset_name}"
-
-         # Run evaluation
-         result = await self.llm_evaluator.evaluate(
-             model_interface=None, # Will use AI factory
-             dataset=dataset,
-             dataset_name=dataset_name,
-             model_name=f"{provider}:{model_name}",
-             batch_size=batch_size,
-             progress_callback=progress_callback
-         )
-
-         # Save results if requested
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def run_benchmark(self,
-                             model_name: str,
-                             provider: str,
-                             benchmark_name: str,
-                             subjects: Optional[List[str]] = None,
-                             max_samples: Optional[int] = None,
-                             few_shot: bool = True,
-                             num_shots: int = 5,
-                             save_results: bool = True,
-                             experiment_name: Optional[str] = None) -> EvaluationResult:
-         """
-         Run standardized benchmark evaluation.
-
-         Args:
-             model_name: Name of the model to evaluate
-             provider: Model provider
-             benchmark_name: Name of benchmark (mmlu, hellaswag, arc, gsm8k, etc.)
-             subjects: List of subjects to evaluate (for MMLU)
-             max_samples: Maximum number of samples to evaluate
-             few_shot: Whether to use few-shot examples
-             num_shots: Number of few-shot examples
-             save_results: Whether to save results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Benchmark evaluation results
-         """
-         # Load benchmark dataset
-         benchmark_dataset = await self._load_benchmark(
-             benchmark_name,
-             subjects=subjects,
-             max_samples=max_samples,
-             few_shot=few_shot,
-             num_shots=num_shots
-         )
-
-         # Configure for benchmark evaluation
-         benchmark_config = {
-             "provider": provider,
-             "model_name": model_name,
-             "temperature": 0.0, # Deterministic for benchmarks
-             "max_tokens": 50, # Short answers for most benchmarks
-             "task_type": "benchmark",
-             "benchmark_name": benchmark_name
-         }
-
-         self.llm_evaluator.config.update(benchmark_config)
-
-         # Generate experiment name
-         experiment_name = experiment_name or f"benchmark_{benchmark_name}_{model_name}"
-
-         # Run evaluation
-         result = await self.llm_evaluator.evaluate(
-             model_interface=None,
-             dataset=benchmark_dataset,
-             dataset_name=benchmark_name,
-             model_name=f"{provider}:{model_name}",
-             batch_size=self.config.batch_size
-         )
-
-         # Add benchmark-specific metadata
-         result.config.update({
-             "benchmark_name": benchmark_name,
-             "subjects": subjects,
-             "few_shot": few_shot,
-             "num_shots": num_shots
-         })
-
-         # Save results if requested
-         if save_results:
-             await self._save_results(result, experiment_name)
-
-         return result
-
-     async def compare_models(self,
-                              models: List[Dict[str, str]],
-                              dataset_path: Optional[str] = None,
-                              dataset: Optional[List[Dict[str, Any]]] = None,
-                              benchmark_name: Optional[str] = None,
-                              metrics: Optional[List[str]] = None,
-                              save_results: bool = True,
-                              experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
-         """
-         Compare multiple models on the same evaluation task.
-
-         Args:
-             models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
-             dataset_path: Path to evaluation dataset
-             dataset: Direct dataset input
-             benchmark_name: Benchmark name (alternative to dataset)
-             metrics: Metrics to compute
-             save_results: Whether to save comparison results
-             experiment_name: Custom experiment name
-
-         Returns:
-             Dictionary mapping model names to evaluation results
-         """
-         results = {}
-
-         # Run evaluations concurrently (with concurrency limits)
-         semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)
-
-         async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
-             async with semaphore:
-                 model_name = model_config["name"]
-                 provider = model_config["provider"]
-
-                 if benchmark_name:
-                     result = await self.run_benchmark(
-                         model_name=model_name,
-                         provider=provider,
-                         benchmark_name=benchmark_name,
-                         save_results=False # Save comparison results together
-                     )
-                 else:
-                     result = await self.evaluate_llm(
-                         model_name=model_name,
-                         provider=provider,
-                         dataset_path=dataset_path,
-                         dataset=dataset,
-                         metrics=metrics,
-                         save_results=False
-                     )
-
-                 return f"{provider}:{model_name}", result
-
-         # Execute all evaluations
-         tasks = [evaluate_single_model(model) for model in models]
-         evaluation_results = await asyncio.gather(*tasks)
-
-         # Collect results
-         for model_id, result in evaluation_results:
-             results[model_id] = result
-
-         # Generate comparison report
-         comparison_report = self._generate_comparison_report(results)
-
-         # Save results if requested
-         if save_results:
-             experiment_name = experiment_name or f"model_comparison_{len(models)}_models"
-             await self._save_comparison_results(results, comparison_report, experiment_name)
-
-         return results
-
-     def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
-         """Load dataset from file."""
-         with open(dataset_path, 'r', encoding='utf-8') as f:
-             if dataset_path.endswith('.json'):
-                 dataset = json.load(f)
-             elif dataset_path.endswith('.jsonl'):
-                 dataset = [json.loads(line) for line in f]
-             else:
-                 raise ValueError(f"Unsupported dataset format: {dataset_path}")
-
-         logger.info(f"Loaded dataset with {len(dataset)} samples from {dataset_path}")
-         return dataset
-
-     async def _load_benchmark(self,
-                               benchmark_name: str,
-                               subjects: Optional[List[str]] = None,
-                               max_samples: Optional[int] = None,
-                               few_shot: bool = True,
-                               num_shots: int = 5) -> List[Dict[str, Any]]:
-         """Load benchmark dataset."""
-         # This would integrate with the benchmark loaders
-         # For now, return a placeholder
-         logger.warning(f"Benchmark {benchmark_name} loading not yet implemented")
-
-         # Placeholder benchmark data
-         return [
-             {
-                 "id": f"sample_{i}",
-                 "prompt": f"Sample question {i} for {benchmark_name}",
-                 "reference": "A",
-                 "choices": ["A", "B", "C", "D"] if benchmark_name != "gsm8k" else None
-             }
-             for i in range(min(max_samples or 10, 10))
-         ]
-
-     async def _save_results(self, result: EvaluationResult, experiment_name: str) -> None:
-         """Save evaluation results to disk."""
-         # Create output directory
-         output_dir = Path(self.config.output_dir) / experiment_name
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         # Save main results
-         results_path = output_dir / "results.json"
-         result.save_to_file(results_path)
-
-         # Save detailed predictions if available
-         if result.sample_results:
-             predictions_path = output_dir / "predictions.json"
-             with open(predictions_path, 'w', encoding='utf-8') as f:
-                 json.dump(result.sample_results, f, indent=2, ensure_ascii=False)
-
-         # Save summary
-         summary_path = output_dir / "summary.json"
-         with open(summary_path, 'w', encoding='utf-8') as f:
-             json.dump(result.get_summary(), f, indent=2, ensure_ascii=False)
-
-         logger.info(f"Saved evaluation results to {output_dir}")
-
-     async def _save_comparison_results(self,
-                                        results: Dict[str, EvaluationResult],
-                                        comparison_report: Dict[str, Any],
-                                        experiment_name: str) -> None:
-         """Save model comparison results."""
-         output_dir = Path(self.config.output_dir) / experiment_name
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         # Save individual results
-         for model_id, result in results.items():
-             model_dir = output_dir / model_id.replace(":", "_")
-             model_dir.mkdir(exist_ok=True)
-             result.save_to_file(model_dir / "results.json")
-
-         # Save comparison report
-         comparison_path = output_dir / "comparison_report.json"
-         with open(comparison_path, 'w', encoding='utf-8') as f:
-             json.dump(comparison_report, f, indent=2, ensure_ascii=False)
-
-         logger.info(f"Saved comparison results to {output_dir}")
-
-     def _generate_comparison_report(self, results: Dict[str, EvaluationResult]) -> Dict[str, Any]:
-         """Generate comparison report from multiple model results."""
-         report = {
-             "models_compared": list(results.keys()),
-             "comparison_timestamp": results[list(results.keys())[0]].timestamp,
-             "metric_comparison": {},
-             "rankings": {},
-             "best_model_per_metric": {}
-         }
-
-         # Extract all metrics
-         all_metrics = set()
-         for result in results.values():
-             all_metrics.update(result.metrics.keys())
-
-         # Compare each metric
-         for metric in all_metrics:
-             metric_values = {}
-             for model_id, result in results.items():
-                 if metric in result.metrics:
-                     metric_values[model_id] = result.metrics[metric]
-
-             if metric_values:
-                 # Determine if higher is better
-                 higher_is_better = metric not in ["perplexity", "loss", "error_rate"]
-
-                 # Find best model
-                 best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
-
-                 # Create ranking
-                 sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
-
-                 report["metric_comparison"][metric] = metric_values
-                 report["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
-                 report["best_model_per_metric"][metric] = {"model": best_model[0], "value": best_model[1]}
-
-         return report
-
-     def get_configuration(self) -> Dict[str, Any]:
-         """Get current factory configuration."""
-         return self.config.to_dict()
-
-     def get_active_evaluations(self) -> List[str]:
-         """Get list of currently running evaluations."""
-         return list(self._active_evaluations.keys())
-
-     async def stop_evaluation(self, evaluation_id: str) -> bool:
-         """Stop a running evaluation."""
-         if evaluation_id in self._active_evaluations:
-             task = self._active_evaluations[evaluation_id]
-             task.cancel()
-             del self._active_evaluations[evaluation_id]
-             logger.info(f"Stopped evaluation: {evaluation_id}")
-             return True
-         return False
-
-     async def cleanup(self) -> None:
-         """Cleanup resources and stop all running evaluations."""
-         # Cancel all active evaluations
-         for evaluation_id in list(self._active_evaluations.keys()):
-             await self.stop_evaluation(evaluation_id)
-
-         # Close experiment tracker
-         if self.experiment_tracker and self.experiment_tracker.is_running:
-             await self.experiment_tracker.end_run()
-
-         logger.info("EvaluationFactory cleanup completed")
-
-
- # Convenience functions for quick evaluation
- async def evaluate_llm_quick(model_name: str,
-                              provider: str,
-                              dataset_path: str,
-                              metrics: Optional[List[str]] = None) -> EvaluationResult:
-     """
-     Quick LLM evaluation function.
-
-     Args:
-         model_name: Name of the model
-         provider: Model provider
-         dataset_path: Path to dataset
-         metrics: Metrics to compute
-
-     Returns:
-         Evaluation results
-     """
-     factory = EvaluationFactory()
-     try:
-         return await factory.evaluate_llm(
-             model_name=model_name,
-             provider=provider,
-             dataset_path=dataset_path,
-             metrics=metrics
-         )
-     finally:
-         await factory.cleanup()
-
-
- async def run_benchmark_quick(model_name: str,
-                               provider: str,
-                               benchmark_name: str) -> EvaluationResult:
-     """
-     Quick benchmark evaluation function.
-
-     Args:
-         model_name: Name of the model
-         provider: Model provider
-         benchmark_name: Benchmark name
-
-     Returns:
-         Benchmark results
-     """
-     factory = EvaluationFactory()
-     try:
-         return await factory.run_benchmark(
-             model_name=model_name,
-             provider=provider,
-             benchmark_name=benchmark_name
-         )
-     finally:
-         await factory.cleanup()
isa_model/eval/infrastructure/__init__.py DELETED
@@ -1,24 +0,0 @@
- """
- Infrastructure components for evaluation framework.
-
- Provides robust infrastructure for production-scale evaluation:
- - Async execution and concurrency management
- - Distributed evaluation support
- - Experiment tracking integration
- - Result storage and caching
- - Resource monitoring
- """
-
- from .experiment_tracker import ExperimentTracker, WandBTracker, MLflowTracker
- from .async_runner import AsyncEvaluationRunner
- from .result_storage import ResultStorage
- from .cache_manager import CacheManager
-
- __all__ = [
-     "ExperimentTracker",
-     "WandBTracker",
-     "MLflowTracker",
-     "AsyncEvaluationRunner",
-     "ResultStorage",
-     "CacheManager"
- ]
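Because the whole `isa_model.eval` package (including the `EvaluationFactory` shown above) is removed in 0.4.3, importing it now fails at import time. A minimal sketch of a version-tolerant guard, assuming the caller can simply disable evaluation features when the package is absent:

```python
# Sketch only: tolerate the removal of isa_model.eval when supporting both
# 0.3.91 and 0.4.3. EvaluationFactory exists only in the 0.3.91 wheel.
try:
    from isa_model.eval.factory import EvaluationFactory
except ImportError:  # removed as of the 0.4.3 wheel
    EvaluationFactory = None
```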