isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/eval/metrics.py DELETED
@@ -1,798 +0,0 @@
- """
- Evaluation Metrics for ISA Model Framework
-
- This module provides various metrics for evaluating AI models:
- - LLM metrics: perplexity, BLEU, ROUGE, accuracy, etc.
- - Image metrics: FID, IS, LPIPS, etc.
- - Custom metrics and benchmark runners
- """
-
- import os
- import json
- import logging
- import numpy as np
- from typing import Dict, List, Any, Optional, Union
- from enum import Enum
- from abc import ABC, abstractmethod
-
- try:
-     from ..inference.ai_factory import AIFactory
-     AI_FACTORY_AVAILABLE = True
- except ImportError:
-     AI_FACTORY_AVAILABLE = False
-
- logger = logging.getLogger(__name__)
-
-
- class MetricType(str, Enum):
-     """Types of evaluation metrics."""
-     PERPLEXITY = "perplexity"
-     BLEU = "bleu"
-     ROUGE = "rouge"
-     ACCURACY = "accuracy"
-     F1_SCORE = "f1"
-     DIVERSITY = "diversity"
-     COHERENCE = "coherence"
-     FLUENCY = "fluency"
-     FID = "fid"
-     IS = "is"
-     LPIPS = "lpips"
-
-
- class BaseMetric(ABC):
-     """Base class for all metrics."""
-
-     @abstractmethod
-     def compute(self, predictions: List[str], references: List[str] = None, **kwargs) -> Dict[str, float]:
-         """Compute the metric."""
-         pass
-
-
- class LLMMetrics:
-     """
-     Metrics calculator for Language Models.
-
-     Supports various metrics including:
-     - Perplexity
-     - BLEU score
-     - ROUGE score
-     - Accuracy
-     - F1 score
-     - Generation quality metrics
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.PERPLEXITY,
-             MetricType.BLEU,
-             MetricType.ROUGE,
-             MetricType.ACCURACY,
-             MetricType.F1_SCORE,
-             MetricType.DIVERSITY,
-             MetricType.COHERENCE,
-             MetricType.FLUENCY
-         ]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     def evaluate(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         metrics: List[str],
-         batch_size: int = 8,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate LLM on dataset with specified metrics.
-
-         Args:
-             model_path: Path to the model
-             dataset: Evaluation dataset
-             metrics: List of metrics to compute
-             batch_size: Batch size for evaluation
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary with metric results
-         """
-         results = {
-             "model_path": model_path,
-             "num_samples": len(dataset),
-             "metrics": {}
-         }
-
-         # Generate predictions
-         predictions, references = self._generate_predictions(
-             model_path, dataset, batch_size, provider, **kwargs
-         )
-
-         # Compute each metric
-         for metric in metrics:
-             try:
-                 if metric == MetricType.PERPLEXITY:
-                     score = self._compute_perplexity(predictions, references)
-                 elif metric == MetricType.BLEU:
-                     score = self._compute_bleu(predictions, references)
-                 elif metric == MetricType.ROUGE:
-                     score = self._compute_rouge(predictions, references)
-                 elif metric == MetricType.ACCURACY:
-                     score = self._compute_accuracy(predictions, references)
-                 elif metric == MetricType.F1_SCORE:
-                     score = self._compute_f1(predictions, references)
-                 elif metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(predictions)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(predictions)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(predictions)
-                 else:
-                     logger.warning(f"Unknown metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def evaluate_generation(
-         self,
-         model_path: str,
-         prompts: List[str],
-         reference_texts: List[str] = None,
-         metrics: List[str] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate text generation quality.
-
-         Args:
-             model_path: Path to the model
-             prompts: Input prompts
-             reference_texts: Reference texts (optional)
-             metrics: Metrics to compute
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Generation evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.DIVERSITY, MetricType.COHERENCE, MetricType.FLUENCY]
-
-         # Generate texts
-         generated_texts = self._generate_texts(model_path, prompts, provider, **kwargs)
-
-         results = {
-             "model_path": model_path,
-             "num_prompts": len(prompts),
-             "metrics": {}
-         }
-
-         # Compute metrics
-         for metric in metrics:
-             try:
-                 if metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(generated_texts)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(generated_texts)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(generated_texts)
-                 elif metric == MetricType.BLEU and reference_texts:
-                     score = self._compute_bleu(generated_texts, reference_texts)
-                 elif metric == MetricType.ROUGE and reference_texts:
-                     score = self._compute_rouge(generated_texts, reference_texts)
-                 else:
-                     continue
-
-                 results["metrics"][metric] = score
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def _generate_predictions(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         batch_size: int,
-         provider: str,
-         **kwargs
-     ) -> tuple:
-         """Generate predictions from model using actual inference."""
-         predictions = []
-         references = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder predictions")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-             return predictions, references
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Process dataset in batches
-             for i in range(0, len(dataset), batch_size):
-                 batch = dataset[i:i + batch_size]
-                 batch_predictions = []
-                 batch_references = []
-
-                 for item in batch:
-                     if isinstance(item, dict):
-                         prompt = None
-                         reference = None
-
-                         # Extract prompt and reference based on data format
-                         if "input" in item and "output" in item:
-                             prompt = item["input"]
-                             reference = item["output"]
-                         elif "prompt" in item and "response" in item:
-                             prompt = item["prompt"]
-                             reference = item["response"]
-                         elif "question" in item and "answer" in item:
-                             prompt = item["question"]
-                             reference = item["answer"]
-                         elif "text" in item and "label" in item:
-                             prompt = item["text"]
-                             reference = str(item["label"])
-
-                         if prompt and reference:
-                             try:
-                                 # Generate prediction using actual model
-                                 response = await llm_service.ainvoke(prompt)
-
-                                 # Extract text from response
-                                 if hasattr(response, 'text'):
-                                     prediction = response.text
-                                 elif isinstance(response, dict) and 'text' in response:
-                                     prediction = response['text']
-                                 elif isinstance(response, str):
-                                     prediction = response
-                                 else:
-                                     prediction = str(response)
-
-                                 batch_predictions.append(prediction.strip())
-                                 batch_references.append(reference)
-
-                             except Exception as e:
-                                 logger.error(f"Failed to generate prediction for item: {e}")
-                                 # Use fallback prediction
-                                 batch_predictions.append(f"Error generating prediction: {str(e)}")
-                                 batch_references.append(reference)
-
-                 predictions.extend(batch_predictions)
-                 references.extend(batch_references)
-
-                 logger.info(f"Processed batch {i//batch_size + 1}/{(len(dataset) + batch_size - 1)//batch_size}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for predictions: {e}")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-
-         logger.info(f"Generated {len(predictions)} predictions")
-         return predictions, references
-
-     def _generate_texts(
-         self,
-         model_path: str,
-         prompts: List[str],
-         provider: str,
-         **kwargs
-     ) -> List[str]:
-         """Generate texts from prompts using actual model inference."""
-         generated_texts = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder text generation")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-             return generated_texts
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             for prompt in prompts:
-                 try:
-                     # Generate text using actual model
-                     response = await llm_service.ainvoke(prompt)
-
-                     # Extract text from response
-                     if hasattr(response, 'text'):
-                         generated_text = response.text
-                     elif isinstance(response, dict) and 'text' in response:
-                         generated_text = response['text']
-                     elif isinstance(response, str):
-                         generated_text = response
-                     else:
-                         generated_text = str(response)
-
-                     generated_texts.append(generated_text.strip())
-
-                 except Exception as e:
-                     logger.error(f"Failed to generate text for prompt: {e}")
-                     # Use fallback generation
-                     generated_texts.append(f"Error generating text: {str(e)}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for text generation: {e}")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-
-         return generated_texts
-
-     def _compute_perplexity(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute perplexity score (simplified implementation)."""
-         # This is a placeholder - actual perplexity requires model probabilities
-         return {
-             "perplexity": np.random.uniform(10, 100), # Placeholder
-             "log_perplexity": np.random.uniform(2, 5)
-         }
-
-     def _compute_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute BLEU score (simplified implementation)."""
-         try:
-             # Placeholder implementation - use actual BLEU calculation
-             # from nltk.translate.bleu_score import sentence_bleu
-             scores = []
-             for pred, ref in zip(predictions, references):
-                 # Simplified BLEU calculation
-                 pred_words = pred.lower().split()
-                 ref_words = ref.lower().split()
-
-                 # Simple overlap calculation (not actual BLEU)
-                 overlap = len(set(pred_words) & set(ref_words))
-                 total = len(set(pred_words) | set(ref_words))
-
-                 if total > 0:
-                     scores.append(overlap / total)
-                 else:
-                     scores.append(0.0)
-
-             return {
-                 "bleu": np.mean(scores),
-                 "bleu_std": np.std(scores)
-             }
-         except Exception as e:
-             logger.error(f"BLEU computation failed: {e}")
-             return {"bleu": 0.0, "error": str(e)}
-
-     def _compute_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute ROUGE score (simplified implementation)."""
-         try:
-             rouge_1_scores = []
-             rouge_l_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 # ROUGE-1 (unigram overlap)
-                 if len(ref_words) > 0:
-                     rouge_1 = len(pred_words & ref_words) / len(ref_words)
-                     rouge_1_scores.append(rouge_1)
-
-                 # Simplified ROUGE-L (longest common subsequence)
-                 rouge_l = len(pred_words & ref_words) / max(len(pred_words), len(ref_words), 1)
-                 rouge_l_scores.append(rouge_l)
-
-             return {
-                 "rouge_1": np.mean(rouge_1_scores),
-                 "rouge_l": np.mean(rouge_l_scores),
-                 "rouge_1_std": np.std(rouge_1_scores),
-                 "rouge_l_std": np.std(rouge_l_scores)
-             }
-         except Exception as e:
-             logger.error(f"ROUGE computation failed: {e}")
-             return {"rouge_1": 0.0, "rouge_l": 0.0, "error": str(e)}
-
-     def _compute_accuracy(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute accuracy score."""
-         try:
-             correct = 0
-             total = len(predictions)
-
-             for pred, ref in zip(predictions, references):
-                 if pred.strip().lower() == ref.strip().lower():
-                     correct += 1
-
-             accuracy = correct / total if total > 0 else 0.0
-
-             return {
-                 "accuracy": accuracy,
-                 "correct": correct,
-                 "total": total
-             }
-         except Exception as e:
-             logger.error(f"Accuracy computation failed: {e}")
-             return {"accuracy": 0.0, "error": str(e)}
-
-     def _compute_f1(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute F1 score (simplified implementation)."""
-         try:
-             f1_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 if len(pred_words) == 0 and len(ref_words) == 0:
-                     f1_scores.append(1.0)
-                 elif len(pred_words) == 0 or len(ref_words) == 0:
-                     f1_scores.append(0.0)
-                 else:
-                     intersection = len(pred_words & ref_words)
-                     precision = intersection / len(pred_words)
-                     recall = intersection / len(ref_words)
-
-                     if precision + recall > 0:
-                         f1 = 2 * (precision * recall) / (precision + recall)
-                         f1_scores.append(f1)
-                     else:
-                         f1_scores.append(0.0)
-
-             return {
-                 "f1": np.mean(f1_scores),
-                 "f1_std": np.std(f1_scores)
-             }
-         except Exception as e:
-             logger.error(f"F1 computation failed: {e}")
-             return {"f1": 0.0, "error": str(e)}
-
-     def _compute_diversity(self, texts: List[str]) -> Dict[str, float]:
-         """Compute diversity metrics."""
-         try:
-             # Distinct-1 and Distinct-2
-             all_unigrams = []
-             all_bigrams = []
-
-             for text in texts:
-                 words = text.lower().split()
-                 all_unigrams.extend(words)
-
-                 # Create bigrams
-                 for i in range(len(words) - 1):
-                     all_bigrams.append((words[i], words[i + 1]))
-
-             distinct_1 = len(set(all_unigrams)) / len(all_unigrams) if all_unigrams else 0
-             distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
-
-             return {
-                 "distinct_1": distinct_1,
-                 "distinct_2": distinct_2,
-                 "vocab_size": len(set(all_unigrams))
-             }
-         except Exception as e:
-             logger.error(f"Diversity computation failed: {e}")
-             return {"distinct_1": 0.0, "distinct_2": 0.0, "error": str(e)}
-
-     def _compute_coherence(self, texts: List[str]) -> Dict[str, float]:
-         """Compute coherence score (simplified implementation)."""
-         try:
-             # Simplified coherence based on sentence length consistency
-             coherence_scores = []
-
-             for text in texts:
-                 sentences = text.split('.')
-                 if len(sentences) > 1:
-                     lengths = [len(s.split()) for s in sentences if s.strip()]
-                     if lengths:
-                         # Coherence as inverse of length variance
-                         coherence = 1.0 / (1.0 + np.var(lengths))
-                         coherence_scores.append(coherence)
-                     else:
-                         coherence_scores.append(0.5)
-                 else:
-                     coherence_scores.append(0.5)
-
-             return {
-                 "coherence": np.mean(coherence_scores),
-                 "coherence_std": np.std(coherence_scores)
-             }
-         except Exception as e:
-             logger.error(f"Coherence computation failed: {e}")
-             return {"coherence": 0.5, "error": str(e)}
-
-     def _compute_fluency(self, texts: List[str]) -> Dict[str, float]:
-         """Compute fluency score (simplified implementation)."""
-         try:
-             fluency_scores = []
-
-             for text in texts:
-                 # Simplified fluency based on word count and sentence structure
-                 words = text.split()
-                 sentences = text.split('.')
-
-                 if len(words) > 0 and len(sentences) > 0:
-                     avg_words_per_sentence = len(words) / len(sentences)
-                     # Fluency based on reasonable sentence length (5-20 words)
-                     if 5 <= avg_words_per_sentence <= 20:
-                         fluency = 1.0
-                     else:
-                         fluency = max(0.0, 1.0 - abs(avg_words_per_sentence - 12.5) / 12.5)
-
-                     fluency_scores.append(fluency)
-                 else:
-                     fluency_scores.append(0.0)
-
-             return {
-                 "fluency": np.mean(fluency_scores),
-                 "fluency_std": np.std(fluency_scores)
-             }
-         except Exception as e:
-             logger.error(f"Fluency computation failed: {e}")
-             return {"fluency": 0.0, "error": str(e)}
-
-
- class ImageMetrics:
-     """
-     Metrics calculator for Image Generation Models.
-
-     Supports metrics including:
-     - FID (Fréchet Inception Distance)
-     - IS (Inception Score)
-     - LPIPS (Learned Perceptual Image Patch Similarity)
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.FID,
-             MetricType.IS,
-             MetricType.LPIPS
-         ]
-
-     def evaluate(
-         self,
-         model_path: str,
-         test_images_dir: str,
-         reference_images_dir: Optional[str] = None,
-         metrics: List[str] = None,
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate image generation model.
-
-         Args:
-             model_path: Path to the image model
-             test_images_dir: Directory with test images
-             reference_images_dir: Directory with reference images
-             metrics: Metrics to compute
-             **kwargs: Additional parameters
-
-         Returns:
-             Image evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.FID, MetricType.IS]
-
-         results = {
-             "model_path": model_path,
-             "test_images_dir": test_images_dir,
-             "reference_images_dir": reference_images_dir,
-             "metrics": {}
-         }
-
-         for metric in metrics:
-             try:
-                 if metric == MetricType.FID:
-                     score = self._compute_fid(test_images_dir, reference_images_dir)
-                 elif metric == MetricType.IS:
-                     score = self._compute_is(test_images_dir)
-                 elif metric == MetricType.LPIPS:
-                     score = self._compute_lpips(test_images_dir, reference_images_dir)
-                 else:
-                     logger.warning(f"Unknown image metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def _compute_fid(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute FID score (placeholder implementation)."""
-         # This is a placeholder - actual FID requires complex neural network computations
-         logger.warning("FID computation not fully implemented - returning placeholder")
-         return {
-             "fid": np.random.uniform(20, 100), # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_is(self, images_dir: str) -> Dict[str, float]:
-         """Compute Inception Score (placeholder implementation)."""
-         # This is a placeholder - actual IS requires Inception network
-         logger.warning("IS computation not fully implemented - returning placeholder")
-         return {
-             "is_mean": np.random.uniform(2, 10), # Placeholder
-             "is_std": np.random.uniform(0.1, 1.0),
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_lpips(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute LPIPS score (placeholder implementation)."""
-         # This is a placeholder - actual LPIPS requires perceptual loss networks
-         logger.warning("LPIPS computation not fully implemented - returning placeholder")
-         return {
-             "lpips": np.random.uniform(0.1, 0.8), # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-
- class BenchmarkRunner:
-     """
-     Runner for standard AI benchmarks.
-
-     Supports running various benchmarks and collecting results.
-     """
-
-     def __init__(self):
-         self.supported_benchmarks = ["mmlu", "hellaswag", "arc", "gsm8k"]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     def run(
-         self,
-         benchmark,
-         model_path: str,
-         num_shots: int = 0,
-         max_samples: Optional[int] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Run a benchmark evaluation.
-
-         Args:
-             benchmark: Benchmark instance
-             model_path: Path to the model
-             num_shots: Number of few-shot examples
-             max_samples: Maximum samples to evaluate
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Benchmark results
-         """
-         logger.info(f"Running benchmark {benchmark.name} on {model_path}")
-
-         # Load benchmark data
-         test_data = benchmark.load_data(max_samples=max_samples)
-
-         # Run evaluation
-         results = {
-             "benchmark": benchmark.name,
-             "model_path": model_path,
-             "num_shots": num_shots,
-             "num_samples": len(test_data),
-             "results": {}
-         }
-
-         # Process each sample
-         correct = 0
-         total = 0
-
-         for sample in test_data:
-             try:
-                 # Format prompt using benchmark's method
-                 prompt = benchmark.format_prompt(sample)
-
-                 # Generate prediction using actual model
-                 prediction = self._generate_prediction(
-                     model_path, {"prompt": prompt}, num_shots, provider, **kwargs
-                 )
-
-                 # Check if correct
-                 is_correct = benchmark.evaluate_sample(sample, prediction)
-                 if is_correct:
-                     correct += 1
-                 total += 1
-
-             except Exception as e:
-                 logger.error(f"Failed to process sample: {e}")
-                 continue
-
-         # Calculate final score
-         accuracy = correct / total if total > 0 else 0.0
-
-         results["results"] = {
-             "accuracy": accuracy,
-             "correct": correct,
-             "total": total
-         }
-
-         logger.info(f"Benchmark completed: {accuracy:.3f} accuracy ({correct}/{total})")
-         return results
-
-     def _generate_prediction(
-         self,
-         model_path: str,
-         sample: Dict[str, Any],
-         num_shots: int,
-         provider: str,
-         **kwargs
-     ) -> str:
-         """Generate prediction for a sample using actual model inference."""
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder prediction")
-             return "A" # Placeholder answer
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Format the prompt (this should be done by the benchmark)
-             if hasattr(sample, 'get'):
-                 prompt = sample.get('prompt', str(sample))
-             else:
-                 prompt = str(sample)
-
-             # Generate prediction using actual model
-             response = llm_service.generate(
-                 prompt=prompt,
-                 max_tokens=kwargs.get("max_tokens", 50),
-                 temperature=kwargs.get("temperature", 0.0) # Low temperature for consistency
-             )
-
-             # Extract text from response
-             if hasattr(response, 'text'):
-                 prediction = response.text
-             elif isinstance(response, dict) and 'text' in response:
-                 prediction = response['text']
-             elif isinstance(response, str):
-                 prediction = response
-             else:
-                 prediction = str(response)
-
-             return prediction.strip()
-
-         except Exception as e:
-             logger.error(f"Failed to generate prediction: {e}")
-             return "A" # Fallback answer
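
For reference, the lexical scores in the removed module are plain token-set arithmetic rather than calls into a scoring library such as nltk or rouge-score. A minimal standalone sketch of that logic (plain functions lifted from the deleted _compute_f1 and _compute_diversity above; illustrative only, not part of either package version):

from typing import Dict, List, Tuple

def token_overlap_f1(prediction: str, reference: str) -> float:
    """Token-set F1, mirroring the deleted LLMMetrics._compute_f1."""
    pred_words = set(prediction.lower().split())
    ref_words = set(reference.lower().split())
    if not pred_words and not ref_words:
        return 1.0
    if not pred_words or not ref_words:
        return 0.0
    intersection = len(pred_words & ref_words)
    precision = intersection / len(pred_words)
    recall = intersection / len(ref_words)
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

def distinct_n(texts: List[str]) -> Dict[str, float]:
    """Distinct-1/Distinct-2 diversity, mirroring LLMMetrics._compute_diversity."""
    unigrams: List[str] = []
    bigrams: List[Tuple[str, str]] = []
    for text in texts:
        words = text.lower().split()
        unigrams.extend(words)
        bigrams.extend(zip(words, words[1:]))
    return {
        "distinct_1": len(set(unigrams)) / len(unigrams) if unigrams else 0.0,
        "distinct_2": len(set(bigrams)) / len(bigrams) if bigrams else 0.0,
    }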