isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/eval/metrics.py DELETED
@@ -1,951 +0,0 @@
- """
- Evaluation Metrics for ISA Model Framework
-
- This module provides various metrics for evaluating AI models:
- - LLM metrics: perplexity, BLEU, ROUGE, accuracy, etc.
- - Image metrics: FID, IS, LPIPS, etc.
- - Custom metrics and benchmark runners
- """
-
- import os
- import json
- import logging
- import numpy as np
- from typing import Dict, List, Any, Optional, Union
- from enum import Enum
- from abc import ABC, abstractmethod
-
- try:
-     from ..inference.ai_factory import AIFactory
-     AI_FACTORY_AVAILABLE = True
- except ImportError:
-     AI_FACTORY_AVAILABLE = False
-
- logger = logging.getLogger(__name__)
-
-
- class MetricType(str, Enum):
-     """Types of evaluation metrics."""
-     PERPLEXITY = "perplexity"
-     BLEU = "bleu"
-     ROUGE = "rouge"
-     ACCURACY = "accuracy"
-     F1_SCORE = "f1"
-     DIVERSITY = "diversity"
-     COHERENCE = "coherence"
-     FLUENCY = "fluency"
-     FID = "fid"
-     IS = "is"
-     LPIPS = "lpips"
-
-
- class BaseMetric(ABC):
-     """Base class for all metrics."""
-
-     @abstractmethod
-     def compute(self, predictions: List[str], references: List[str] = None, **kwargs) -> Dict[str, float]:
-         """Compute the metric."""
-         pass
-
-
- class LLMMetrics:
-     """
-     Metrics calculator for Language Models.
-
-     Supports various metrics including:
-     - Perplexity
-     - BLEU score
-     - ROUGE score
-     - Accuracy
-     - F1 score
-     - Generation quality metrics
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.PERPLEXITY,
-             MetricType.BLEU,
-             MetricType.ROUGE,
-             MetricType.ACCURACY,
-             MetricType.F1_SCORE,
-             MetricType.DIVERSITY,
-             MetricType.COHERENCE,
-             MetricType.FLUENCY
-         ]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     async def evaluate(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         metrics: List[str],
-         batch_size: int = 8,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate LLM on dataset with specified metrics.
-
-         Args:
-             model_path: Path to the model
-             dataset: Evaluation dataset
-             metrics: List of metrics to compute
-             batch_size: Batch size for evaluation
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary with metric results
-         """
-         results = {
-             "model_path": model_path,
-             "num_samples": len(dataset),
-             "metrics": {}
-         }
-
-         # Generate predictions
-         predictions, references = await self._generate_predictions(
-             model_path, dataset, batch_size, provider, **kwargs
-         )
-
-         # Compute each metric
-         for metric in metrics:
-             try:
-                 if metric == MetricType.PERPLEXITY:
-                     score = self._compute_perplexity(predictions, references)
-                 elif metric == MetricType.BLEU:
-                     score = self._compute_bleu(predictions, references)
-                 elif metric == MetricType.ROUGE:
-                     score = self._compute_rouge(predictions, references)
-                 elif metric == MetricType.ACCURACY:
-                     score = self._compute_accuracy(predictions, references)
-                 elif metric == MetricType.F1_SCORE:
-                     score = self._compute_f1(predictions, references)
-                 elif metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(predictions)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(predictions)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(predictions)
-                 else:
-                     logger.warning(f"Unknown metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     async def evaluate_generation(
-         self,
-         model_path: str,
-         prompts: List[str],
-         reference_texts: List[str] = None,
-         metrics: List[str] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate text generation quality.
-
-         Args:
-             model_path: Path to the model
-             prompts: Input prompts
-             reference_texts: Reference texts (optional)
-             metrics: Metrics to compute
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Generation evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.DIVERSITY, MetricType.COHERENCE, MetricType.FLUENCY]
-
-         # Generate texts
-         generated_texts = self._generate_texts(model_path, prompts, provider, **kwargs)
-
-         results = {
-             "model_path": model_path,
-             "num_prompts": len(prompts),
-             "metrics": {}
-         }
-
-         # Compute metrics
-         for metric in metrics:
-             try:
-                 if metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(generated_texts)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(generated_texts)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(generated_texts)
-                 elif metric == MetricType.BLEU and reference_texts:
-                     score = self._compute_bleu(generated_texts, reference_texts)
-                 elif metric == MetricType.ROUGE and reference_texts:
-                     score = self._compute_rouge(generated_texts, reference_texts)
-                 else:
-                     continue
-
-                 results["metrics"][metric] = score
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     async def _generate_predictions(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         batch_size: int,
-         provider: str,
-         **kwargs
-     ) -> tuple:
-         """Generate predictions from model using actual inference."""
-         predictions = []
-         references = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder predictions")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-             return predictions, references
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Process dataset in batches
-             for i in range(0, len(dataset), batch_size):
-                 batch = dataset[i:i + batch_size]
-                 batch_predictions = []
-                 batch_references = []
-
-                 for item in batch:
-                     if isinstance(item, dict):
-                         prompt = None
-                         reference = None
-
-                         # Extract prompt and reference based on data format
-                         if "input" in item and "output" in item:
-                             prompt = item["input"]
-                             reference = item["output"]
-                         elif "prompt" in item and "response" in item:
-                             prompt = item["prompt"]
-                             reference = item["response"]
-                         elif "question" in item and "answer" in item:
-                             prompt = item["question"]
-                             reference = item["answer"]
-                         elif "text" in item and "label" in item:
-                             prompt = item["text"]
-                             reference = str(item["label"])
-
-                         if prompt and reference:
-                             try:
-                                 # Generate prediction using actual model
-                                 response = await llm_service.ainvoke(prompt)
-
-                                 # Extract text from response
-                                 if hasattr(response, 'text'):
-                                     prediction = response.text
-                                 elif isinstance(response, dict) and 'text' in response:
-                                     prediction = response['text']
-                                 elif isinstance(response, str):
-                                     prediction = response
-                                 else:
-                                     prediction = str(response)
-
-                                 batch_predictions.append(prediction.strip())
-                                 batch_references.append(reference)
-
-                             except Exception as e:
-                                 logger.error(f"Failed to generate prediction for item: {e}")
-                                 # Use fallback prediction
-                                 batch_predictions.append(f"Error generating prediction: {str(e)}")
-                                 batch_references.append(reference)
-
-                 predictions.extend(batch_predictions)
-                 references.extend(batch_references)
-
-                 logger.info(f"Processed batch {i//batch_size + 1}/{(len(dataset) + batch_size - 1)//batch_size}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for predictions: {e}")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-
-         logger.info(f"Generated {len(predictions)} predictions")
-         return predictions, references
-
-     async def _generate_texts(
-         self,
-         model_path: str,
-         prompts: List[str],
-         provider: str,
-         **kwargs
-     ) -> List[str]:
-         """Generate texts from prompts using actual model inference."""
-         generated_texts = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder text generation")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-             return generated_texts
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             for prompt in prompts:
-                 try:
-                     # Generate text using actual model
-                     response = await llm_service.ainvoke(prompt)
-
-                     # Extract text from response
-                     if hasattr(response, 'text'):
-                         generated_text = response.text
-                     elif isinstance(response, dict) and 'text' in response:
-                         generated_text = response['text']
-                     elif isinstance(response, str):
-                         generated_text = response
-                     else:
-                         generated_text = str(response)
-
-                     generated_texts.append(generated_text.strip())
-
-                 except Exception as e:
-                     logger.error(f"Failed to generate text for prompt: {e}")
-                     # Use fallback generation
-                     generated_texts.append(f"Error generating text: {str(e)}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for text generation: {e}")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-
-         return generated_texts
-
-     def _compute_perplexity(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute perplexity score (simplified implementation)."""
-         # This is a placeholder - actual perplexity requires model probabilities
-         return {
-             "perplexity": np.random.uniform(10, 100),  # Placeholder
-             "log_perplexity": np.random.uniform(2, 5)
-         }
-
-     def _compute_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute BLEU score (simplified implementation)."""
-         try:
-             # Placeholder implementation - use actual BLEU calculation
-             # from nltk.translate.bleu_score import sentence_bleu
-             scores = []
-             for pred, ref in zip(predictions, references):
-                 # Simplified BLEU calculation
-                 pred_words = pred.lower().split()
-                 ref_words = ref.lower().split()
-
-                 # Simple overlap calculation (not actual BLEU)
-                 overlap = len(set(pred_words) & set(ref_words))
-                 total = len(set(pred_words) | set(ref_words))
-
-                 if total > 0:
-                     scores.append(overlap / total)
-                 else:
-                     scores.append(0.0)
-
-             return {
-                 "bleu": np.mean(scores),
-                 "bleu_std": np.std(scores)
-             }
-         except Exception as e:
-             logger.error(f"BLEU computation failed: {e}")
-             return {"bleu": 0.0, "error": str(e)}
-
-     def _compute_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute ROUGE score (simplified implementation)."""
-         try:
-             rouge_1_scores = []
-             rouge_l_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 # ROUGE-1 (unigram overlap)
-                 if len(ref_words) > 0:
-                     rouge_1 = len(pred_words & ref_words) / len(ref_words)
-                     rouge_1_scores.append(rouge_1)
-
-                 # Simplified ROUGE-L (longest common subsequence)
-                 rouge_l = len(pred_words & ref_words) / max(len(pred_words), len(ref_words), 1)
-                 rouge_l_scores.append(rouge_l)
-
-             return {
-                 "rouge_1": np.mean(rouge_1_scores),
-                 "rouge_l": np.mean(rouge_l_scores),
-                 "rouge_1_std": np.std(rouge_1_scores),
-                 "rouge_l_std": np.std(rouge_l_scores)
-             }
-         except Exception as e:
-             logger.error(f"ROUGE computation failed: {e}")
-             return {"rouge_1": 0.0, "rouge_l": 0.0, "error": str(e)}
-
-     def _compute_accuracy(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute accuracy score."""
-         try:
-             correct = 0
-             total = len(predictions)
-
-             for pred, ref in zip(predictions, references):
-                 if pred.strip().lower() == ref.strip().lower():
-                     correct += 1
-
-             accuracy = correct / total if total > 0 else 0.0
-
-             return {
-                 "accuracy": accuracy,
-                 "correct": correct,
-                 "total": total
-             }
-         except Exception as e:
-             logger.error(f"Accuracy computation failed: {e}")
-             return {"accuracy": 0.0, "error": str(e)}
-
-     def _compute_f1(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute F1 score (simplified implementation)."""
-         try:
-             f1_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 if len(pred_words) == 0 and len(ref_words) == 0:
-                     f1_scores.append(1.0)
-                 elif len(pred_words) == 0 or len(ref_words) == 0:
-                     f1_scores.append(0.0)
-                 else:
-                     intersection = len(pred_words & ref_words)
-                     precision = intersection / len(pred_words)
-                     recall = intersection / len(ref_words)
-
-                     if precision + recall > 0:
-                         f1 = 2 * (precision * recall) / (precision + recall)
-                         f1_scores.append(f1)
-                     else:
-                         f1_scores.append(0.0)
-
-             return {
-                 "f1": np.mean(f1_scores),
-                 "f1_std": np.std(f1_scores)
-             }
-         except Exception as e:
-             logger.error(f"F1 computation failed: {e}")
-             return {"f1": 0.0, "error": str(e)}
-
-     def _compute_diversity(self, texts: List[str]) -> Dict[str, float]:
-         """Compute diversity metrics."""
-         try:
-             # Distinct-1 and Distinct-2
-             all_unigrams = []
-             all_bigrams = []
-
-             for text in texts:
-                 words = text.lower().split()
-                 all_unigrams.extend(words)
-
-                 # Create bigrams
-                 for i in range(len(words) - 1):
-                     all_bigrams.append((words[i], words[i + 1]))
-
-             distinct_1 = len(set(all_unigrams)) / len(all_unigrams) if all_unigrams else 0
-             distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
-
-             return {
-                 "distinct_1": distinct_1,
-                 "distinct_2": distinct_2,
-                 "vocab_size": len(set(all_unigrams))
-             }
-         except Exception as e:
-             logger.error(f"Diversity computation failed: {e}")
-             return {"distinct_1": 0.0, "distinct_2": 0.0, "error": str(e)}
-
-     def _compute_coherence(self, texts: List[str]) -> Dict[str, float]:
-         """Compute coherence score (simplified implementation)."""
-         try:
-             # Simplified coherence based on sentence length consistency
-             coherence_scores = []
-
-             for text in texts:
-                 sentences = text.split('.')
-                 if len(sentences) > 1:
-                     lengths = [len(s.split()) for s in sentences if s.strip()]
-                     if lengths:
-                         # Coherence as inverse of length variance
-                         coherence = 1.0 / (1.0 + np.var(lengths))
-                         coherence_scores.append(coherence)
-                     else:
-                         coherence_scores.append(0.5)
-                 else:
-                     coherence_scores.append(0.5)
-
-             return {
-                 "coherence": np.mean(coherence_scores),
-                 "coherence_std": np.std(coherence_scores)
-             }
-         except Exception as e:
-             logger.error(f"Coherence computation failed: {e}")
-             return {"coherence": 0.5, "error": str(e)}
-
-     def _compute_fluency(self, texts: List[str]) -> Dict[str, float]:
-         """Compute fluency score (simplified implementation)."""
-         try:
-             fluency_scores = []
-
-             for text in texts:
-                 # Simplified fluency based on word count and sentence structure
-                 words = text.split()
-                 sentences = text.split('.')
-
-                 if len(words) > 0 and len(sentences) > 0:
-                     avg_words_per_sentence = len(words) / len(sentences)
-                     # Fluency based on reasonable sentence length (5-20 words)
-                     if 5 <= avg_words_per_sentence <= 20:
-                         fluency = 1.0
-                     else:
-                         fluency = max(0.0, 1.0 - abs(avg_words_per_sentence - 12.5) / 12.5)
-
-                     fluency_scores.append(fluency)
-                 else:
-                     fluency_scores.append(0.0)
-
-             return {
-                 "fluency": np.mean(fluency_scores),
-                 "fluency_std": np.std(fluency_scores)
-             }
-         except Exception as e:
-             logger.error(f"Fluency computation failed: {e}")
-             return {"fluency": 0.0, "error": str(e)}
-
-
- class ImageMetrics:
-     """
-     Metrics calculator for Image Generation Models.
-
-     Supports metrics including:
-     - FID (Fréchet Inception Distance)
-     - IS (Inception Score)
-     - LPIPS (Learned Perceptual Image Patch Similarity)
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.FID,
-             MetricType.IS,
-             MetricType.LPIPS
-         ]
-
-     def evaluate(
-         self,
-         model_path: str,
-         test_images_dir: str,
-         reference_images_dir: Optional[str] = None,
-         metrics: List[str] = None,
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate image generation model.
-
-         Args:
-             model_path: Path to the image model
-             test_images_dir: Directory with test images
-             reference_images_dir: Directory with reference images
-             metrics: Metrics to compute
-             **kwargs: Additional parameters
-
-         Returns:
-             Image evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.FID, MetricType.IS]
-
-         results = {
-             "model_path": model_path,
-             "test_images_dir": test_images_dir,
-             "reference_images_dir": reference_images_dir,
-             "metrics": {}
-         }
-
-         for metric in metrics:
-             try:
-                 if metric == MetricType.FID:
-                     score = self._compute_fid(test_images_dir, reference_images_dir)
-                 elif metric == MetricType.IS:
-                     score = self._compute_is(test_images_dir)
-                 elif metric == MetricType.LPIPS:
-                     score = self._compute_lpips(test_images_dir, reference_images_dir)
-                 else:
-                     logger.warning(f"Unknown image metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def _compute_fid(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute FID score (placeholder implementation)."""
-         # This is a placeholder - actual FID requires complex neural network computations
-         logger.warning("FID computation not fully implemented - returning placeholder")
-         return {
-             "fid": np.random.uniform(20, 100),  # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_is(self, images_dir: str) -> Dict[str, float]:
-         """Compute Inception Score (placeholder implementation)."""
-         # This is a placeholder - actual IS requires Inception network
-         logger.warning("IS computation not fully implemented - returning placeholder")
-         return {
-             "is_mean": np.random.uniform(2, 10),  # Placeholder
-             "is_std": np.random.uniform(0.1, 1.0),
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_lpips(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute LPIPS score (placeholder implementation)."""
-         # This is a placeholder - actual LPIPS requires perceptual loss networks
-         logger.warning("LPIPS computation not fully implemented - returning placeholder")
-         return {
-             "lpips": np.random.uniform(0.1, 0.8),  # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-
- class BenchmarkRunner:
-     """
-     Runner for standard AI benchmarks.
-
-     Supports running various benchmarks and collecting results.
-     """
-
-     def __init__(self):
-         self.supported_benchmarks = ["mmlu", "hellaswag", "arc", "gsm8k"]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     def run(
-         self,
-         benchmark,
-         model_path: str,
-         num_shots: int = 0,
-         max_samples: Optional[int] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Run a benchmark evaluation.
-
-         Args:
-             benchmark: Benchmark instance
-             model_path: Path to the model
-             num_shots: Number of few-shot examples
-             max_samples: Maximum samples to evaluate
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Benchmark results
-         """
-         logger.info(f"Running benchmark {benchmark.name} on {model_path}")
-
-         # Load benchmark data
-         test_data = benchmark.load_data(max_samples=max_samples)
-
-         # Run evaluation
-         results = {
-             "benchmark": benchmark.name,
-             "model_path": model_path,
-             "num_shots": num_shots,
-             "num_samples": len(test_data),
-             "results": {}
-         }
-
-         # Process each sample
-         correct = 0
-         total = 0
-
-         for sample in test_data:
-             try:
-                 # Format prompt using benchmark's method
-                 prompt = benchmark.format_prompt(sample)
-
-                 # Generate prediction using actual model
-                 prediction = self._generate_prediction(
-                     model_path, {"prompt": prompt}, num_shots, provider, **kwargs
-                 )
-
-                 # Check if correct
-                 is_correct = benchmark.evaluate_sample(sample, prediction)
-                 if is_correct:
-                     correct += 1
-                 total += 1
-
-             except Exception as e:
-                 logger.error(f"Failed to process sample: {e}")
-                 continue
-
-         # Calculate final score
-         accuracy = correct / total if total > 0 else 0.0
-
-         results["results"] = {
-             "accuracy": accuracy,
-             "correct": correct,
-             "total": total
-         }
-
-         logger.info(f"Benchmark completed: {accuracy:.3f} accuracy ({correct}/{total})")
-         return results
-
-     def _generate_prediction(
-         self,
-         model_path: str,
-         sample: Dict[str, Any],
-         num_shots: int,
-         provider: str,
-         **kwargs
-     ) -> str:
-         """Generate prediction for a sample using actual model inference."""
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder prediction")
-             return "A"  # Placeholder answer
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Format the prompt (this should be done by the benchmark)
-             if hasattr(sample, 'get'):
-                 prompt = sample.get('prompt', str(sample))
-             else:
-                 prompt = str(sample)
-
-             # Generate prediction using actual model
-             response = llm_service.generate(
-                 prompt=prompt,
-                 max_tokens=kwargs.get("max_tokens", 50),
-                 temperature=kwargs.get("temperature", 0.0)  # Low temperature for consistency
-             )
-
-             # Extract text from response
-             if hasattr(response, 'text'):
-                 prediction = response.text
-             elif isinstance(response, dict) and 'text' in response:
-                 prediction = response['text']
-             elif isinstance(response, str):
-                 prediction = response
-             else:
-                 prediction = str(response)
-
-             return prediction.strip()
-
-         except Exception as e:
-             logger.error(f"Failed to generate prediction: {e}")
-             return "A"  # Fallback answer
-
-
- # Utility functions for evaluators
- def compute_text_metrics(predictions: Union[str, List[str]],
-                          references: Union[str, List[str]],
-                          aggregate: bool = False) -> Dict[str, float]:
-     """
-     Compute standard text evaluation metrics.
-
-     Args:
-         predictions: Single prediction or list of predictions
-         references: Single reference or list of references
-         aggregate: Whether to compute aggregate metrics for lists
-
-     Returns:
-         Dictionary of computed metrics
-     """
-     try:
-         # Handle single string inputs
-         if isinstance(predictions, str) and isinstance(references, str):
-             pred_list = [predictions]
-             ref_list = [references]
-         else:
-             pred_list = predictions if isinstance(predictions, list) else [str(predictions)]
-             ref_list = references if isinstance(references, list) else [str(references)]
-
-         # Ensure equal lengths
-         min_len = min(len(pred_list), len(ref_list))
-         pred_list = pred_list[:min_len]
-         ref_list = ref_list[:min_len]
-
-         metrics = {}
-
-         # Exact match
-         exact_matches = sum(1 for p, r in zip(pred_list, ref_list) if p.strip().lower() == r.strip().lower())
-         metrics["exact_match"] = exact_matches / len(pred_list) if pred_list else 0.0
-
-         # F1 Score (token-level)
-         f1_scores = []
-         for pred, ref in zip(pred_list, ref_list):
-             pred_tokens = set(pred.lower().split())
-             ref_tokens = set(ref.lower().split())
-
-             if not ref_tokens and not pred_tokens:
-                 f1_scores.append(1.0)
-             elif not ref_tokens or not pred_tokens:
-                 f1_scores.append(0.0)
-             else:
-                 intersection = len(pred_tokens & ref_tokens)
-                 precision = intersection / len(pred_tokens)
-                 recall = intersection / len(ref_tokens)
-
-                 if precision + recall > 0:
-                     f1 = 2 * (precision * recall) / (precision + recall)
-                     f1_scores.append(f1)
-                 else:
-                     f1_scores.append(0.0)
-
-         metrics["f1_score"] = np.mean(f1_scores) if f1_scores else 0.0
-
-         # BLEU Score (simplified)
-         bleu_scores = []
-         for pred, ref in zip(pred_list, ref_list):
-             pred_words = pred.lower().split()
-             ref_words = ref.lower().split()
-
-             # Simple n-gram overlap
-             overlap = len(set(pred_words) & set(ref_words))
-             total = len(set(pred_words) | set(ref_words))
-
-             bleu_scores.append(overlap / total if total > 0 else 0.0)
-
-         metrics["bleu_score"] = np.mean(bleu_scores) if bleu_scores else 0.0
-
-         # ROUGE-L (simplified)
-         rouge_scores = []
-         for pred, ref in zip(pred_list, ref_list):
-             pred_words = set(pred.lower().split())
-             ref_words = set(ref.lower().split())
-
-             if len(ref_words) > 0:
-                 rouge_l = len(pred_words & ref_words) / len(ref_words)
-                 rouge_scores.append(rouge_l)
-             else:
-                 rouge_scores.append(0.0)
-
-         metrics["rouge_l"] = np.mean(rouge_scores) if rouge_scores else 0.0
-
-         # Response length metrics
-         pred_lengths = [len(p.split()) for p in pred_list]
-         ref_lengths = [len(r.split()) for r in ref_list]
-
-         metrics["avg_prediction_length"] = np.mean(pred_lengths) if pred_lengths else 0.0
-         metrics["avg_reference_length"] = np.mean(ref_lengths) if ref_lengths else 0.0
-         metrics["length_ratio"] = (np.mean(pred_lengths) / np.mean(ref_lengths)) if np.mean(ref_lengths) > 0 else 0.0
-
-         # Diversity metrics for predictions
-         if len(pred_list) > 1:
-             all_words = []
-             for pred in pred_list:
-                 all_words.extend(pred.lower().split())
-
-             unique_words = len(set(all_words))
-             total_words = len(all_words)
-
-             metrics["vocabulary_diversity"] = unique_words / total_words if total_words > 0 else 0.0
-
-         return metrics
-
-     except Exception as e:
-         logger.error(f"Error computing text metrics: {e}")
-         return {"text_metrics_error": 1.0}
-
-
- def compute_vision_metrics(predictions: List[Any],
-                            references: List[Any],
-                            task_type: str = "general") -> Dict[str, float]:
-     """
-     Compute vision-specific evaluation metrics.
-
-     Args:
-         predictions: List of vision model predictions
-         references: List of reference outputs
-         task_type: Type of vision task (ocr, detection, etc.)
-
-     Returns:
-         Dictionary of computed metrics
-     """
-     try:
-         metrics = {}
-
-         # Basic success rate
-         successful_predictions = sum(1 for p in predictions if p is not None)
-         metrics["prediction_success_rate"] = successful_predictions / len(predictions) if predictions else 0.0
-
-         # Task-specific metrics would be computed by individual evaluators
-         # This is a placeholder for common vision metrics
-
-         if task_type == "ocr":
-             # OCR-specific metrics would be computed in VisionEvaluator
-             pass
-         elif task_type == "detection":
-             # Object detection metrics (IoU, mAP, etc.)
-             pass
-         elif task_type == "classification":
-             # Image classification metrics
-             pass
-
-         return metrics
-
-     except Exception as e:
-         logger.error(f"Error computing vision metrics: {e}")
-         return {"vision_metrics_error": 1.0}
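For downstream context, here is a minimal, hypothetical usage sketch of the compute_text_metrics helper removed in this release. It is based only on the signature and return keys visible in the deleted file above, assumes isa-model 0.4.0 is still installed, and uses invented example inputs; it is not part of the package diff itself.

# Hypothetical example; the import path below exists only in 0.4.0.
from isa_model.eval.metrics import compute_text_metrics

predictions = ["Paris is the capital of France.", "The answer is 42."]
references = ["Paris is the capital of France.", "42 is the answer."]

scores = compute_text_metrics(predictions, references)
# Keys populated by the removed helper: exact_match, f1_score, bleu_score, rouge_l,
# avg_prediction_length, avg_reference_length, length_ratio, and (for multi-item
# inputs) vocabulary_diversity.
print(scores["exact_match"], scores["f1_score"], scores["rouge_l"])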