isa-model 0.4.0-py3-none-any.whl → 0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/eval/metrics.py DELETED
@@ -1,951 +0,0 @@
-"""
-Evaluation Metrics for ISA Model Framework
-
-This module provides various metrics for evaluating AI models:
-- LLM metrics: perplexity, BLEU, ROUGE, accuracy, etc.
-- Image metrics: FID, IS, LPIPS, etc.
-- Custom metrics and benchmark runners
-"""
-
-import os
-import json
-import logging
-import numpy as np
-from typing import Dict, List, Any, Optional, Union
-from enum import Enum
-from abc import ABC, abstractmethod
-
-try:
-    from ..inference.ai_factory import AIFactory
-    AI_FACTORY_AVAILABLE = True
-except ImportError:
-    AI_FACTORY_AVAILABLE = False
-
-logger = logging.getLogger(__name__)
-
-
-class MetricType(str, Enum):
-    """Types of evaluation metrics."""
-    PERPLEXITY = "perplexity"
-    BLEU = "bleu"
-    ROUGE = "rouge"
-    ACCURACY = "accuracy"
-    F1_SCORE = "f1"
-    DIVERSITY = "diversity"
-    COHERENCE = "coherence"
-    FLUENCY = "fluency"
-    FID = "fid"
-    IS = "is"
-    LPIPS = "lpips"
-
-
-class BaseMetric(ABC):
-    """Base class for all metrics."""
-
-    @abstractmethod
-    def compute(self, predictions: List[str], references: List[str] = None, **kwargs) -> Dict[str, float]:
-        """Compute the metric."""
-        pass
-
-
-class LLMMetrics:
-    """
-    Metrics calculator for Language Models.
-
-    Supports various metrics including:
-    - Perplexity
-    - BLEU score
-    - ROUGE score
-    - Accuracy
-    - F1 score
-    - Generation quality metrics
-    """
-
-    def __init__(self):
-        self.available_metrics = [
-            MetricType.PERPLEXITY,
-            MetricType.BLEU,
-            MetricType.ROUGE,
-            MetricType.ACCURACY,
-            MetricType.F1_SCORE,
-            MetricType.DIVERSITY,
-            MetricType.COHERENCE,
-            MetricType.FLUENCY
-        ]
-
-        # Initialize AI factory if available
-        if AI_FACTORY_AVAILABLE:
-            try:
-                self.ai_factory = AIFactory()
-            except Exception as e:
-                logger.warning(f"Failed to initialize AIFactory: {e}")
-                self.ai_factory = None
-        else:
-            self.ai_factory = None
-
-    async def evaluate(
-        self,
-        model_path: str,
-        dataset: List[Dict[str, Any]],
-        metrics: List[str],
-        batch_size: int = 8,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate LLM on dataset with specified metrics.
-
-        Args:
-            model_path: Path to the model
-            dataset: Evaluation dataset
-            metrics: List of metrics to compute
-            batch_size: Batch size for evaluation
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Dictionary with metric results
-        """
-        results = {
-            "model_path": model_path,
-            "num_samples": len(dataset),
-            "metrics": {}
-        }
-
-        # Generate predictions
-        predictions, references = await self._generate_predictions(
-            model_path, dataset, batch_size, provider, **kwargs
-        )
-
-        # Compute each metric
-        for metric in metrics:
-            try:
-                if metric == MetricType.PERPLEXITY:
-                    score = self._compute_perplexity(predictions, references)
-                elif metric == MetricType.BLEU:
-                    score = self._compute_bleu(predictions, references)
-                elif metric == MetricType.ROUGE:
-                    score = self._compute_rouge(predictions, references)
-                elif metric == MetricType.ACCURACY:
-                    score = self._compute_accuracy(predictions, references)
-                elif metric == MetricType.F1_SCORE:
-                    score = self._compute_f1(predictions, references)
-                elif metric == MetricType.DIVERSITY:
-                    score = self._compute_diversity(predictions)
-                elif metric == MetricType.COHERENCE:
-                    score = self._compute_coherence(predictions)
-                elif metric == MetricType.FLUENCY:
-                    score = self._compute_fluency(predictions)
-                else:
-                    logger.warning(f"Unknown metric: {metric}")
-                    continue
-
-                results["metrics"][metric] = score
-                logger.info(f"Computed {metric}: {score}")
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    async def evaluate_generation(
-        self,
-        model_path: str,
-        prompts: List[str],
-        reference_texts: List[str] = None,
-        metrics: List[str] = None,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate text generation quality.
-
-        Args:
-            model_path: Path to the model
-            prompts: Input prompts
-            reference_texts: Reference texts (optional)
-            metrics: Metrics to compute
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Generation evaluation results
-        """
-        if metrics is None:
-            metrics = [MetricType.DIVERSITY, MetricType.COHERENCE, MetricType.FLUENCY]
-
-        # Generate texts
-        generated_texts = self._generate_texts(model_path, prompts, provider, **kwargs)
-
-        results = {
-            "model_path": model_path,
-            "num_prompts": len(prompts),
-            "metrics": {}
-        }
-
-        # Compute metrics
-        for metric in metrics:
-            try:
-                if metric == MetricType.DIVERSITY:
-                    score = self._compute_diversity(generated_texts)
-                elif metric == MetricType.COHERENCE:
-                    score = self._compute_coherence(generated_texts)
-                elif metric == MetricType.FLUENCY:
-                    score = self._compute_fluency(generated_texts)
-                elif metric == MetricType.BLEU and reference_texts:
-                    score = self._compute_bleu(generated_texts, reference_texts)
-                elif metric == MetricType.ROUGE and reference_texts:
-                    score = self._compute_rouge(generated_texts, reference_texts)
-                else:
-                    continue
-
-                results["metrics"][metric] = score
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    async def _generate_predictions(
-        self,
-        model_path: str,
-        dataset: List[Dict[str, Any]],
-        batch_size: int,
-        provider: str,
-        **kwargs
-    ) -> tuple:
-        """Generate predictions from model using actual inference."""
-        predictions = []
-        references = []
-
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder predictions")
-            # Fallback to placeholder predictions
-            for item in dataset:
-                if isinstance(item, dict):
-                    if "input" in item and "output" in item:
-                        predictions.append(f"Generated response for: {item['input']}")
-                        references.append(item["output"])
-                    elif "prompt" in item and "response" in item:
-                        predictions.append(f"Generated response for: {item['prompt']}")
-                        references.append(item["response"])
-            return predictions, references
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            # Process dataset in batches
-            for i in range(0, len(dataset), batch_size):
-                batch = dataset[i:i + batch_size]
-                batch_predictions = []
-                batch_references = []
-
-                for item in batch:
-                    if isinstance(item, dict):
-                        prompt = None
-                        reference = None
-
-                        # Extract prompt and reference based on data format
-                        if "input" in item and "output" in item:
-                            prompt = item["input"]
-                            reference = item["output"]
-                        elif "prompt" in item and "response" in item:
-                            prompt = item["prompt"]
-                            reference = item["response"]
-                        elif "question" in item and "answer" in item:
-                            prompt = item["question"]
-                            reference = item["answer"]
-                        elif "text" in item and "label" in item:
-                            prompt = item["text"]
-                            reference = str(item["label"])
-
-                        if prompt and reference:
-                            try:
-                                # Generate prediction using actual model
-                                response = await llm_service.ainvoke(prompt)
-
-                                # Extract text from response
-                                if hasattr(response, 'text'):
-                                    prediction = response.text
-                                elif isinstance(response, dict) and 'text' in response:
-                                    prediction = response['text']
-                                elif isinstance(response, str):
-                                    prediction = response
-                                else:
-                                    prediction = str(response)
-
-                                batch_predictions.append(prediction.strip())
-                                batch_references.append(reference)
-
-                            except Exception as e:
-                                logger.error(f"Failed to generate prediction for item: {e}")
-                                # Use fallback prediction
-                                batch_predictions.append(f"Error generating prediction: {str(e)}")
-                                batch_references.append(reference)
-
-                predictions.extend(batch_predictions)
-                references.extend(batch_references)
-
-                logger.info(f"Processed batch {i//batch_size + 1}/{(len(dataset) + batch_size - 1)//batch_size}")
-
-        except Exception as e:
-            logger.error(f"Failed to use AIFactory for predictions: {e}")
-            # Fallback to placeholder predictions
-            for item in dataset:
-                if isinstance(item, dict):
-                    if "input" in item and "output" in item:
-                        predictions.append(f"Generated response for: {item['input']}")
-                        references.append(item["output"])
-                    elif "prompt" in item and "response" in item:
-                        predictions.append(f"Generated response for: {item['prompt']}")
-                        references.append(item["response"])
-
-        logger.info(f"Generated {len(predictions)} predictions")
-        return predictions, references
-
-    async def _generate_texts(
-        self,
-        model_path: str,
-        prompts: List[str],
-        provider: str,
-        **kwargs
-    ) -> List[str]:
-        """Generate texts from prompts using actual model inference."""
-        generated_texts = []
-
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder text generation")
-            # Fallback to placeholder generation
-            for prompt in prompts:
-                generated_texts.append(f"Generated response for: {prompt}")
-            return generated_texts
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            for prompt in prompts:
-                try:
-                    # Generate text using actual model
-                    response = await llm_service.ainvoke(prompt)
-
-                    # Extract text from response
-                    if hasattr(response, 'text'):
-                        generated_text = response.text
-                    elif isinstance(response, dict) and 'text' in response:
-                        generated_text = response['text']
-                    elif isinstance(response, str):
-                        generated_text = response
-                    else:
-                        generated_text = str(response)
-
-                    generated_texts.append(generated_text.strip())
-
-                except Exception as e:
-                    logger.error(f"Failed to generate text for prompt: {e}")
-                    # Use fallback generation
-                    generated_texts.append(f"Error generating text: {str(e)}")
-
-        except Exception as e:
-            logger.error(f"Failed to use AIFactory for text generation: {e}")
-            # Fallback to placeholder generation
-            for prompt in prompts:
-                generated_texts.append(f"Generated response for: {prompt}")
-
-        return generated_texts
-
-    def _compute_perplexity(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute perplexity score (simplified implementation)."""
-        # This is a placeholder - actual perplexity requires model probabilities
-        return {
-            "perplexity": np.random.uniform(10, 100),  # Placeholder
-            "log_perplexity": np.random.uniform(2, 5)
-        }
-
-    def _compute_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute BLEU score (simplified implementation)."""
-        try:
-            # Placeholder implementation - use actual BLEU calculation
-            # from nltk.translate.bleu_score import sentence_bleu
-            scores = []
-            for pred, ref in zip(predictions, references):
-                # Simplified BLEU calculation
-                pred_words = pred.lower().split()
-                ref_words = ref.lower().split()
-
-                # Simple overlap calculation (not actual BLEU)
-                overlap = len(set(pred_words) & set(ref_words))
-                total = len(set(pred_words) | set(ref_words))
-
-                if total > 0:
-                    scores.append(overlap / total)
-                else:
-                    scores.append(0.0)
-
-            return {
-                "bleu": np.mean(scores),
-                "bleu_std": np.std(scores)
-            }
-        except Exception as e:
-            logger.error(f"BLEU computation failed: {e}")
-            return {"bleu": 0.0, "error": str(e)}
-
-    def _compute_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute ROUGE score (simplified implementation)."""
-        try:
-            rouge_1_scores = []
-            rouge_l_scores = []
-
-            for pred, ref in zip(predictions, references):
-                pred_words = set(pred.lower().split())
-                ref_words = set(ref.lower().split())
-
-                # ROUGE-1 (unigram overlap)
-                if len(ref_words) > 0:
-                    rouge_1 = len(pred_words & ref_words) / len(ref_words)
-                    rouge_1_scores.append(rouge_1)
-
-                # Simplified ROUGE-L (longest common subsequence)
-                rouge_l = len(pred_words & ref_words) / max(len(pred_words), len(ref_words), 1)
-                rouge_l_scores.append(rouge_l)
-
-            return {
-                "rouge_1": np.mean(rouge_1_scores),
-                "rouge_l": np.mean(rouge_l_scores),
-                "rouge_1_std": np.std(rouge_1_scores),
-                "rouge_l_std": np.std(rouge_l_scores)
-            }
-        except Exception as e:
-            logger.error(f"ROUGE computation failed: {e}")
-            return {"rouge_1": 0.0, "rouge_l": 0.0, "error": str(e)}
-
-    def _compute_accuracy(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute accuracy score."""
-        try:
-            correct = 0
-            total = len(predictions)
-
-            for pred, ref in zip(predictions, references):
-                if pred.strip().lower() == ref.strip().lower():
-                    correct += 1
-
-            accuracy = correct / total if total > 0 else 0.0
-
-            return {
-                "accuracy": accuracy,
-                "correct": correct,
-                "total": total
-            }
-        except Exception as e:
-            logger.error(f"Accuracy computation failed: {e}")
-            return {"accuracy": 0.0, "error": str(e)}
-
-    def _compute_f1(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        """Compute F1 score (simplified implementation)."""
-        try:
-            f1_scores = []
-
-            for pred, ref in zip(predictions, references):
-                pred_words = set(pred.lower().split())
-                ref_words = set(ref.lower().split())
-
-                if len(pred_words) == 0 and len(ref_words) == 0:
-                    f1_scores.append(1.0)
-                elif len(pred_words) == 0 or len(ref_words) == 0:
-                    f1_scores.append(0.0)
-                else:
-                    intersection = len(pred_words & ref_words)
-                    precision = intersection / len(pred_words)
-                    recall = intersection / len(ref_words)
-
-                    if precision + recall > 0:
-                        f1 = 2 * (precision * recall) / (precision + recall)
-                        f1_scores.append(f1)
-                    else:
-                        f1_scores.append(0.0)
-
-            return {
-                "f1": np.mean(f1_scores),
-                "f1_std": np.std(f1_scores)
-            }
-        except Exception as e:
-            logger.error(f"F1 computation failed: {e}")
-            return {"f1": 0.0, "error": str(e)}
-
-    def _compute_diversity(self, texts: List[str]) -> Dict[str, float]:
-        """Compute diversity metrics."""
-        try:
-            # Distinct-1 and Distinct-2
-            all_unigrams = []
-            all_bigrams = []
-
-            for text in texts:
-                words = text.lower().split()
-                all_unigrams.extend(words)
-
-                # Create bigrams
-                for i in range(len(words) - 1):
-                    all_bigrams.append((words[i], words[i + 1]))
-
-            distinct_1 = len(set(all_unigrams)) / len(all_unigrams) if all_unigrams else 0
-            distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
-
-            return {
-                "distinct_1": distinct_1,
-                "distinct_2": distinct_2,
-                "vocab_size": len(set(all_unigrams))
-            }
-        except Exception as e:
-            logger.error(f"Diversity computation failed: {e}")
-            return {"distinct_1": 0.0, "distinct_2": 0.0, "error": str(e)}
-
-    def _compute_coherence(self, texts: List[str]) -> Dict[str, float]:
-        """Compute coherence score (simplified implementation)."""
-        try:
-            # Simplified coherence based on sentence length consistency
-            coherence_scores = []
-
-            for text in texts:
-                sentences = text.split('.')
-                if len(sentences) > 1:
-                    lengths = [len(s.split()) for s in sentences if s.strip()]
-                    if lengths:
-                        # Coherence as inverse of length variance
-                        coherence = 1.0 / (1.0 + np.var(lengths))
-                        coherence_scores.append(coherence)
-                    else:
-                        coherence_scores.append(0.5)
-                else:
-                    coherence_scores.append(0.5)
-
-            return {
-                "coherence": np.mean(coherence_scores),
-                "coherence_std": np.std(coherence_scores)
-            }
-        except Exception as e:
-            logger.error(f"Coherence computation failed: {e}")
-            return {"coherence": 0.5, "error": str(e)}
-
-    def _compute_fluency(self, texts: List[str]) -> Dict[str, float]:
-        """Compute fluency score (simplified implementation)."""
-        try:
-            fluency_scores = []
-
-            for text in texts:
-                # Simplified fluency based on word count and sentence structure
-                words = text.split()
-                sentences = text.split('.')
-
-                if len(words) > 0 and len(sentences) > 0:
-                    avg_words_per_sentence = len(words) / len(sentences)
-                    # Fluency based on reasonable sentence length (5-20 words)
-                    if 5 <= avg_words_per_sentence <= 20:
-                        fluency = 1.0
-                    else:
-                        fluency = max(0.0, 1.0 - abs(avg_words_per_sentence - 12.5) / 12.5)
-
-                    fluency_scores.append(fluency)
-                else:
-                    fluency_scores.append(0.0)
-
-            return {
-                "fluency": np.mean(fluency_scores),
-                "fluency_std": np.std(fluency_scores)
-            }
-        except Exception as e:
-            logger.error(f"Fluency computation failed: {e}")
-            return {"fluency": 0.0, "error": str(e)}
-
-
-class ImageMetrics:
-    """
-    Metrics calculator for Image Generation Models.
-
-    Supports metrics including:
-    - FID (Fréchet Inception Distance)
-    - IS (Inception Score)
-    - LPIPS (Learned Perceptual Image Patch Similarity)
-    """
-
-    def __init__(self):
-        self.available_metrics = [
-            MetricType.FID,
-            MetricType.IS,
-            MetricType.LPIPS
-        ]
-
-    def evaluate(
-        self,
-        model_path: str,
-        test_images_dir: str,
-        reference_images_dir: Optional[str] = None,
-        metrics: List[str] = None,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Evaluate image generation model.
-
-        Args:
-            model_path: Path to the image model
-            test_images_dir: Directory with test images
-            reference_images_dir: Directory with reference images
-            metrics: Metrics to compute
-            **kwargs: Additional parameters
-
-        Returns:
-            Image evaluation results
-        """
-        if metrics is None:
-            metrics = [MetricType.FID, MetricType.IS]
-
-        results = {
-            "model_path": model_path,
-            "test_images_dir": test_images_dir,
-            "reference_images_dir": reference_images_dir,
-            "metrics": {}
-        }
-
-        for metric in metrics:
-            try:
-                if metric == MetricType.FID:
-                    score = self._compute_fid(test_images_dir, reference_images_dir)
-                elif metric == MetricType.IS:
-                    score = self._compute_is(test_images_dir)
-                elif metric == MetricType.LPIPS:
-                    score = self._compute_lpips(test_images_dir, reference_images_dir)
-                else:
-                    logger.warning(f"Unknown image metric: {metric}")
-                    continue
-
-                results["metrics"][metric] = score
-                logger.info(f"Computed {metric}: {score}")
-
-            except Exception as e:
-                logger.error(f"Failed to compute {metric}: {e}")
-                results["metrics"][metric] = {"error": str(e)}
-
-        return results
-
-    def _compute_fid(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-        """Compute FID score (placeholder implementation)."""
-        # This is a placeholder - actual FID requires complex neural network computations
-        logger.warning("FID computation not fully implemented - returning placeholder")
-        return {
-            "fid": np.random.uniform(20, 100),  # Placeholder
-            "note": "Placeholder implementation"
-        }
-
-    def _compute_is(self, images_dir: str) -> Dict[str, float]:
-        """Compute Inception Score (placeholder implementation)."""
-        # This is a placeholder - actual IS requires Inception network
-        logger.warning("IS computation not fully implemented - returning placeholder")
-        return {
-            "is_mean": np.random.uniform(2, 10),  # Placeholder
-            "is_std": np.random.uniform(0.1, 1.0),
-            "note": "Placeholder implementation"
-        }
-
-    def _compute_lpips(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-        """Compute LPIPS score (placeholder implementation)."""
-        # This is a placeholder - actual LPIPS requires perceptual loss networks
-        logger.warning("LPIPS computation not fully implemented - returning placeholder")
-        return {
-            "lpips": np.random.uniform(0.1, 0.8),  # Placeholder
-            "note": "Placeholder implementation"
-        }
-
-
-class BenchmarkRunner:
-    """
-    Runner for standard AI benchmarks.
-
-    Supports running various benchmarks and collecting results.
-    """
-
-    def __init__(self):
-        self.supported_benchmarks = ["mmlu", "hellaswag", "arc", "gsm8k"]
-
-        # Initialize AI factory if available
-        if AI_FACTORY_AVAILABLE:
-            try:
-                self.ai_factory = AIFactory()
-            except Exception as e:
-                logger.warning(f"Failed to initialize AIFactory: {e}")
-                self.ai_factory = None
-        else:
-            self.ai_factory = None
-
-    def run(
-        self,
-        benchmark,
-        model_path: str,
-        num_shots: int = 0,
-        max_samples: Optional[int] = None,
-        provider: str = "ollama",
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Run a benchmark evaluation.
-
-        Args:
-            benchmark: Benchmark instance
-            model_path: Path to the model
-            num_shots: Number of few-shot examples
-            max_samples: Maximum samples to evaluate
-            provider: Model provider
-            **kwargs: Additional parameters
-
-        Returns:
-            Benchmark results
-        """
-        logger.info(f"Running benchmark {benchmark.name} on {model_path}")
-
-        # Load benchmark data
-        test_data = benchmark.load_data(max_samples=max_samples)
-
-        # Run evaluation
-        results = {
-            "benchmark": benchmark.name,
-            "model_path": model_path,
-            "num_shots": num_shots,
-            "num_samples": len(test_data),
-            "results": {}
-        }
-
-        # Process each sample
-        correct = 0
-        total = 0
-
-        for sample in test_data:
-            try:
-                # Format prompt using benchmark's method
-                prompt = benchmark.format_prompt(sample)
-
-                # Generate prediction using actual model
-                prediction = self._generate_prediction(
-                    model_path, {"prompt": prompt}, num_shots, provider, **kwargs
-                )
-
-                # Check if correct
-                is_correct = benchmark.evaluate_sample(sample, prediction)
-                if is_correct:
-                    correct += 1
-                total += 1
-
-            except Exception as e:
-                logger.error(f"Failed to process sample: {e}")
-                continue
-
-        # Calculate final score
-        accuracy = correct / total if total > 0 else 0.0
-
-        results["results"] = {
-            "accuracy": accuracy,
-            "correct": correct,
-            "total": total
-        }
-
-        logger.info(f"Benchmark completed: {accuracy:.3f} accuracy ({correct}/{total})")
-        return results
-
-    def _generate_prediction(
-        self,
-        model_path: str,
-        sample: Dict[str, Any],
-        num_shots: int,
-        provider: str,
-        **kwargs
-    ) -> str:
-        """Generate prediction for a sample using actual model inference."""
-        if not self.ai_factory:
-            logger.warning("AIFactory not available, using placeholder prediction")
-            return "A"  # Placeholder answer
-
-        try:
-            # Get LLM service
-            llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-            # Format the prompt (this should be done by the benchmark)
-            if hasattr(sample, 'get'):
-                prompt = sample.get('prompt', str(sample))
-            else:
-                prompt = str(sample)
-
-            # Generate prediction using actual model
-            response = llm_service.generate(
-                prompt=prompt,
-                max_tokens=kwargs.get("max_tokens", 50),
-                temperature=kwargs.get("temperature", 0.0)  # Low temperature for consistency
-            )
-
-            # Extract text from response
-            if hasattr(response, 'text'):
-                prediction = response.text
-            elif isinstance(response, dict) and 'text' in response:
-                prediction = response['text']
-            elif isinstance(response, str):
-                prediction = response
-            else:
-                prediction = str(response)
-
-            return prediction.strip()
-
-        except Exception as e:
-            logger.error(f"Failed to generate prediction: {e}")
-            return "A"  # Fallback answer
-
-
-# Utility functions for evaluators
-def compute_text_metrics(predictions: Union[str, List[str]],
-                         references: Union[str, List[str]],
-                         aggregate: bool = False) -> Dict[str, float]:
-    """
-    Compute standard text evaluation metrics.
-
-    Args:
-        predictions: Single prediction or list of predictions
-        references: Single reference or list of references
-        aggregate: Whether to compute aggregate metrics for lists
-
-    Returns:
-        Dictionary of computed metrics
-    """
-    try:
-        # Handle single string inputs
-        if isinstance(predictions, str) and isinstance(references, str):
-            pred_list = [predictions]
-            ref_list = [references]
-        else:
-            pred_list = predictions if isinstance(predictions, list) else [str(predictions)]
-            ref_list = references if isinstance(references, list) else [str(references)]
-
-        # Ensure equal lengths
-        min_len = min(len(pred_list), len(ref_list))
-        pred_list = pred_list[:min_len]
-        ref_list = ref_list[:min_len]
-
-        metrics = {}
-
-        # Exact match
-        exact_matches = sum(1 for p, r in zip(pred_list, ref_list) if p.strip().lower() == r.strip().lower())
-        metrics["exact_match"] = exact_matches / len(pred_list) if pred_list else 0.0
-
-        # F1 Score (token-level)
-        f1_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_tokens = set(pred.lower().split())
-            ref_tokens = set(ref.lower().split())
-
-            if not ref_tokens and not pred_tokens:
-                f1_scores.append(1.0)
-            elif not ref_tokens or not pred_tokens:
-                f1_scores.append(0.0)
-            else:
-                intersection = len(pred_tokens & ref_tokens)
-                precision = intersection / len(pred_tokens)
-                recall = intersection / len(ref_tokens)
-
-                if precision + recall > 0:
-                    f1 = 2 * (precision * recall) / (precision + recall)
-                    f1_scores.append(f1)
-                else:
-                    f1_scores.append(0.0)
-
-        metrics["f1_score"] = np.mean(f1_scores) if f1_scores else 0.0
-
-        # BLEU Score (simplified)
-        bleu_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_words = pred.lower().split()
-            ref_words = ref.lower().split()
-
-            # Simple n-gram overlap
-            overlap = len(set(pred_words) & set(ref_words))
-            total = len(set(pred_words) | set(ref_words))
-
-            bleu_scores.append(overlap / total if total > 0 else 0.0)
-
-        metrics["bleu_score"] = np.mean(bleu_scores) if bleu_scores else 0.0
-
-        # ROUGE-L (simplified)
-        rouge_scores = []
-        for pred, ref in zip(pred_list, ref_list):
-            pred_words = set(pred.lower().split())
-            ref_words = set(ref.lower().split())
-
-            if len(ref_words) > 0:
-                rouge_l = len(pred_words & ref_words) / len(ref_words)
-                rouge_scores.append(rouge_l)
-            else:
-                rouge_scores.append(0.0)
-
-        metrics["rouge_l"] = np.mean(rouge_scores) if rouge_scores else 0.0
-
-        # Response length metrics
-        pred_lengths = [len(p.split()) for p in pred_list]
-        ref_lengths = [len(r.split()) for r in ref_list]
-
-        metrics["avg_prediction_length"] = np.mean(pred_lengths) if pred_lengths else 0.0
-        metrics["avg_reference_length"] = np.mean(ref_lengths) if ref_lengths else 0.0
-        metrics["length_ratio"] = (np.mean(pred_lengths) / np.mean(ref_lengths)) if np.mean(ref_lengths) > 0 else 0.0
-
-        # Diversity metrics for predictions
-        if len(pred_list) > 1:
-            all_words = []
-            for pred in pred_list:
-                all_words.extend(pred.lower().split())
-
-            unique_words = len(set(all_words))
-            total_words = len(all_words)
-
-            metrics["vocabulary_diversity"] = unique_words / total_words if total_words > 0 else 0.0
-
-        return metrics
-
-    except Exception as e:
-        logger.error(f"Error computing text metrics: {e}")
-        return {"text_metrics_error": 1.0}
-
-
-def compute_vision_metrics(predictions: List[Any],
-                           references: List[Any],
-                           task_type: str = "general") -> Dict[str, float]:
-    """
-    Compute vision-specific evaluation metrics.
-
-    Args:
-        predictions: List of vision model predictions
-        references: List of reference outputs
-        task_type: Type of vision task (ocr, detection, etc.)
-
-    Returns:
-        Dictionary of computed metrics
-    """
-    try:
-        metrics = {}
-
-        # Basic success rate
-        successful_predictions = sum(1 for p in predictions if p is not None)
-        metrics["prediction_success_rate"] = successful_predictions / len(predictions) if predictions else 0.0
-
-        # Task-specific metrics would be computed by individual evaluators
-        # This is a placeholder for common vision metrics
-
-        if task_type == "ocr":
-            # OCR-specific metrics would be computed in VisionEvaluator
-            pass
-        elif task_type == "detection":
-            # Object detection metrics (IoU, mAP, etc.)
-            pass
-        elif task_type == "classification":
-            # Image classification metrics
-            pass
-
-        return metrics
-
-    except Exception as e:
-        logger.error(f"Error computing vision metrics: {e}")
-        return {"vision_metrics_error": 1.0}