isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,469 +0,0 @@
1
- """
2
- Standard AI Benchmarks for ISA Model Framework
3
-
4
- This module provides implementations of standard AI benchmarks:
5
- - MMLU (Massive Multitask Language Understanding)
6
- - HellaSwag (Commonsense Reasoning)
7
- - ARC (AI2 Reasoning Challenge)
8
- - GSM8K (Grade School Math)
9
- """
10
-
11
- import os
12
- import json
13
- import logging
14
- from typing import Dict, List, Any, Optional
15
- from abc import ABC, abstractmethod
16
- from dataclasses import dataclass
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
-
21
@dataclass
class BenchmarkConfig:
    """Settings that describe a single benchmark run.

    Only ``name`` and ``description`` are required; every other field has
    a default so a benchmark can be declared with two arguments.
    """

    # Short identifier of the benchmark (e.g. "MMLU").
    name: str
    # Human-readable summary of the benchmark.
    description: str
    # Number of answer options per question.
    num_choices: int = 4
    # Number of few-shot examples associated with the benchmark.
    few_shot_examples: int = 5
    # Optional cap on the number of samples; None means no cap is set.
    max_samples: Optional[int] = None
    # Optional list of subjects; None means no explicit selection.
    subjects: Optional[List[str]] = None
30
-
31
-
32
class BaseBenchmark(ABC):
    """Abstract parent of every benchmark.

    Subclasses supply data loading, per-sample scoring and the rendering
    of a single example; this class assembles complete prompts from those
    pieces.
    """

    def __init__(self, config: BenchmarkConfig):
        """Store the configuration and mirror its name on the instance."""
        self.config = config
        self.name = config.name
        self.data = None

    @abstractmethod
    def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
        """Return the benchmark samples, optionally limited to ``max_samples``."""
        pass

    @abstractmethod
    def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
        """Return True when ``prediction`` is correct for ``sample``."""
        pass

    def format_prompt(self, sample: Dict[str, Any], few_shot_examples: Optional[List[Dict[str, Any]]] = None) -> str:
        """Build the prompt for ``sample``.

        Each few-shot example is rendered with its answer and followed by
        a blank line; the target sample is rendered last, without its
        answer.
        """
        sections = [
            self._format_single_example(shot, include_answer=True) + "\n\n"
            for shot in (few_shot_examples or [])
        ]
        sections.append(self._format_single_example(sample, include_answer=False))
        return "".join(sections)

    @abstractmethod
    def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
        """Render one example, with or without its answer."""
        pass
68
-
69
-
70
class MMLU(BaseBenchmark):
    """
    MMLU (Massive Multitask Language Understanding) Benchmark

    Multiple-choice questions across the 57 subjects listed in
    ``all_subjects``. ``load_data`` generates placeholder questions; it
    does not read the real MMLU dataset.
    """

    def __init__(self, subjects: Optional[List[str]] = None):
        """Create the benchmark.

        Args:
            subjects: Subjects to include. When omitted (or empty), the
                first 10 entries of ``all_subjects`` are used.
        """
        config = BenchmarkConfig(
            name="MMLU",
            description="Massive Multitask Language Understanding",
            num_choices=4,
            few_shot_examples=5,
            subjects=subjects
        )
        super().__init__(config)

        # MMLU subjects
        self.all_subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "clinical_knowledge", "college_biology", "college_chemistry",
            "college_computer_science", "college_mathematics", "college_medicine",
            "college_physics", "computer_security", "conceptual_physics",
            "econometrics", "electrical_engineering", "elementary_mathematics",
            "formal_logic", "global_facts", "high_school_biology",
            "high_school_chemistry", "high_school_computer_science",
            "high_school_european_history", "high_school_geography",
            "high_school_government_and_politics", "high_school_macroeconomics",
            "high_school_mathematics", "high_school_microeconomics",
            "high_school_physics", "high_school_psychology", "high_school_statistics",
            "high_school_us_history", "high_school_world_history", "human_aging",
            "human_sexuality", "international_law", "jurisprudence",
            "logical_fallacies", "machine_learning", "management", "marketing",
            "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios",
            "nutrition", "philosophy", "prehistory", "professional_accounting",
            "professional_law", "professional_medicine", "professional_psychology",
            "public_relations", "security_studies", "sociology", "us_foreign_policy",
            "virology", "world_religions"
        ]

        self.subjects = subjects or self.all_subjects[:10]  # Use first 10 subjects by default

    def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
        """Generate placeholder MMLU samples (simplified implementation).

        Args:
            max_samples: Upper bound on the number of returned samples.
                ``None`` (or 0) yields 10 samples per subject.

        Returns:
            A list of sample dicts with keys ``subject``, ``question``,
            ``choices``, ``answer`` and ``id``.
        """
        data = []

        for subject in self.subjects:
            if max_samples:
                # Ceiling division. Floor division gave zero questions per
                # subject whenever max_samples was smaller than the number
                # of subjects, so a positive limit could return no data.
                # The final slice below still enforces the exact limit.
                per_subject = min(10, -(-max_samples // len(self.subjects)))
            else:
                per_subject = 10

            for i in range(per_subject):
                data.append({
                    "subject": subject,
                    "question": f"Sample {subject} question {i+1}",
                    "choices": [
                        f"Option A for {subject}",
                        f"Option B for {subject}",
                        f"Option C for {subject}",
                        f"Option D for {subject}"
                    ],
                    "answer": "A",  # Simplified
                    "id": f"{subject}_{i}"
                })

        if max_samples:
            data = data[:max_samples]

        logger.info(f"Loaded {len(data)} MMLU samples across {len(self.subjects)} subjects")
        return data

    @staticmethod
    def _extract_choice(prediction: str) -> Optional[str]:
        """Return the answer letter (A-D) found in ``prediction``, or None.

        Letters are matched only as standalone tokens. A plain substring
        test would treat the "A" inside "ANSWER" as choice A.
        """
        import re

        text = prediction.strip().upper()
        if text in ("A", "B", "C", "D"):
            return text

        # Prefer an explicit "Answer: X" / "The answer is (X)" statement.
        explicit = re.search(r"\bANSWER\b(?:\s+IS)?\s*[:=\-]?\s*\(?([ABCD])\b", text)
        if explicit:
            return explicit.group(1)

        # Otherwise take the first letter that stands on its own,
        # which also covers "(B)", "B." and "B) ...".
        standalone = re.search(r"\b([ABCD])\b", text)
        return standalone.group(1) if standalone else None

    def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
        """Return True when the letter chosen in ``prediction`` equals ``sample["answer"]``."""
        choice = self._extract_choice(prediction)
        return choice is not None and choice == sample["answer"]

    def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
        """Render one MMLU example as subject, question, lettered choices and an answer line."""
        lines = [
            f"Subject: {sample['subject'].replace('_', ' ').title()}\n",
            f"Question: {sample['question']}\n",
        ]

        for i, choice in enumerate(sample['choices']):
            letter = chr(65 + i)  # A, B, C, D
            lines.append(f"{letter}. {choice}\n")

        # The answer line carries no trailing newline.
        lines.append(f"Answer: {sample['answer']}" if include_answer else "Answer:")

        return "".join(lines)
178
-
179
-
180
class HellaSwag(BaseBenchmark):
    """
    HellaSwag Benchmark

    Tests commonsense reasoning about physical situations. ``load_data``
    returns a small set of hand-written placeholder scenarios rather than
    the real dataset.
    """

    def __init__(self):
        """Create the benchmark with its fixed configuration."""
        config = BenchmarkConfig(
            name="HellaSwag",
            description="Commonsense Reasoning about Physical Situations",
            num_choices=4,
            few_shot_examples=10
        )
        super().__init__(config)

    def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
        """Build placeholder HellaSwag samples (simplified implementation).

        Args:
            max_samples: Upper bound on the number of returned samples;
                ``None`` (or 0) returns every built-in scenario.

        Returns:
            A list of sample dicts with keys ``context``, ``question``,
            ``choices``, ``answer`` and ``id``.
        """
        data = []

        sample_contexts = [
            "A person is washing dishes in the kitchen",
            "Someone is riding a bicycle down a hill",
            "A chef is preparing ingredients for cooking",
            "A student is taking notes in class",
            "A gardener is planting flowers"
        ]

        for i, context in enumerate(sample_contexts):
            if max_samples and i >= max_samples:
                break

            data.append({
                "context": context,
                "question": "What happens next?",
                "choices": [
                    f"They continue with the logical next step for scenario {i+1}",
                    f"They do something completely unrelated to scenario {i+1}",
                    f"They stop and do something random in scenario {i+1}",
                    f"They repeat the same action in scenario {i+1}"
                ],
                "answer": "A",  # First choice is usually most logical
                "id": f"hellaswag_{i}"
            })

        logger.info(f"Loaded {len(data)} HellaSwag samples")
        return data

    @staticmethod
    def _extract_choice(prediction: str) -> Optional[str]:
        """Return the answer letter (A-D) found in ``prediction``, or None.

        Letters are matched only as standalone tokens. A plain substring
        test would treat the "A" inside "ANSWER" as choice A.
        """
        import re

        text = prediction.strip().upper()
        if text in ("A", "B", "C", "D"):
            return text

        # Prefer an explicit "Answer: X" / "The answer is (X)" statement.
        explicit = re.search(r"\bANSWER\b(?:\s+IS)?\s*[:=\-]?\s*\(?([ABCD])\b", text)
        if explicit:
            return explicit.group(1)

        # Otherwise take the first letter that stands on its own.
        standalone = re.search(r"\b([ABCD])\b", text)
        return standalone.group(1) if standalone else None

    def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
        """Return True when the letter chosen in ``prediction`` equals ``sample["answer"]``."""
        choice = self._extract_choice(prediction)
        return choice is not None and choice == sample["answer"]

    def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
        """Render one HellaSwag example as context, question, lettered choices and an answer line."""
        lines = [
            f"Context: {sample['context']}\n",
            f"Question: {sample['question']}\n",
        ]

        for i, choice in enumerate(sample['choices']):
            letter = chr(65 + i)  # A, B, C, D
            lines.append(f"{letter}. {choice}\n")

        # The answer line carries no trailing newline.
        lines.append(f"Answer: {sample['answer']}" if include_answer else "Answer:")

        return "".join(lines)
262
-
263
-
264
class ARC(BaseBenchmark):
    """
    ARC (AI2 Reasoning Challenge) Benchmark

    Tests scientific reasoning with grade-school level science questions.
    ``load_data`` returns a small set of built-in questions rather than
    the real dataset.
    """

    def __init__(self, challenge_set: str = "easy"):
        """Create the benchmark.

        Args:
            challenge_set: Label of the question set ("easy" or
                "challenge"); it is embedded in the benchmark name and in
                each sample id.
        """
        config = BenchmarkConfig(
            name=f"ARC-{challenge_set}",
            description=f"AI2 Reasoning Challenge ({challenge_set})",
            num_choices=4,
            few_shot_examples=25
        )
        super().__init__(config)
        self.challenge_set = challenge_set  # "easy" or "challenge"

    def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
        """Build the built-in ARC samples (simplified implementation).

        Args:
            max_samples: Upper bound on the number of returned samples;
                ``None`` (or 0) returns every built-in question.

        Returns:
            A list of sample dicts with keys ``question``, ``choices``,
            ``answer``, ``challenge_set`` and ``id``.
        """
        data = []

        sample_questions = [
            {
                "question": "What happens to water when it freezes?",
                "choices": ["It becomes ice", "It becomes gas", "It disappears", "It becomes hot"],
                "answer": "A"
            },
            {
                "question": "Which planet is closest to the Sun?",
                "choices": ["Earth", "Mars", "Mercury", "Venus"],
                "answer": "C"
            },
            {
                "question": "What do plants need to make their own food?",
                "choices": ["Sunlight and water", "Only water", "Only sunlight", "Soil only"],
                "answer": "A"
            },
            {
                "question": "What is the main gas in Earth's atmosphere?",
                "choices": ["Oxygen", "Carbon dioxide", "Nitrogen", "Hydrogen"],
                "answer": "C"
            },
            {
                "question": "How many legs does a spider have?",
                "choices": ["6", "8", "10", "12"],
                "answer": "B"
            }
        ]

        for i, q in enumerate(sample_questions):
            if max_samples and i >= max_samples:
                break

            data.append({
                "question": q["question"],
                "choices": q["choices"],
                "answer": q["answer"],
                "challenge_set": self.challenge_set,
                "id": f"arc_{self.challenge_set}_{i}"
            })

        logger.info(f"Loaded {len(data)} ARC-{self.challenge_set} samples")
        return data

    @staticmethod
    def _extract_choice(prediction: str) -> Optional[str]:
        """Return the answer letter (A-D) found in ``prediction``, or None.

        Letters are matched only as standalone tokens. A plain substring
        test would treat the "A" inside "ANSWER" as choice A.
        """
        import re

        text = prediction.strip().upper()
        if text in ("A", "B", "C", "D"):
            return text

        # Prefer an explicit "Answer: X" / "The answer is (X)" statement.
        explicit = re.search(r"\bANSWER\b(?:\s+IS)?\s*[:=\-]?\s*\(?([ABCD])\b", text)
        if explicit:
            return explicit.group(1)

        # Otherwise take the first letter that stands on its own.
        standalone = re.search(r"\b([ABCD])\b", text)
        return standalone.group(1) if standalone else None

    def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
        """Return True when the letter chosen in ``prediction`` equals ``sample["answer"]``."""
        choice = self._extract_choice(prediction)
        return choice is not None and choice == sample["answer"]

    def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
        """Render one ARC example as question, lettered choices and an answer line."""
        lines = [f"Question: {sample['question']}\n"]

        for i, choice in enumerate(sample['choices']):
            letter = chr(65 + i)  # A, B, C, D
            lines.append(f"{letter}. {choice}\n")

        # The answer line carries no trailing newline.
        lines.append(f"Answer: {sample['answer']}" if include_answer else "Answer:")

        return "".join(lines)
361
-
362
-
363
class GSM8K(BaseBenchmark):
    """
    GSM8K Benchmark

    Tests mathematical reasoning with grade school math word problems.
    ``load_data`` returns a small set of built-in problems rather than
    the real dataset.
    """

    def __init__(self):
        """Create the benchmark with its fixed configuration."""
        config = BenchmarkConfig(
            name="GSM8K",
            description="Grade School Math 8K",
            num_choices=1,  # Open-ended numerical answers
            few_shot_examples=8
        )
        super().__init__(config)

    def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
        """Build the built-in GSM8K samples (simplified implementation).

        Args:
            max_samples: Upper bound on the number of returned samples;
                ``None`` (or 0) returns every built-in problem.

        Returns:
            A list of sample dicts with keys ``question``, ``answer``
            (a numeric string) and ``id``.
        """
        data = []

        sample_problems = [
            {
                "question": "Janet has 12 apples. She gives 3 apples to her friend and eats 2 apples. How many apples does Janet have left?",
                "answer": "7"
            },
            {
                "question": "A school has 24 students in each class. If there are 5 classes, how many students are there in total?",
                "answer": "120"
            },
            {
                "question": "Tom buys 4 books for $8 each. How much money does Tom spend in total?",
                "answer": "32"
            },
            {
                "question": "Sarah has 36 stickers. She wants to put them equally into 6 albums. How many stickers will be in each album?",
                "answer": "6"
            },
            {
                "question": "A rectangle has a length of 15 cm and a width of 8 cm. What is the area of the rectangle?",
                "answer": "120"
            }
        ]

        for i, problem in enumerate(sample_problems):
            if max_samples and i >= max_samples:
                break

            data.append({
                "question": problem["question"],
                "answer": problem["answer"],
                "id": f"gsm8k_{i}"
            })

        logger.info(f"Loaded {len(data)} GSM8K samples")
        return data

    def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
        """Return True when the last number in ``prediction`` equals the expected answer.

        Numbers are compared by value, so "7", "7.0" and "007" all match
        the answer "7", and "1,000" matches "1000". A bare ``\\d+`` scan
        would split such numbers into fragments and would accept "3.7"
        for the answer "7".
        """
        import re
        from decimal import Decimal, InvalidOperation

        # Whole numbers with optional sign, thousands separators and a
        # decimal part. The lookbehind stops a "-" between digits
        # ("10-3") from being read as a sign and stops a match from
        # starting in the middle of a decimal number.
        numbers = re.findall(r"(?<![\d.])-?\d[\d,]*(?:\.\d+)?", prediction.strip())
        if not numbers:
            return False

        # Take the last number found (often the final answer).
        predicted_answer = numbers[-1].replace(",", "")
        expected_answer = str(sample["answer"]).strip().replace(",", "")

        try:
            return Decimal(predicted_answer) == Decimal(expected_answer)
        except InvalidOperation:
            # Expected answer is not numeric: fall back to text comparison.
            return predicted_answer == expected_answer

    def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
        """Render one GSM8K example as a problem line followed by an answer line."""
        prompt = f"Problem: {sample['question']}\n"

        # The answer line carries no trailing newline.
        if include_answer:
            prompt += f"Answer: {sample['answer']}"
        else:
            prompt += "Answer:"

        return prompt
449
-
450
-
451
- # Convenience functions for creating benchmark instances
452
def create_mmlu_benchmark(subjects: Optional[List[str]] = None) -> MMLU:
    """Build an ``MMLU`` benchmark, forwarding the optional subject list."""
    benchmark = MMLU(subjects=subjects)
    return benchmark
455
-
456
-
457
def create_hellaswag_benchmark() -> HellaSwag:
    """Build a ``HellaSwag`` benchmark with its default configuration."""
    benchmark = HellaSwag()
    return benchmark
460
-
461
-
462
def create_arc_benchmark(challenge_set: str = "easy") -> ARC:
    """Build an ``ARC`` benchmark for the given challenge set."""
    benchmark = ARC(challenge_set=challenge_set)
    return benchmark
465
-
466
-
467
def create_gsm8k_benchmark() -> GSM8K:
    """Build a ``GSM8K`` benchmark with its default configuration."""
    benchmark = GSM8K()
    return benchmark
@@ -1,10 +0,0 @@
1
- """
2
- Configuration management for evaluation framework.
3
- """
4
-
5
- from .evaluation_config import EvaluationConfig, ConfigManager
6
-
7
- __all__ = [
8
- "EvaluationConfig",
9
- "ConfigManager"
10
- ]
@@ -1,108 +0,0 @@
1
- """
2
- Configuration management for evaluation framework
3
- """
4
-
5
- import os
6
- import json
7
- import logging
8
- from typing import Dict, Any, Optional, List
9
- from dataclasses import dataclass, asdict
10
- from pathlib import Path
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
@dataclass
class EvaluationConfig:
    """
    Settings for an evaluation run.

    Creating an instance also creates ``output_dir`` on disk if it does
    not exist yet.
    """

    # General settings
    output_dir: str = "evaluation_results"
    max_concurrent_evaluations: int = 3
    timeout_seconds: int = 600

    # Model settings
    default_provider: str = "openai"
    default_max_tokens: int = 150
    default_temperature: float = 0.1
    batch_size: int = 8

    # Metrics settings
    compute_all_metrics: bool = False
    # None is replaced by a fresh empty list in __post_init__.
    custom_metrics: Optional[List[str]] = None

    # Benchmark settings
    max_samples_per_benchmark: Optional[int] = None
    enable_few_shot: bool = True
    num_shots: int = 5

    # Experiment tracking
    use_wandb: bool = False
    wandb_project: Optional[str] = None
    wandb_entity: Optional[str] = None
    use_mlflow: bool = False
    mlflow_tracking_uri: Optional[str] = None

    # Results settings
    save_predictions: bool = True
    save_detailed_results: bool = True
    export_format: str = "json"  # json, csv, html

    def __post_init__(self):
        """Fill in the per-instance defaults and create the output directory."""
        if self.custom_metrics is None:
            self.custom_metrics = []

        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'EvaluationConfig':
        """
        Build a configuration from a plain dictionary.

        Args:
            config_dict: Mapping of setting names to values; keys that are
                not fields of this class are ignored.

        Returns:
            EvaluationConfig instance
        """
        known_fields = cls.__dataclass_fields__
        accepted = {}
        for key, value in config_dict.items():
            if key in known_fields:
                accepted[key] = value

        return cls(**accepted)

    def to_dict(self) -> Dict[str, Any]:
        """
        Return every setting as a plain dictionary.

        Returns:
            Configuration as dictionary
        """
        return asdict(self)
86
-
87
-
88
class ConfigManager:
    """Registry of named evaluation configurations with a default fallback."""

    def __init__(self, config_dir: str = "configs"):
        """Set up an empty registry and make sure ``config_dir`` exists.

        Args:
            config_dir: Directory associated with this manager; it is
                created when missing.
        """
        self.config_dir = config_dir
        self.configs: Dict[str, EvaluationConfig] = {}
        self.default_config = EvaluationConfig()

        # Ensure config directory exists
        os.makedirs(config_dir, exist_ok=True)

    def get_config(self, config_name: Optional[str] = None) -> EvaluationConfig:
        """Return the configuration registered under ``config_name``.

        The default configuration is returned when no name is given or
        when the name is not registered.
        """
        if config_name is None:
            return self.default_config

        return self.configs.get(config_name, self.default_config)
@@ -1,18 +0,0 @@
1
- """
2
- Evaluators module for ISA Model Framework
3
-
4
- Provides specialized evaluators for different model types and evaluation tasks.
5
- """
6
-
7
- from .base_evaluator import BaseEvaluator, EvaluationResult
8
- from .llm_evaluator import LLMEvaluator
9
- from .vision_evaluator import VisionEvaluator
10
- from .multimodal_evaluator import MultimodalEvaluator
11
-
12
- __all__ = [
13
- "BaseEvaluator",
14
- "EvaluationResult",
15
- "LLMEvaluator",
16
- "VisionEvaluator",
17
- "MultimodalEvaluator"
18
- ]