isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
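A diff like this one can be reproduced locally from the two wheels. Below is a minimal sketch using only the Python standard library; the wheel filenames are assumed to follow PyPI's normalized naming (isa_model-<version>-py3-none-any.whl), and wheel_diff is a hypothetical helper written for this page, not part of isa-model.

    # Sketch: reproduce a per-file wheel diff with the standard library only.
    import difflib
    import zipfile

    def wheel_diff(old_whl: str, new_whl: str, member: str) -> str:
        """Unified diff of a single member file between two wheel archives."""
        def read_lines(whl: str) -> list:
            with zipfile.ZipFile(whl) as z:
                try:
                    return z.read(member).decode("utf-8").splitlines(keepends=True)
                except KeyError:
                    return []  # member absent in this version (file added or removed)
        return "".join(difflib.unified_diff(
            read_lines(old_whl), read_lines(new_whl),
            fromfile=f"{old_whl}:{member}", tofile=f"{new_whl}:{member}",
        ))

    # Example: the eval module deleted in 0.4.3 (entry 137 in the list below,
    # reproduced in full at the end of this page).
    print(wheel_diff("isa_model-0.4.0-py3-none-any.whl",
                     "isa_model-0.4.3-py3-none-any.whl",
                     "isa_model/eval/benchmarks.py"))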
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/eval/benchmarks.py (deleted)
@@ -1,701 +0,0 @@
- """
- Standard AI Benchmarks for ISA Model Framework
-
- This module provides implementations of standard AI benchmarks:
- - MMLU (Massive Multitask Language Understanding)
- - HellaSwag (Commonsense Reasoning)
- - ARC (AI2 Reasoning Challenge)
- - GSM8K (Grade School Math)
- """
-
- import os
- import json
- import logging
- import requests
- import zipfile
- import tarfile
- from pathlib import Path
- from typing import Dict, List, Any, Optional
- from abc import ABC, abstractmethod
- from dataclasses import dataclass
- import pandas as pd
-
- logger = logging.getLogger(__name__)
-
-
- class DatasetDownloader:
-     """Utility class for downloading and caching benchmark datasets."""
-
-     def __init__(self, cache_dir: str = "~/.isa_model/datasets"):
-         self.cache_dir = Path(cache_dir).expanduser()
-         self.cache_dir.mkdir(parents=True, exist_ok=True)
-
-         # Dataset URLs and info
-         self.dataset_info = {
-             "mmlu": {
-                 "url": "https://people.eecs.berkeley.edu/~hendrycks/data.tar",
-                 "filename": "mmlu_data.tar",
-                 "extracted_dir": "data"
-             },
-             "hellaswag": {
-                 "url": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl",
-                 "filename": "hellaswag_val.jsonl"
-             },
-             "arc": {
-                 "url": "https://s3-us-west-2.amazonaws.com/ai2-website/data/ARC-V1-Feb2018.zip",
-                 "filename": "arc_data.zip",
-                 "extracted_dir": "ARC-V1-Feb2018-2"
-             },
-             "gsm8k": {
-                 "url": "https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl",
-                 "filename": "gsm8k_test.jsonl"
-             }
-         }
-
-     def download_dataset(self, dataset_name: str, force_download: bool = False) -> Path:
-         """Download and cache a dataset."""
-         if dataset_name not in self.dataset_info:
-             raise ValueError(f"Unknown dataset: {dataset_name}")
-
-         info = self.dataset_info[dataset_name]
-         dataset_dir = self.cache_dir / dataset_name
-         dataset_dir.mkdir(exist_ok=True)
-
-         file_path = dataset_dir / info["filename"]
-
-         # Check if already downloaded
-         if file_path.exists() and not force_download:
-             logger.info(f"Using cached {dataset_name} dataset at {file_path}")
-             return self._get_data_path(dataset_name, file_path)
-
-         # Download the dataset
-         logger.info(f"Downloading {dataset_name} dataset from {info['url']}")
-         try:
-             response = requests.get(info["url"], stream=True)
-             response.raise_for_status()
-
-             with open(file_path, 'wb') as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-
-             logger.info(f"Downloaded {dataset_name} dataset to {file_path}")
-
-             # Extract if needed
-             return self._get_data_path(dataset_name, file_path)
-
-         except Exception as e:
-             logger.error(f"Failed to download {dataset_name}: {e}")
-             # Fall back to placeholder data
-             return None
-
-     def _get_data_path(self, dataset_name: str, file_path: Path) -> Path:
-         """Get the actual data path, extracting archives if needed."""
-         info = self.dataset_info[dataset_name]
-
-         if "extracted_dir" in info:
-             # Need to extract
-             extract_dir = file_path.parent / info["extracted_dir"]
-
-             if not extract_dir.exists():
-                 logger.info(f"Extracting {file_path}")
-
-                 if file_path.suffix == ".zip":
-                     with zipfile.ZipFile(file_path, 'r') as zip_ref:
-                         zip_ref.extractall(file_path.parent)
-                 elif file_path.suffix == ".tar" or ".tar." in file_path.name:
-                     with tarfile.open(file_path, 'r') as tar_ref:
-                         tar_ref.extractall(file_path.parent)
-
-             return extract_dir
-         else:
-             return file_path
-
-
- @dataclass
- class BenchmarkConfig:
-     """Configuration for benchmark evaluation."""
-     name: str
-     description: str
-     num_choices: int = 4
-     few_shot_examples: int = 5
-     max_samples: Optional[int] = None
-     subjects: Optional[List[str]] = None
-
-
- class BaseBenchmark(ABC):
-     """Base class for all benchmarks."""
-
-     def __init__(self, config: BenchmarkConfig):
-         self.config = config
-         self.name = config.name
-         self.data = None
-         self.downloader = DatasetDownloader()
-         self.use_real_data = True  # Flag to control real vs placeholder data
-
-     @abstractmethod
-     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load benchmark data."""
-         pass
-
-     @abstractmethod
-     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
-         """Evaluate a single sample."""
-         pass
-
-     def format_prompt(self, sample: Dict[str, Any], few_shot_examples: Optional[List[Dict[str, Any]]] = None) -> str:
-         """Format prompt for the sample."""
-         prompt = ""
-
-         # Add few-shot examples if provided
-         if few_shot_examples:
-             for example in few_shot_examples:
-                 prompt += self._format_single_example(example, include_answer=True) + "\n\n"
-
-         # Add the actual question
-         prompt += self._format_single_example(sample, include_answer=False)
-
-         return prompt
-
-     @abstractmethod
-     def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
-         """Format a single example."""
-         pass
-
-
- class MMLU(BaseBenchmark):
-     """
-     MMLU (Massive Multitask Language Understanding) Benchmark
-
-     Tests knowledge across 57 subjects including mathematics, history,
-     computer science, law, and more.
-     """
-
-     def __init__(self, subjects: Optional[List[str]] = None):
-         config = BenchmarkConfig(
-             name="MMLU",
-             description="Massive Multitask Language Understanding",
-             num_choices=4,
-             few_shot_examples=5,
-             subjects=subjects
-         )
-         super().__init__(config)
-
-         # MMLU subjects
-         self.all_subjects = [
-             "abstract_algebra", "anatomy", "astronomy", "business_ethics",
-             "clinical_knowledge", "college_biology", "college_chemistry",
-             "college_computer_science", "college_mathematics", "college_medicine",
-             "college_physics", "computer_security", "conceptual_physics",
-             "econometrics", "electrical_engineering", "elementary_mathematics",
-             "formal_logic", "global_facts", "high_school_biology",
-             "high_school_chemistry", "high_school_computer_science",
-             "high_school_european_history", "high_school_geography",
-             "high_school_government_and_politics", "high_school_macroeconomics",
-             "high_school_mathematics", "high_school_microeconomics",
-             "high_school_physics", "high_school_psychology", "high_school_statistics",
-             "high_school_us_history", "high_school_world_history", "human_aging",
-             "human_sexuality", "international_law", "jurisprudence",
-             "logical_fallacies", "machine_learning", "management", "marketing",
-             "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios",
-             "nutrition", "philosophy", "prehistory", "professional_accounting",
-             "professional_law", "professional_medicine", "professional_psychology",
-             "public_relations", "security_studies", "sociology", "us_foreign_policy",
-             "virology", "world_religions"
-         ]
-
-         self.subjects = subjects or self.all_subjects[:10]  # Use first 10 subjects by default
-
-     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load MMLU data with real dataset support."""
-         if self.use_real_data:
-             try:
-                 return self._load_real_mmlu_data(max_samples)
-             except Exception as e:
-                 logger.warning(f"Failed to load real MMLU data: {e}. Falling back to placeholder data.")
-                 return self._load_placeholder_mmlu_data(max_samples)
-         else:
-             return self._load_placeholder_mmlu_data(max_samples)
-
-     def _load_real_mmlu_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load real MMLU dataset."""
-         data_path = self.downloader.download_dataset("mmlu")
-         if not data_path or not data_path.exists():
-             raise FileNotFoundError("MMLU dataset not found")
-
-         data = []
-         samples_per_subject = max_samples // len(self.subjects) if max_samples else None
-
-         for subject in self.subjects:
-             subject_file = data_path / "test" / f"{subject}_test.csv"
-             if not subject_file.exists():
-                 logger.warning(f"Subject file not found: {subject_file}")
-                 continue
-
-             try:
-                 # Load CSV data
-                 df = pd.read_csv(subject_file, header=None,
-                                  names=["question", "A", "B", "C", "D", "answer"])
-
-                 # Convert to our format
-                 for idx, row in df.iterrows():
-                     if samples_per_subject and len([d for d in data if d["subject"] == subject]) >= samples_per_subject:
-                         break
-
-                     sample = {
-                         "subject": subject,
-                         "question": row["question"],
-                         "choices": [row["A"], row["B"], row["C"], row["D"]],
-                         "answer": str(row["answer"]).strip().upper(),
-                         "id": f"{subject}_{idx}"
-                     }
-                     data.append(sample)
-
-             except Exception as e:
-                 logger.error(f"Error loading subject {subject}: {e}")
-                 continue
-
-         if max_samples:
-             data = data[:max_samples]
-
-         logger.info(f"Loaded {len(data)} real MMLU samples across {len(self.subjects)} subjects")
-         return data
-
-     def _load_placeholder_mmlu_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load placeholder MMLU data."""
-         data = []
-
-         for subject in self.subjects:
-             # Generate sample questions for each subject
-             for i in range(min(10, max_samples // len(self.subjects) if max_samples else 10)):
-                 sample = {
-                     "subject": subject,
-                     "question": f"Sample {subject} question {i+1}",
-                     "choices": [
-                         f"Option A for {subject}",
-                         f"Option B for {subject}",
-                         f"Option C for {subject}",
-                         f"Option D for {subject}"
-                     ],
-                     "answer": "A",  # Simplified
-                     "id": f"{subject}_{i}"
-                 }
-                 data.append(sample)
-
-         if max_samples:
-             data = data[:max_samples]
-
-         logger.info(f"Loaded {len(data)} placeholder MMLU samples across {len(self.subjects)} subjects")
-         return data
-
-     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
-         """Evaluate a single MMLU sample."""
-         # Extract the letter choice from prediction
-         prediction = prediction.strip().upper()
-
-         # Handle various response formats
-         if prediction in ["A", "B", "C", "D"]:
-             return prediction == sample["answer"]
-         elif prediction.startswith("(") and prediction.endswith(")"):
-             letter = prediction[1]
-             return letter == sample["answer"]
-         else:
-             # Try to find A, B, C, or D in the response
-             for choice in ["A", "B", "C", "D"]:
-                 if choice in prediction:
-                     return choice == sample["answer"]
-
-         return False
-
-     def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
-         """Format a single MMLU example."""
-         prompt = f"Subject: {sample['subject'].replace('_', ' ').title()}\n"
-         prompt += f"Question: {sample['question']}\n"
-
-         choices = sample['choices']
-         for i, choice in enumerate(choices):
-             letter = chr(65 + i)  # A, B, C, D
-             prompt += f"{letter}. {choice}\n"
-
-         if include_answer:
-             prompt += f"Answer: {sample['answer']}"
-         else:
-             prompt += "Answer:"
-
-         return prompt
-
-
- class HellaSwag(BaseBenchmark):
-     """
-     HellaSwag Benchmark
-
-     Tests commonsense reasoning about physical situations.
-     """
-
-     def __init__(self):
-         config = BenchmarkConfig(
-             name="HellaSwag",
-             description="Commonsense Reasoning about Physical Situations",
-             num_choices=4,
-             few_shot_examples=10
-         )
-         super().__init__(config)
-
-     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load HellaSwag data with real dataset support."""
-         if self.use_real_data:
-             try:
-                 return self._load_real_hellaswag_data(max_samples)
-             except Exception as e:
-                 logger.warning(f"Failed to load real HellaSwag data: {e}. Falling back to placeholder data.")
-                 return self._load_placeholder_hellaswag_data(max_samples)
-         else:
-             return self._load_placeholder_hellaswag_data(max_samples)
-
-     def _load_real_hellaswag_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load real HellaSwag dataset."""
-         data_path = self.downloader.download_dataset("hellaswag")
-         if not data_path or not data_path.exists():
-             raise FileNotFoundError("HellaSwag dataset not found")
-
-         data = []
-
-         try:
-             with open(data_path, 'r', encoding='utf-8') as f:
-                 for i, line in enumerate(f):
-                     if max_samples and i >= max_samples:
-                         break
-
-                     item = json.loads(line.strip())
-
-                     sample = {
-                         "context": item["ctx"],
-                         "question": "What happens next?",
-                         "choices": item["endings"],
-                         "answer": chr(65 + int(item["label"])),  # Convert 0,1,2,3 to A,B,C,D
-                         "id": f"hellaswag_{item.get('ind', i)}"
-                     }
-                     data.append(sample)
-
-         except Exception as e:
-             logger.error(f"Error loading HellaSwag data: {e}")
-             raise
-
-         logger.info(f"Loaded {len(data)} real HellaSwag samples")
-         return data
-
-     def _load_placeholder_hellaswag_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load placeholder HellaSwag data."""
-         data = []
-
-         sample_contexts = [
-             "A person is washing dishes in the kitchen",
-             "Someone is riding a bicycle down a hill",
-             "A chef is preparing ingredients for cooking",
-             "A student is taking notes in class",
-             "A gardener is planting flowers"
-         ]
-
-         for i, context in enumerate(sample_contexts):
-             if max_samples and i >= max_samples:
-                 break
-
-             sample = {
-                 "context": context,
-                 "question": "What happens next?",
-                 "choices": [
-                     f"They continue with the logical next step for scenario {i+1}",
-                     f"They do something completely unrelated to scenario {i+1}",
-                     f"They stop and do something random in scenario {i+1}",
-                     f"They repeat the same action in scenario {i+1}"
-                 ],
-                 "answer": "A",  # First choice is usually most logical
-                 "id": f"hellaswag_{i}"
-             }
-             data.append(sample)
-
-         logger.info(f"Loaded {len(data)} placeholder HellaSwag samples")
-         return data
-
-     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
-         """Evaluate a single HellaSwag sample."""
-         prediction = prediction.strip().upper()
-
-         if prediction in ["A", "B", "C", "D"]:
-             return prediction == sample["answer"]
-
-         # Try to extract choice from longer response
-         for choice in ["A", "B", "C", "D"]:
-             if choice in prediction:
-                 return choice == sample["answer"]
-
-         return False
-
-     def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
-         """Format a single HellaSwag example."""
-         prompt = f"Context: {sample['context']}\n"
-         prompt += f"Question: {sample['question']}\n"
-
-         choices = sample['choices']
-         for i, choice in enumerate(choices):
-             letter = chr(65 + i)  # A, B, C, D
-             prompt += f"{letter}. {choice}\n"
-
-         if include_answer:
-             prompt += f"Answer: {sample['answer']}"
-         else:
-             prompt += "Answer:"
-
-         return prompt
-
-
- class ARC(BaseBenchmark):
-     """
-     ARC (AI2 Reasoning Challenge) Benchmark
-
-     Tests scientific reasoning with grade-school level science questions.
-     """
-
-     def __init__(self, challenge_set: str = "easy"):
-         config = BenchmarkConfig(
-             name=f"ARC-{challenge_set}",
-             description=f"AI2 Reasoning Challenge ({challenge_set})",
-             num_choices=4,
-             few_shot_examples=25
-         )
-         super().__init__(config)
-         self.challenge_set = challenge_set  # "easy" or "challenge"
-
-     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load ARC data (simplified implementation)."""
-         # This is a simplified implementation
-         # In practice, you'd load from the actual ARC dataset
-
-         data = []
-
-         sample_questions = [
-             {
-                 "question": "What happens to water when it freezes?",
-                 "choices": ["It becomes ice", "It becomes gas", "It disappears", "It becomes hot"],
-                 "answer": "A"
-             },
-             {
-                 "question": "Which planet is closest to the Sun?",
-                 "choices": ["Earth", "Mars", "Mercury", "Venus"],
-                 "answer": "C"
-             },
-             {
-                 "question": "What do plants need to make their own food?",
-                 "choices": ["Sunlight and water", "Only water", "Only sunlight", "Soil only"],
-                 "answer": "A"
-             },
-             {
-                 "question": "What is the main gas in Earth's atmosphere?",
-                 "choices": ["Oxygen", "Carbon dioxide", "Nitrogen", "Hydrogen"],
-                 "answer": "C"
-             },
-             {
-                 "question": "How many legs does a spider have?",
-                 "choices": ["6", "8", "10", "12"],
-                 "answer": "B"
-             }
-         ]
-
-         for i, q in enumerate(sample_questions):
-             if max_samples and i >= max_samples:
-                 break
-
-             sample = {
-                 "question": q["question"],
-                 "choices": q["choices"],
-                 "answer": q["answer"],
-                 "challenge_set": self.challenge_set,
-                 "id": f"arc_{self.challenge_set}_{i}"
-             }
-             data.append(sample)
-
-         logger.info(f"Loaded {len(data)} ARC-{self.challenge_set} samples")
-         return data
-
-     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
-         """Evaluate a single ARC sample."""
-         prediction = prediction.strip().upper()
-
-         if prediction in ["A", "B", "C", "D"]:
-             return prediction == sample["answer"]
-
-         # Try to extract choice from longer response
-         for choice in ["A", "B", "C", "D"]:
-             if choice in prediction:
-                 return choice == sample["answer"]
-
-         return False
-
-     def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
-         """Format a single ARC example."""
-         prompt = f"Question: {sample['question']}\n"
-
-         choices = sample['choices']
-         for i, choice in enumerate(choices):
-             letter = chr(65 + i)  # A, B, C, D
-             prompt += f"{letter}. {choice}\n"
-
-         if include_answer:
-             prompt += f"Answer: {sample['answer']}"
-         else:
-             prompt += "Answer:"
-
-         return prompt
-
-
- class GSM8K(BaseBenchmark):
-     """
-     GSM8K Benchmark
-
-     Tests mathematical reasoning with grade school math word problems.
-     """
-
-     def __init__(self):
-         config = BenchmarkConfig(
-             name="GSM8K",
-             description="Grade School Math 8K",
-             num_choices=1,  # Open-ended numerical answers
-             few_shot_examples=8
-         )
-         super().__init__(config)
-
-     def load_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load GSM8K data with real dataset support."""
-         if self.use_real_data:
-             try:
-                 return self._load_real_gsm8k_data(max_samples)
-             except Exception as e:
-                 logger.warning(f"Failed to load real GSM8K data: {e}. Falling back to placeholder data.")
-                 return self._load_placeholder_gsm8k_data(max_samples)
-         else:
-             return self._load_placeholder_gsm8k_data(max_samples)
-
-     def _load_real_gsm8k_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load real GSM8K dataset."""
-         data_path = self.downloader.download_dataset("gsm8k")
-         if not data_path or not data_path.exists():
-             raise FileNotFoundError("GSM8K dataset not found")
-
-         data = []
-
-         try:
-             with open(data_path, 'r', encoding='utf-8') as f:
-                 for i, line in enumerate(f):
-                     if max_samples and i >= max_samples:
-                         break
-
-                     item = json.loads(line.strip())
-
-                     # Extract numerical answer from solution
-                     answer_text = item["answer"]
-                     import re
-                     numbers = re.findall(r'\d+', answer_text)
-                     answer = numbers[-1] if numbers else "0"
-
-                     sample = {
-                         "question": item["question"],
-                         "answer": answer,
-                         "solution": answer_text,  # Keep full solution for reference
-                         "id": f"gsm8k_{i}"
-                     }
-                     data.append(sample)
-
-         except Exception as e:
-             logger.error(f"Error loading GSM8K data: {e}")
-             raise
-
-         logger.info(f"Loaded {len(data)} real GSM8K samples")
-         return data
-
-     def _load_placeholder_gsm8k_data(self, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
-         """Load placeholder GSM8K data."""
-         data = []
-
-         sample_problems = [
-             {
-                 "question": "Janet has 12 apples. She gives 3 apples to her friend and eats 2 apples. How many apples does Janet have left?",
-                 "answer": "7"
-             },
-             {
-                 "question": "A school has 24 students in each class. If there are 5 classes, how many students are there in total?",
-                 "answer": "120"
-             },
-             {
-                 "question": "Tom buys 4 books for $8 each. How much money does Tom spend in total?",
-                 "answer": "32"
-             },
-             {
-                 "question": "Sarah has 36 stickers. She wants to put them equally into 6 albums. How many stickers will be in each album?",
-                 "answer": "6"
-             },
-             {
-                 "question": "A rectangle has a length of 15 cm and a width of 8 cm. What is the area of the rectangle?",
-                 "answer": "120"
-             }
-         ]
-
-         for i, problem in enumerate(sample_problems):
-             if max_samples and i >= max_samples:
-                 break
-
-             sample = {
-                 "question": problem["question"],
-                 "answer": problem["answer"],
-                 "id": f"gsm8k_{i}"
-             }
-             data.append(sample)
-
-         logger.info(f"Loaded {len(data)} placeholder GSM8K samples")
-         return data
-
-     def evaluate_sample(self, sample: Dict[str, Any], prediction: str) -> bool:
-         """Evaluate a single GSM8K sample."""
-         # Extract numerical answer from prediction
-         prediction = prediction.strip()
-
-         # Try to find the numerical answer
-         import re
-         numbers = re.findall(r'\d+', prediction)
-
-         if numbers:
-             # Take the last number found (often the final answer)
-             predicted_answer = numbers[-1]
-             return predicted_answer == sample["answer"]
-
-         return False
-
-     def _format_single_example(self, sample: Dict[str, Any], include_answer: bool = False) -> str:
-         """Format a single GSM8K example."""
-         prompt = f"Problem: {sample['question']}\n"
-
-         if include_answer:
-             prompt += f"Answer: {sample['answer']}"
-         else:
-             prompt += "Answer:"
-
-         return prompt
-
-
- # Convenience functions for creating benchmark instances
- def create_mmlu_benchmark(subjects: Optional[List[str]] = None) -> MMLU:
-     """Create MMLU benchmark instance."""
-     return MMLU(subjects=subjects)
-
-
- def create_hellaswag_benchmark() -> HellaSwag:
-     """Create HellaSwag benchmark instance."""
-     return HellaSwag()
-
-
- def create_arc_benchmark(challenge_set: str = "easy") -> ARC:
-     """Create ARC benchmark instance."""
-     return ARC(challenge_set=challenge_set)
-
-
- def create_gsm8k_benchmark() -> GSM8K:
-     """Create GSM8K benchmark instance."""
-     return GSM8K()
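
For reference, a short sketch of how the removed benchmarks API above was typically driven. The factory, loader, prompt, and scoring methods are all defined in the deleted file; my_model_generate is a hypothetical stand-in for whatever LLM callable the caller supplies, and the import path no longer exists as of 0.4.3.

    # Sketch: driving the removed isa_model.eval.benchmarks API (valid against
    # the deleted code above; `my_model_generate` is hypothetical).
    from isa_model.eval.benchmarks import create_mmlu_benchmark  # removed in 0.4.3

    def my_model_generate(prompt: str) -> str:
        return "A"  # stand-in: a real model would return its chosen letter

    benchmark = create_mmlu_benchmark(subjects=["astronomy", "virology"])
    samples = benchmark.load_data(max_samples=20)  # placeholder data if download fails
    few_shot = samples[:benchmark.config.few_shot_examples]
    test_set = samples[len(few_shot):]

    correct = sum(
        benchmark.evaluate_sample(s, my_model_generate(benchmark.format_prompt(s, few_shot)))
        for s in test_set
    )
    print(f"MMLU accuracy: {correct}/{len(test_set)}")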