themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,260 @@
1
+ """Storage backend interface for custom storage implementations.
2
+
3
+ This module defines the abstract interface for storage backends, allowing
4
+ users to implement custom storage solutions (cloud storage, databases, etc.)
5
+ without modifying Themis core code.
6
+
7
+ Example implementations:
8
+ - S3Backend: Store results in AWS S3
9
+ - GCSBackend: Store results in Google Cloud Storage
10
+ - PostgresBackend: Store results in PostgreSQL
11
+ - RedisBackend: Use Redis for distributed caching
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from abc import ABC, abstractmethod
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List
19
+
20
+ from themis.core.entities import (
21
+ EvaluationRecord,
22
+ ExperimentReport,
23
+ GenerationRecord,
24
+ )
25
+
26
+
27
class StorageBackend(ABC):
    """Abstract interface for storage backends.

    Implement this interface to create custom storage solutions.
    All methods should be thread-safe if used with concurrent workers.

    Backends can be used as context managers; leaving the ``with`` block
    calls :meth:`close`, even when the block raises.

    Example:
        >>> class S3StorageBackend(StorageBackend):
        ...     def __init__(self, bucket: str):
        ...         self.bucket = bucket
        ...         self.s3_client = boto3.client('s3')
        ...
        ...     def save_run_metadata(
        ...         self, run_id: str, metadata: Dict[str, Any]
        ...     ) -> None:
        ...         key = f"runs/{run_id}/metadata.json"
        ...         self.s3_client.put_object(
        ...             Bucket=self.bucket,
        ...             Key=key,
        ...             Body=json.dumps(metadata),
        ...         )
        ...     # ... implement other methods
    """

    @abstractmethod
    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Save run metadata.

        Args:
            run_id: Unique identifier for the run
            metadata: Run metadata to save (as dictionary)
        """

    @abstractmethod
    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Load run metadata.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Run metadata as dictionary

        Raises:
            FileNotFoundError: If run metadata doesn't exist
        """

    @abstractmethod
    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Save a generation record.

        Args:
            run_id: Unique identifier for the run
            record: Generation record to save

        Note:
            This method should be atomic and thread-safe.
        """

    @abstractmethod
    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Load all generation records for a run.

        Args:
            run_id: Unique identifier for the run

        Returns:
            List of generation records
        """

    @abstractmethod
    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Save an evaluation record.

        Args:
            run_id: Unique identifier for the run
            record: Evaluation record to save

        Note:
            This method should be atomic and thread-safe.
        """

    @abstractmethod
    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Load all evaluation records for a run.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Dictionary mapping cache_key to EvaluationRecord
        """

    @abstractmethod
    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Save experiment report.

        Args:
            run_id: Unique identifier for the run
            report: Experiment report to save
        """

    @abstractmethod
    def load_report(self, run_id: str) -> ExperimentReport:
        """Load experiment report.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Experiment report

        Raises:
            FileNotFoundError: If report doesn't exist
        """

    @abstractmethod
    def list_runs(self) -> List[str]:
        """List all run IDs in storage.

        Returns:
            List of run IDs
        """

    @abstractmethod
    def run_exists(self, run_id: str) -> bool:
        """Check if a run exists in storage.

        Args:
            run_id: Unique identifier for the run

        Returns:
            True if run exists, False otherwise
        """

    @abstractmethod
    def delete_run(self, run_id: str) -> None:
        """Delete all data for a run.

        Args:
            run_id: Unique identifier for the run
        """

    def close(self) -> None:
        """Close the storage backend and release resources.

        Optional method for cleanup. Called when storage is no longer needed.
        The default implementation does nothing.
        """

    def __enter__(self) -> "StorageBackend":
        """Return the backend itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        """Call :meth:`close` on exit.

        Returns ``None`` so that an exception raised inside the ``with``
        block is never suppressed.
        """
        self.close()
185
+
186
+
187
class LocalFileStorageBackend(StorageBackend):
    """StorageBackend adapter around the file-based ExperimentStorage.

    Every operation is delegated to a wrapped ``ExperimentStorage`` instance,
    so the existing storage logic is preserved while callers program against
    the ``StorageBackend`` interface.

    Note:
        This is a compatibility layer. ``load_run_metadata`` and
        ``delete_run`` are not supported and raise ``NotImplementedError``.
    """

    def __init__(self, storage_path: str | Path):
        """Create the wrapped ExperimentStorage.

        Args:
            storage_path: Path to storage directory
        """
        # Imported here rather than at module level, as in the original code.
        from themis.experiment.storage import ExperimentStorage

        self._storage = ExperimentStorage(storage_path)

    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Start a run in the wrapped storage.

        Only the ``"experiment_id"`` entry of ``metadata`` is read (falling
        back to ``"default"``); no other key is forwarded to the storage.

        Args:
            run_id: Unique identifier for the run
            metadata: Run metadata dictionary
        """
        self._storage.start_run(
            run_id,
            experiment_id=metadata.get("experiment_id", "default"),
        )

    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Not supported by this adapter.

        Raises:
            NotImplementedError: Always; the adapter has no storage call
                for reading run metadata back.
        """
        raise NotImplementedError("Use ExperimentStorage directly for now")

    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Append one generation record to the run."""
        self._storage.append_record(run_id, record)

    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Return the run's cached generation records as a list."""
        records_by_key = self._storage.load_cached_records(run_id)
        return list(records_by_key.values())

    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Append one evaluation record to the run."""
        self._storage.append_evaluation(run_id, record)

    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Return the run's cached evaluations as provided by the storage."""
        return self._storage.load_cached_evaluations(run_id)

    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Persist the experiment report for the run."""
        self._storage.save_report(run_id, report)

    def load_report(self, run_id: str) -> ExperimentReport:
        """Return the stored experiment report for the run."""
        return self._storage.load_report(run_id)

    def list_runs(self) -> List[str]:
        """Return the run IDs reported by the wrapped storage."""
        return self._storage.list_runs()

    def run_exists(self, run_id: str) -> bool:
        """Return True when ``run_id`` is among the storage's listed runs."""
        known_runs = self._storage.list_runs()
        return run_id in known_runs

    def delete_run(self, run_id: str) -> None:
        """Not supported by this adapter.

        Raises:
            NotImplementedError: Always; the adapter has no storage call
                for deleting a run.
        """
        raise NotImplementedError("Delete not implemented in current storage")
255
+
256
+
257
+ __all__ = [
258
+ "StorageBackend",
259
+ "LocalFileStorageBackend",
260
+ ]
themis/cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Command-line helpers for running Themis experiments."""
2
+
3
+ from . import main
4
+
5
+ __all__ = ["main"]
themis/cli/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Entry point for running themis.cli as a module."""
2
+
3
+ from .main import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
@@ -0,0 +1,19 @@
1
+ """CLI command modules."""
2
+
3
+ from themis.cli.commands import (
4
+ benchmarks,
5
+ config_commands,
6
+ demo,
7
+ info,
8
+ math_benchmarks,
9
+ mcq_benchmarks,
10
+ )
11
+
12
+ __all__ = [
13
+ "benchmarks",
14
+ "config_commands",
15
+ "demo",
16
+ "info",
17
+ "math_benchmarks",
18
+ "mcq_benchmarks",
19
+ ]
@@ -0,0 +1,221 @@
1
+ """Benchmark listing commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Annotated
6
+
7
+ from cyclopts import Parameter
8
+
9
+ from themis.providers.registry import _REGISTRY
10
+
11
+
12
def list_providers(
    *,
    verbose: Annotated[
        bool, Parameter(help="Show detailed provider information")
    ] = False,
) -> int:
    """List available LLM providers.

    Prints one line per registered provider name, sorted alphabetically.
    Providers with a built-in description are marked with a check mark,
    all others with a middle dot.

    Args:
        verbose: When true, also print the description of each provider
            that has one; otherwise print a hint about ``--verbose``.

    Returns:
        Always 0 (process exit code).
    """
    # NOTE: reads the registry's private factory mapping directly.
    providers = sorted(_REGISTRY._factories.keys())

    if not providers:
        print("No providers registered.")
        return 0

    print("Available Providers:")
    print("=" * 60)

    provider_info = {
        "fake": "Built-in fake provider for testing (no API required)",
        "openai-compatible": "OpenAI-compatible API (LM Studio, Ollama, vLLM, OpenAI)",
        "vllm": "vLLM server provider for local model hosting",
    }

    # Written as escapes so the markers cannot be corrupted by a wrong
    # source encoding: U+2713 CHECK MARK and U+00B7 MIDDLE DOT.
    known_marker = "\u2713"
    unknown_marker = "\u00b7"

    for provider in providers:
        description = provider_info.get(provider)
        status = known_marker if description is not None else unknown_marker
        print(f"{status} {provider}")
        if verbose and description is not None:
            print(f" {description}")

    if not verbose:
        print("\nUse --verbose for more details")

    return 0
44
+
45
+
46
def list_benchmarks(
    *,
    verbose: Annotated[
        bool, Parameter(help="Show detailed benchmark information")
    ] = False,
) -> int:
    """List available datasets and benchmarks.

    Prints the name and description of every known benchmark. The catalog
    is hard-coded in this function.

    Args:
        verbose: When true, also print each benchmark's source, subjects and
            example command; otherwise print a hint about ``--verbose``.

    Returns:
        Always 0 (process exit code).
    """
    # (name, description, subjects) for the benchmarks that share the same
    # source text and the "themis.cli <name>" command pattern.
    hub_benchmarks = [
        (
            "math500",
            "MATH-500 dataset for mathematical reasoning",
            [
                "algebra",
                "counting_and_probability",
                "geometry",
                "intermediate_algebra",
                "number_theory",
                "prealgebra",
                "precalculus",
            ],
        ),
        ("gsm8k", "GSM8K dataset for grade school math word problems", "math"),
        ("gpqa", "GPQA dataset for graduate-level science questions", "science"),
        ("gsm-symbolic", "GSM-Symbolic dataset for symbolic math reasoning", "math"),
        ("medmcqa", "MedMCQA dataset for medical entrance exams", "medicine"),
        ("med_qa", "MedQA dataset for medical question answering", "medicine"),
        ("sciq", "SciQ dataset for science questions", "science"),
        (
            "commonsense_qa",
            "CommonsenseQA dataset for commonsense reasoning",
            "commonsense",
        ),
        ("piqa", "PIQA dataset for physical commonsense reasoning", "commonsense"),
        (
            "social_i_qa",
            "Social IQA dataset for social commonsense reasoning",
            "commonsense",
        ),
        (
            "coqa",
            "CoQA dataset for conversational question answering",
            "conversational",
        ),
        (
            "supergpqa",
            "Graduate-level QA benchmark with multiple-choice questions",
            "category filter via --subjects",
        ),
        (
            "mmlu-pro",
            "Professional-level MMLU benchmark with refined distractors",
            "subject filter via --subjects",
        ),
        ("aime24", "AIME 2024 competition problems", "problem set"),
        ("aime25", "AIME 2025 competition problems", "problem set"),
        ("amc23", "AMC 2023 competition problems", "problem set"),
        (
            "olympiadbench",
            "Mixed Olympiad-style math benchmark",
            "competition metadata",
        ),
        ("beyondaime", "BeyondAIME advanced math competition set", "problem set"),
    ]

    benchmarks = [
        {
            "name": name,
            "description": description,
            "source": "huggingface (default) or local",
            "subjects": subjects,
            "command": f"uv run python -m themis.cli {name}",
        }
        for name, description, subjects in hub_benchmarks
    ]
    # The last two entries have their own source and/or command.
    benchmarks.append(
        {
            "name": "demo",
            "description": "Built-in demo with 2 math problems",
            "source": "inline",
            "subjects": ["precalculus", "arithmetic"],
            "command": "uv run python -m themis.cli demo",
        }
    )
    benchmarks.append(
        {
            "name": "inline",
            "description": "Custom inline dataset (via config file)",
            "source": "config file",
            "subjects": "user-defined",
            "command": "uv run python -m themis.cli run-config --config your_config.yaml",
        }
    )

    print("Available Datasets & Benchmarks:")
    print("=" * 60)

    # Written as an escape so the marker cannot be corrupted by a wrong
    # source encoding: U+1F4CA BAR CHART.
    heading_marker = "\U0001F4CA"

    for bench in benchmarks:
        print(f"\n{heading_marker} {bench['name']}")
        print(f" {bench['description']}")
        if verbose:
            subjects = bench["subjects"]
            # Subjects are stored either as a list of names or as free text.
            if isinstance(subjects, list):
                subjects = ", ".join(subjects)
            print(f" Source: {bench['source']}")
            print(f" Subjects: {subjects}")
            print(f" Command: {bench['command']}")

    if not verbose:
        print("\nUse --verbose for more details and example commands")

    return 0