themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,244 @@
1
+ """Configuration-related commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated, Literal
7
+
8
+ from cyclopts import Parameter
9
+
10
+ from themis.cli.utils import effective_total, export_outputs
11
+ from themis.config import (
12
+ load_dataset_from_config,
13
+ load_experiment_config,
14
+ run_experiment_from_config,
15
+ summarize_report_for_config,
16
+ )
17
+ from themis.utils.logging_utils import configure_logging
18
+ from themis.utils.progress import ProgressReporter
19
+
20
+
21
def run_configured_experiment(
    *,
    config: Annotated[
        Path, Parameter(help="Path to a Hydra/OmegaConf experiment config file")
    ],
    overrides: Annotated[
        tuple[str, ...],
        Parameter(
            help="Optional Hydra-style overrides (e.g. generation.sampling.temperature=0.2)",
            show_default=False,
        ),
    ] = (),
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Execute an experiment described via config file."""
    configure_logging(log_level)

    # Resolve the config (plus any CLI overrides), then materialise its dataset.
    cfg = load_experiment_config(config, overrides)
    samples = load_dataset_from_config(cfg)

    # The progress bar total honours max_samples when it caps the dataset size.
    expected = effective_total(len(samples), cfg.max_samples)
    with ProgressReporter(total=expected, description="Generating") as reporter:
        evaluation_report = run_experiment_from_config(
            cfg,
            dataset=samples,
            on_result=reporter.on_result,
        )

    print(summarize_report_for_config(cfg, evaluation_report))
    export_outputs(
        evaluation_report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title=f"{cfg.name} experiment",
    )
    return 0
66
+
67
+
68
def _echo_dataset_section(cfg) -> None:
    # Print the dataset portion of an already-validated config.
    print("\nDataset:")
    print(f"  Source: {cfg.dataset.source}")
    print(f"  Split: {cfg.dataset.split}")
    if cfg.dataset.limit:
        print(f"  Limit: {cfg.dataset.limit}")
    if cfg.dataset.subjects:
        print(f"  Subjects: {', '.join(cfg.dataset.subjects)}")


def _echo_generation_section(cfg) -> None:
    # Print the generation portion of an already-validated config.
    print("\nGeneration:")
    print(f"  Model: {cfg.generation.model_identifier}")
    print(f"  Provider: {cfg.generation.provider.name}")
    print(f"  Temperature: {cfg.generation.sampling.temperature}")
    print(f"  Max tokens: {cfg.generation.sampling.max_tokens}")


def validate_config(
    *,
    config: Annotated[Path, Parameter(help="Path to config file to validate")],
) -> int:
    """Validate a configuration file without running the experiment."""
    if not config.exists():
        print(f"āŒ Error: Config file not found: {config}")
        return 1

    print(f"Validating config: {config}")
    print("-" * 60)

    # Everything below stays inside one try: loading may fail, and so may
    # attribute access on a partially-populated config object.
    try:
        cfg = load_experiment_config(config, overrides=())
        print("āœ“ Config file is valid")
        print(f"\nExperiment: {cfg.name}")
        print(f"Run ID: {cfg.run_id or '(auto-generated)'}")
        print(f"Resume: {cfg.resume}")
        print(f"Max samples: {cfg.max_samples or '(unlimited)'}")

        _echo_dataset_section(cfg)
        _echo_generation_section(cfg)

        if cfg.storage.path:
            print(f"\nStorage: {cfg.storage.path}")

        return 0
    except Exception as exc:
        print(f"āŒ Config validation failed: {exc}")
        return 1
110
+
111
+
112
def init_config(
    *,
    output: Annotated[Path, Parameter(help="Output path for config file")] = Path(
        "themis_config.yaml"
    ),
    template: Annotated[
        Literal["basic", "math500", "inline"],
        Parameter(help="Config template to generate"),
    ] = "basic",
) -> int:
    """Generate a sample configuration file for use with run-config.

    Writes one of three canned YAML templates to *output* and prints
    follow-up instructions. Refuses to overwrite an existing file.

    Returns:
        0 on success, 1 if the file already exists or cannot be written.
    """
    templates = {
        "basic": """name: my_experiment
task: math500
dataset:
  source: huggingface
  dataset_id: math500
  limit: 50
generation:
  model_identifier: fake-math-llm
  provider:
    name: fake
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
runner:
  max_parallel: 1
  max_retries: 3
storage:
  path: .cache/my_experiment
run_id: my-experiment-001
resume: true
""",
        "math500": """name: math500_evaluation
task: math500
dataset:
  source: huggingface
  dataset_id: math500
  limit: null  # No limit, run full dataset
  subjects:
    - algebra
    - geometry
generation:
  model_identifier: my-model
  provider:
    name: openai-compatible
    options:
      base_url: http://localhost:1234/v1
      api_key: not-needed
      model_name: qwen2.5-7b-instruct
      timeout: 60
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
runner:
  max_parallel: 4
  max_retries: 3
  retry_initial_delay: 0.5
  retry_backoff_multiplier: 2.0
  retry_max_delay: 2.0
storage:
  path: .cache/math500
run_id: math500-run-001
resume: true
max_samples: null
""",
        "inline": """name: inline_dataset_experiment
task: math500
dataset:
  source: inline
  inline_samples:
    - unique_id: sample-1
      problem: "What is 2 + 2?"
      answer: "4"
      subject: arithmetic
      level: 1
    - unique_id: sample-2
      problem: "Solve for x: 2x + 5 = 13"
      answer: "4"
      subject: algebra
      level: 2
generation:
  model_identifier: fake-math-llm
  provider:
    name: fake
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
storage:
  path: .cache/inline_experiment
run_id: inline-001
resume: true
""",
    }

    if output.exists():
        print(f"āŒ Error: File already exists: {output}")
        print("   Use a different --output path or delete the existing file")
        return 1

    config_content = templates[template]

    try:
        output.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: the original used open(output, "w") with the
        # platform's locale default, making the written bytes host-dependent.
        output.write_text(config_content, encoding="utf-8")

        print(f"āœ“ Created config file: {output}")
        print(f"  Template: {template}")
        print("\nšŸ“ Next steps:")
        print(f"  1. Edit {output} to customize settings")
        print(
            f"  2. Validate: uv run python -m themis.cli validate-config --config {output}"
        )
        print(f"  3. Run: uv run python -m themis.cli run-config --config {output}")

        if template == "math500":
            print("\nāš ļø Remember to:")
            print("  • Update provider.options.base_url with your LLM server endpoint")
            print("  • Update provider.options.model_name with your actual model")
            print("  • Set provider.options.api_key if required by your server")
        elif template == "inline":
            print("\nšŸ’” Tip:")
            print("  • Add more samples to dataset.inline_samples list")
            print("  • Each sample needs: unique_id, problem, answer")

        return 0
    except Exception as e:
        print(f"āŒ Error creating config file: {e}")
        return 1
@@ -0,0 +1,214 @@
1
+ """Cost estimation and tracking commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Annotated
6
+
7
+ from cyclopts import Parameter
8
+
9
+ from themis.experiment.cost import estimate_experiment_cost
10
+ from themis.experiment.pricing import (
11
+ compare_provider_costs,
12
+ get_all_models,
13
+ get_provider_pricing,
14
+ )
15
+
16
+
17
def estimate_cost_command(
    *,
    model: Annotated[
        str, Parameter(help="Model identifier (e.g., gpt-4, claude-3-5-sonnet)")
    ],
    dataset_size: Annotated[int, Parameter(help="Number of samples in dataset")],
    avg_prompt_tokens: Annotated[
        int, Parameter(help="Average prompt tokens per sample")
    ] = 500,
    avg_completion_tokens: Annotated[
        int, Parameter(help="Average completion tokens per sample")
    ] = 300,
) -> int:
    """Estimate cost for an experiment before running.

    Examples:
        # Estimate cost for 100 samples with GPT-4
        uv run python -m themis.cli estimate-cost \\
            --model gpt-4 \\
            --dataset-size 100

        # Custom token estimates
        uv run python -m themis.cli estimate-cost \\
            --model claude-3-5-sonnet-20241022 \\
            --dataset-size 1000 \\
            --avg-prompt-tokens 800 \\
            --avg-completion-tokens 400
    """
    # The whole report stays inside one try: both the estimate call and the
    # attribute/key accesses while printing may raise.
    try:
        projection = estimate_experiment_cost(
            model=model,
            dataset_size=dataset_size,
            avg_prompt_tokens=avg_prompt_tokens,
            avg_completion_tokens=avg_completion_tokens,
        )

        banner = "=" * 80
        print(banner)
        print("Cost Estimate")
        print(banner)
        print(f"\nModel: {model}")
        print(f"Dataset size: {dataset_size} samples")
        print(
            f"Avg tokens per sample: {avg_prompt_tokens} prompt + {avg_completion_tokens} completion"
        )

        print("\nšŸ’° Estimated Cost")
        print(f"  Total: ${projection.estimated_cost:.4f}")
        print(f"  Per sample: ${projection.assumptions['cost_per_sample']:.6f}")
        print(f"  95% CI: ${projection.lower_bound:.4f} - ${projection.upper_bound:.4f}")

        print("\nšŸ“Š Breakdown")
        for phase_name, phase_cost in projection.breakdown_by_phase.items():
            print(f"  {phase_name.capitalize()}: ${phase_cost:.4f}")

        print("\n" + banner)

        # Nudge users toward a limited trial run when the bill looks steep.
        if projection.estimated_cost > 10.0:
            print(
                f"\nāš ļø Warning: Estimated cost is ${projection.estimated_cost:.2f}. "
                "Consider using --limit for initial testing."
            )

        return 0

    except Exception as exc:
        print(f"Error estimating cost: {exc}")
        return 1
85
+
86
+
87
def show_pricing_command(
    *,
    model: Annotated[
        str | None, Parameter(help="Show pricing for specific model")
    ] = None,
    list_all: Annotated[bool, Parameter(help="List all available models")] = False,
    compare_models: Annotated[
        list[str] | None, Parameter(help="Compare costs for multiple models")
    ] = None,
) -> int:
    """Show pricing information for LLM models.

    One of ``--model``, ``--list-all`` or ``--compare-models`` must be given;
    they are checked in the order: list-all, compare, single model.

    Examples:
        # Show pricing for a specific model
        uv run python -m themis.cli show-pricing --model gpt-4

        # List all models with pricing
        uv run python -m themis.cli show-pricing --list-all

        # Compare pricing across models (use repeated --compare-models flags)
        uv run python -m themis.cli show-pricing \\
            --compare-models gpt-4 \\
            --compare-models gpt-3.5-turbo \\
            --compare-models claude-3-haiku-20240307

    Returns:
        0 on success, 1 on missing options or lookup failure.
    """
    try:
        # Hoisted from the per-example loop below: the original re-executed
        # this import statement on every loop iteration. Kept inside the try
        # so an import failure is still reported as "Error: ..." like before.
        from themis.experiment.pricing import calculate_cost

        if list_all:
            models = get_all_models()
            print("=" * 80)
            print(f"Available Models ({len(models)} total)")
            print("=" * 80)
            print("\nModel pricing (per 1M tokens):\n")

            for model_name in sorted(models):
                pricing = get_provider_pricing(model_name)
                # Registry prices are per token; scale to per-1M for display.
                prompt_price = pricing["prompt_tokens"] * 1_000_000
                completion_price = pricing["completion_tokens"] * 1_000_000
                print(
                    f"  {model_name:40s} | "
                    f"Prompt: ${prompt_price:6.2f} | "
                    f"Completion: ${completion_price:6.2f}"
                )

            print("\n" + "=" * 80)
            return 0

        if compare_models:
            # Compare costs for a standard workload
            prompt_tokens = 1000
            completion_tokens = 500

            costs = compare_provider_costs(
                prompt_tokens, completion_tokens, compare_models
            )

            print("=" * 80)
            print(
                f"Cost Comparison ({prompt_tokens} prompt + {completion_tokens} completion tokens)"
            )
            print("=" * 80)
            print()

            # Sort by cost, cheapest first
            sorted_costs = sorted(costs.items(), key=lambda x: x[1])

            for model_name, cost in sorted_costs:
                # Also show cost per 1M tokens for comparison
                pricing = get_provider_pricing(model_name)
                prompt_price = pricing["prompt_tokens"] * 1_000_000
                completion_price = pricing["completion_tokens"] * 1_000_000

                print(f"  {model_name:40s} | ${cost:.6f}")
                print(
                    f"  {'':40s} | (${prompt_price:.2f} / ${completion_price:.2f} per 1M)"
                )

            # Show each model's multiplier relative to the cheapest one
            if sorted_costs:
                cheapest_cost = sorted_costs[0][1]
                print(f"\nRelative costs (vs {sorted_costs[0][0]}):")
                for model_name, cost in sorted_costs[1:]:
                    multiplier = cost / cheapest_cost if cheapest_cost > 0 else 0
                    print(f"  {model_name:40s} | {multiplier:.1f}x more expensive")

            print("\n" + "=" * 80)
            return 0

        if model:
            pricing = get_provider_pricing(model)
            prompt_price = pricing["prompt_tokens"] * 1_000_000
            completion_price = pricing["completion_tokens"] * 1_000_000

            print("=" * 80)
            print(f"Pricing for {model}")
            print("=" * 80)
            print(f"\nPrompt tokens: ${prompt_price:.2f} per 1M tokens")
            print(f"Completion tokens: ${completion_price:.2f} per 1M tokens")

            # Worked examples at a few representative workload sizes
            print("\nExample costs:")
            examples = [
                (100, 50, "Short query"),
                (500, 300, "Medium query"),
                (1000, 500, "Long query"),
            ]

            for prompt_tok, completion_tok, label in examples:
                cost = calculate_cost(model, prompt_tok, completion_tok)
                print(
                    f"  {label:15s} ({prompt_tok:4d} + {completion_tok:4d} tokens): ${cost:.6f}"
                )

            print("\n" + "=" * 80)
            return 0

        # No options provided
        print("Error: Must specify --model, --list-all, or --compare-models")
        print("Use --help for usage information")
        return 1

    except Exception as e:
        print(f"Error: {e}")
        return 1
212
+
213
+
214
+ __all__ = ["estimate_cost_command", "show_pricing_command"]
@@ -0,0 +1,68 @@
1
+ """Demo command implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ from cyclopts import Parameter
9
+
10
+ from themis.cli.utils import effective_total, export_outputs
11
+ from themis.experiment import math as math_experiment
12
+ from themis.utils.logging_utils import configure_logging
13
+ from themis.utils.progress import ProgressReporter
14
+
15
+
16
def demo_command(
    *,
    max_samples: Annotated[
        int | None, Parameter(help="Limit number of demo samples")
    ] = None,
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Run the built-in demo dataset."""
    configure_logging(log_level)

    # Two tiny hard-coded MATH-style samples, so the demo needs no downloads.
    demo_samples = [
        {
            "unique_id": "demo-1",
            "problem": "Convert the point (0,3) in rectangular coordinates to polar coordinates.",
            "answer": "\\left( 3, \\frac{\\pi}{2} \\right)",
            "subject": "precalculus",
            "level": 2,
        },
        {
            "unique_id": "demo-2",
            "problem": "What is 7 + 5?",
            "answer": "12",
            "subject": "arithmetic",
            "level": 1,
        },
    ]

    experiment = math_experiment.build_math500_zero_shot_experiment()
    expected = effective_total(len(demo_samples), max_samples)
    with ProgressReporter(total=expected, description="Generating") as reporter:
        demo_report = experiment.run(
            demo_samples,
            max_samples=max_samples,
            on_result=reporter.on_result,
        )

    print(math_experiment.summarize_report(demo_report))
    export_outputs(
        demo_report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title="Demo experiment",
    )
    return 0
@@ -0,0 +1,90 @@
1
+ """System information and listing commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ from cyclopts import Parameter
10
+
11
+ from themis.providers.registry import _REGISTRY
12
+
13
+
14
def show_info() -> int:
    """Show system information and installed components."""
    import themis
    from themis import _version

    print("Themis Information")
    print("=" * 60)
    print(f"Version: {getattr(_version, '__version__', 'unknown')}")
    print(f"Python: {sys.version.split()[0]}")
    print(f"Platform: {sys.platform}")

    # Provider names come straight from the registry's factory table.
    print("\nšŸ“¦ Installed Providers:")
    for provider_name in sorted(_REGISTRY._factories.keys()):
        print(f"  āœ“ {provider_name}")

    print("\nšŸ“Š Available Benchmarks:")
    known_benchmarks = (
        "demo",
        "math500",
        "aime24",
        "aime25",
        "amc23",
        "olympiadbench",
        "beyondaime",
        "supergpqa",
        "mmlu-pro",
        "inline (via config)",
    )
    for benchmark_name in known_benchmarks:
        print(f"  āœ“ {benchmark_name}")

    # Examples ship next to the installed package, when present.
    print("\nšŸ“ Example Locations:")
    examples_dir = Path(themis.__file__).parent.parent / "examples"
    if examples_dir.exists():
        print(f"  {examples_dir}")
        visible_examples = sorted(
            entry.name
            for entry in examples_dir.iterdir()
            if entry.is_dir() and not entry.name.startswith("_")
        )
        for example_name in visible_examples:
            print(f"    • {example_name}/")

    print("\nšŸ“š Documentation:")
    print("  examples/README.md - Comprehensive tutorial cookbook")
    print("  COOKBOOK.md - Quick reference guide")
    print("  docs/ - Detailed documentation")

    print("\nšŸš€ Quick Start:")
    print("  uv run python -m themis.cli demo")
    print("  uv run python -m themis.cli list-providers")
    print("  uv run python -m themis.cli list-benchmarks")

    return 0
71
+
72
+
73
def new_project(
    *,
    project_name: Annotated[str, Parameter(help="The name of the new project")],
    project_path: Annotated[
        Path,
        Parameter(help="The path where the new project will be created"),
    ] = Path("."),
) -> int:
    """Create a new Themis project."""
    # Imported lazily so the command module stays cheap to load.
    from themis.cli.new_project import create_project

    try:
        create_project(project_name, project_path)
    except FileExistsError as exc:
        print(f"Error: {exc}")
        return 1
    print(f"Successfully created new project '{project_name}' in {project_path}")
    return 0