themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/cli/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Command-line helpers for running Themis experiments."""
+
+ from . import main
+
+ __all__ = ["main"]
themis/cli/__main__.py ADDED
@@ -0,0 +1,6 @@
+ """Entry point for running themis.cli as a module."""
+
+ from .main import main
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
themis/cli/commands/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """CLI command modules."""
+
+ from themis.cli.commands import (
+     benchmarks,
+     config_commands,
+     demo,
+     info,
+     math_benchmarks,
+     mcq_benchmarks,
+ )
+
+ __all__ = [
+     "benchmarks",
+     "config_commands",
+     "demo",
+     "info",
+     "math_benchmarks",
+     "mcq_benchmarks",
+ ]
themis/cli/commands/benchmarks.py ADDED
@@ -0,0 +1,221 @@
+ """Benchmark listing commands."""
+
+ from __future__ import annotations
+
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.providers.registry import _REGISTRY
+
+
+ def list_providers(
+     *,
+     verbose: Annotated[
+         bool, Parameter(help="Show detailed provider information")
+     ] = False,
+ ) -> int:
+     """List available LLM providers."""
+     providers = sorted(_REGISTRY._factories.keys())
+
+     if not providers:
+         print("No providers registered.")
+         return 0
+
+     print("Available Providers:")
+     print("=" * 60)
+
+     provider_info = {
+         "fake": "Built-in fake provider for testing (no API required)",
+         "openai-compatible": "OpenAI-compatible API (LM Studio, Ollama, vLLM, OpenAI)",
+         "vllm": "vLLM server provider for local model hosting",
+     }
+
+     for provider in providers:
+         status = "✓" if provider in provider_info else "·"
+         print(f"{status} {provider}")
+         if verbose and provider in provider_info:
+             print(f" {provider_info[provider]}")
+
+     if not verbose:
+         print("\nUse --verbose for more details")
+
+     return 0
+
+
+ def list_benchmarks(
+     *,
+     verbose: Annotated[
+         bool, Parameter(help="Show detailed benchmark information")
+     ] = False,
+ ) -> int:
+     """List available datasets and benchmarks."""
+     benchmarks = [
+         {
+             "name": "math500",
+             "description": "MATH-500 dataset for mathematical reasoning",
+             "source": "huggingface (default) or local",
+             "subjects": [
+                 "algebra",
+                 "counting_and_probability",
+                 "geometry",
+                 "intermediate_algebra",
+                 "number_theory",
+                 "prealgebra",
+                 "precalculus",
+             ],
+             "command": "uv run python -m themis.cli math500",
+         },
+         {
+             "name": "gsm8k",
+             "description": "GSM8K dataset for grade school math word problems",
+             "source": "huggingface (default) or local",
+             "subjects": "math",
+             "command": "uv run python -m themis.cli gsm8k",
+         },
+         {
+             "name": "gpqa",
+             "description": "GPQA dataset for graduate-level science questions",
+             "source": "huggingface (default) or local",
+             "subjects": "science",
+             "command": "uv run python -m themis.cli gpqa",
+         },
+         {
+             "name": "gsm-symbolic",
+             "description": "GSM-Symbolic dataset for symbolic math reasoning",
+             "source": "huggingface (default) or local",
+             "subjects": "math",
+             "command": "uv run python -m themis.cli gsm-symbolic",
+         },
+         {
+             "name": "medmcqa",
+             "description": "MedMCQA dataset for medical entrance exams",
+             "source": "huggingface (default) or local",
+             "subjects": "medicine",
+             "command": "uv run python -m themis.cli medmcqa",
+         },
+         {
+             "name": "med_qa",
+             "description": "MedQA dataset for medical question answering",
+             "source": "huggingface (default) or local",
+             "subjects": "medicine",
+             "command": "uv run python -m themis.cli med_qa",
+         },
+         {
+             "name": "sciq",
+             "description": "SciQ dataset for science questions",
+             "source": "huggingface (default) or local",
+             "subjects": "science",
+             "command": "uv run python -m themis.cli sciq",
+         },
+         {
+             "name": "commonsense_qa",
+             "description": "CommonsenseQA dataset for commonsense reasoning",
+             "source": "huggingface (default) or local",
+             "subjects": "commonsense",
+             "command": "uv run python -m themis.cli commonsense_qa",
+         },
+         {
+             "name": "piqa",
+             "description": "PIQA dataset for physical commonsense reasoning",
+             "source": "huggingface (default) or local",
+             "subjects": "commonsense",
+             "command": "uv run python -m themis.cli piqa",
+         },
+         {
+             "name": "social_i_qa",
+             "description": "Social IQA dataset for social commonsense reasoning",
+             "source": "huggingface (default) or local",
+             "subjects": "commonsense",
+             "command": "uv run python -m themis.cli social_i_qa",
+         },
+         {
+             "name": "coqa",
+             "description": "CoQA dataset for conversational question answering",
+             "source": "huggingface (default) or local",
+             "subjects": "conversational",
+             "command": "uv run python -m themis.cli coqa",
+         },
+         {
+             "name": "supergpqa",
+             "description": "Graduate-level QA benchmark with multiple-choice questions",
+             "source": "huggingface (default) or local",
+             "subjects": "category filter via --subjects",
+             "command": "uv run python -m themis.cli supergpqa",
+         },
+         {
+             "name": "mmlu-pro",
+             "description": "Professional-level MMLU benchmark with refined distractors",
+             "source": "huggingface (default) or local",
+             "subjects": "subject filter via --subjects",
+             "command": "uv run python -m themis.cli mmlu-pro",
+         },
+         {
+             "name": "aime24",
+             "description": "AIME 2024 competition problems",
+             "source": "huggingface (default) or local",
+             "subjects": "problem set",
+             "command": "uv run python -m themis.cli aime24",
+         },
+         {
+             "name": "aime25",
+             "description": "AIME 2025 competition problems",
+             "source": "huggingface (default) or local",
+             "subjects": "problem set",
+             "command": "uv run python -m themis.cli aime25",
+         },
+         {
+             "name": "amc23",
+             "description": "AMC 2023 competition problems",
+             "source": "huggingface (default) or local",
+             "subjects": "problem set",
+             "command": "uv run python -m themis.cli amc23",
+         },
+         {
+             "name": "olympiadbench",
+             "description": "Mixed Olympiad-style math benchmark",
+             "source": "huggingface (default) or local",
+             "subjects": "competition metadata",
+             "command": "uv run python -m themis.cli olympiadbench",
+         },
+         {
+             "name": "beyondaime",
+             "description": "BeyondAIME advanced math competition set",
+             "source": "huggingface (default) or local",
+             "subjects": "problem set",
+             "command": "uv run python -m themis.cli beyondaime",
+         },
+         {
+             "name": "demo",
+             "description": "Built-in demo with 2 math problems",
+             "source": "inline",
+             "subjects": ["precalculus", "arithmetic"],
+             "command": "uv run python -m themis.cli demo",
+         },
+         {
+             "name": "inline",
+             "description": "Custom inline dataset (via config file)",
+             "source": "config file",
+             "subjects": "user-defined",
+             "command": "uv run python -m themis.cli run-config --config your_config.yaml",
+         },
+     ]
+
+     print("Available Datasets & Benchmarks:")
+     print("=" * 60)
+
+     for bench in benchmarks:
+         print(f"\n📊 {bench['name']}")
+         print(f" {bench['description']}")
+         if verbose:
+             print(f" Source: {bench['source']}")
+             if isinstance(bench["subjects"], list):
+                 print(f" Subjects: {', '.join(bench['subjects'])}")
+             else:
+                 print(f" Subjects: {bench['subjects']}")
+             print(f" Command: {bench['command']}")
+
+     if not verbose:
+         print("\nUse --verbose for more details and example commands")
+
+     return 0
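Note: `list_providers` and `list_benchmarks` are plain functions that return a process exit code, so they can also be exercised outside the CLI. A minimal sketch (not part of the diff; it assumes the wheel is installed and uses only the signatures shown above):

from themis.cli.commands.benchmarks import list_benchmarks, list_providers

# Both helpers print to stdout and return 0 on success.
exit_code = list_providers(verbose=True)   # registered providers from the provider registry
exit_code = list_benchmarks(verbose=True)  # datasets with source, subjects, and example command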
themis/cli/commands/comparison.py ADDED
@@ -0,0 +1,394 @@
+ """Multi-experiment comparison commands."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.experiment.comparison import compare_experiments, diff_configs
+
+
+ def compare_command(
+     *,
+     run_ids: Annotated[
+         list[str],
+         Parameter(
+             help="Run IDs to compare (comma-separated or multiple --run-ids)",
+         ),
+     ],
+     storage: Annotated[
+         Path,
+         Parameter(
+             help="Storage directory containing experiment results",
+         ),
+     ] = Path(".cache/runs"),
+     metrics: Annotated[
+         list[str] | None,
+         Parameter(
+             help="Metrics to compare (default: all available)",
+         ),
+     ] = None,
+     output: Annotated[
+         Path | None,
+         Parameter(
+             help="Output file path (format inferred from extension: .csv, .md, .json)",
+         ),
+     ] = None,
+     format: Annotated[
+         str,
+         Parameter(
+             help="Output format: csv, markdown, json, latex",
+         ),
+     ] = "markdown",
+     highlight_best: Annotated[
+         str | None,
+         Parameter(
+             help="Metric to highlight best performer (e.g., 'accuracy')",
+         ),
+     ] = None,
+ ) -> int:
+     """Compare multiple experiment runs.
+
+     Automatically includes cost data when available. Costs are tracked
+     automatically during experiment runs and displayed in comparisons.
+
+     Examples:
+         # Compare three runs with default metrics (includes cost if tracked)
+         uv run python -m themis.cli compare \\
+             --run-ids run-1 run-2 run-3 \\
+             --storage .cache/runs
+
+         # Compare with specific metrics, export to CSV
+         uv run python -m themis.cli compare \\
+             --run-ids run-1 run-2 run-3 \\
+             --metrics accuracy \\
+             --output comparison.csv
+
+         # Use 'cost' as a metric for ranking and Pareto analysis
+         uv run python -m themis.cli pareto \\
+             --run-ids run-1 run-2 run-3 \\
+             --objectives accuracy cost \\
+             --maximize true false
+
+         # Highlight best accuracy performer
+         uv run python -m themis.cli compare \\
+             --run-ids run-1 run-2 run-3 \\
+             --highlight-best accuracy
+     """
+     try:
+         # Load and compare experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             metrics=metrics,
+             include_metadata=True,
+         )
+
+         print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+         print(f" Metrics: {', '.join(comparison.metrics)}\n")
+
+         # Display comparison table
+         print("=" * 80)
+         print("Experiment Comparison")
+         print("=" * 80)
+
+         # Check if any experiment has cost data
+         has_cost = any(exp.get_cost() is not None for exp in comparison.experiments)
+
+         # Header
+         header_cols = ["Run ID"] + comparison.metrics + ["Samples", "Failures"]
+         if has_cost:
+             header_cols.append("Cost ($)")
+         col_widths = [max(20, len(col)) for col in header_cols]
+
+         header = " | ".join(
+             col.ljust(width) for col, width in zip(header_cols, col_widths)
+         )
+         print(header)
+         print("-" * len(header))
+
+         # Rows
+         for exp in comparison.experiments:
+             row_values = [exp.run_id[:20]]  # Truncate run ID
+             for metric in comparison.metrics:
+                 val = exp.get_metric(metric)
+                 row_values.append(f"{val:.4f}" if val is not None else "N/A")
+             row_values.append(str(exp.sample_count))
+             row_values.append(str(exp.failure_count))
+
+             # Add cost if available
+             if has_cost:
+                 cost = exp.get_cost()
+                 row_values.append(f"{cost:.4f}" if cost is not None else "N/A")
+
+             row = " | ".join(
+                 val.ljust(width) for val, width in zip(row_values, col_widths)
+             )
+             print(row)
+
+         print("=" * 80)
+
+         # Highlight best if requested
+         if highlight_best:
+             if highlight_best in comparison.metrics:
+                 best = comparison.highlight_best(highlight_best)
+                 if best:
+                     best_value = best.get_metric(highlight_best)
+                     print(
+                         f"\n⭐ Best {highlight_best}: {best.run_id} ({best_value:.4f})"
+                     )
+                 else:
+                     print(f"\n⚠️ No valid values for metric '{highlight_best}'")
+             else:
+                 print(
+                     f"\n⚠️ Metric '{highlight_best}' not found. Available: {comparison.metrics}"
+                 )
+
+         # Export if requested
+         if output:
+             output = Path(output)
+             # Infer format from extension if not specified
+             if output.suffix == ".csv":
+                 comparison.to_csv(output)
+                 print(f"\n✓ Exported to {output} (CSV)")
+             elif output.suffix == ".md":
+                 comparison.to_markdown(output)
+                 print(f"\n✓ Exported to {output} (Markdown)")
+             elif output.suffix == ".json":
+                 import json
+
+                 output.write_text(
+                     json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                 )
+                 print(f"\n✓ Exported to {output} (JSON)")
+             elif output.suffix == ".tex":
+                 comparison.to_latex(output, style="booktabs")
+                 print(f"\n✓ Exported to {output} (LaTeX)")
+             else:
+                 # Use specified format
+                 if format == "csv":
+                     comparison.to_csv(output)
+                     print(f"\n✓ Exported to {output} (CSV)")
+                 elif format == "markdown":
+                     comparison.to_markdown(output)
+                     print(f"\n✓ Exported to {output} (Markdown)")
+                 elif format == "json":
+                     import json
+
+                     output.write_text(
+                         json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                     )
+                     print(f"\n✓ Exported to {output} (JSON)")
+                 elif format == "latex":
+                     comparison.to_latex(output, style="booktabs")
+                     print(f"\n✓ Exported to {output} (LaTeX)")
+                 else:
+                     print(f"\n⚠️ Unknown format: {format}")
+                     print("Available formats: csv, markdown, json, latex")
+                     return 1
+
+         return 0
+
+     except ValueError as e:
+         print(f"Error: {e}")
+         return 1
+     except FileNotFoundError as e:
+         print(f"Error: {e}")
+         return 1
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ def diff_command(
+     *,
+     run_id_a: Annotated[
+         str,
+         Parameter(
+             help="First run ID",
+         ),
+     ],
+     run_id_b: Annotated[
+         str,
+         Parameter(
+             help="Second run ID",
+         ),
+     ],
+     storage: Annotated[
+         Path,
+         Parameter(
+             help="Storage directory containing experiment results",
+         ),
+     ] = Path(".cache/runs"),
+ ) -> int:
+     """Show configuration differences between two experiment runs.
+
+     Examples:
+         # Compare configurations
+         uv run python -m themis.cli diff \\
+             --run-id-a run-1 \\
+             --run-id-b run-2 \\
+             --storage .cache/runs
+     """
+     try:
+         diff = diff_configs(run_id_a, run_id_b, storage)
+
+         print("=" * 80)
+         print(f"Configuration Diff: {run_id_a} → {run_id_b}")
+         print("=" * 80)
+
+         if not diff.has_differences():
+             print("\n✓ No differences found - configurations are identical\n")
+             return 0
+
+         # Show changed fields
+         if diff.changed_fields:
+             print("\n📝 Changed Fields:")
+             for key, (old, new) in diff.changed_fields.items():
+                 print(f"\n {key}:")
+                 print(f" - {run_id_a}: {old}")
+                 print(f" + {run_id_b}: {new}")
+
+         # Show added fields
+         if diff.added_fields:
+             print("\n➕ Added Fields (in run_id_b):")
+             for key, value in diff.added_fields.items():
+                 print(f" {key}: {value}")
+
+         # Show removed fields
+         if diff.removed_fields:
+             print("\n➖ Removed Fields (from run_id_a):")
+             for key, value in diff.removed_fields.items():
+                 print(f" {key}: {value}")
+
+         print("\n" + "=" * 80)
+         return 0
+
+     except FileNotFoundError as e:
+         print(f"Error: {e}")
+         print("\nMake sure both run IDs exist and have config.json files.")
+         return 1
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ def pareto_command(
+     *,
+     run_ids: Annotated[
+         list[str],
+         Parameter(
+             help="Run IDs to analyze",
+         ),
+     ],
+     storage: Annotated[
+         Path,
+         Parameter(
+             help="Storage directory containing experiment results",
+         ),
+     ] = Path(".cache/runs"),
+     objectives: Annotated[
+         list[str],
+         Parameter(
+             help="Metrics to optimize (e.g., accuracy cost)",
+         ),
+     ],
+     maximize: Annotated[
+         list[bool] | None,
+         Parameter(
+             help="Whether to maximize each objective (true/false for each)",
+         ),
+     ] = None,
+ ) -> int:
+     """Find Pareto-optimal experiments across multiple objectives.
+
+     The Pareto frontier consists of experiments where no other experiment
+     is better on all objectives simultaneously.
+
+     Examples:
+         # Find experiments with best accuracy/cost tradeoff
+         # (maximize accuracy, minimize cost)
+         uv run python -m themis.cli pareto \\
+             --run-ids run-1 run-2 run-3 run-4 \\
+             --objectives accuracy cost \\
+             --maximize true false
+
+         # Find experiments with best accuracy/latency tradeoff
+         uv run python -m themis.cli pareto \\
+             --run-ids run-1 run-2 run-3 \\
+             --objectives accuracy latency \\
+             --maximize true false
+     """
+     try:
+         # Load experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             metrics=objectives,
+             include_metadata=True,
+         )
+
+         print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+         print(f" Objectives: {', '.join(objectives)}\n")
+
+         # Compute Pareto frontier
+         pareto_ids = comparison.pareto_frontier(objectives, maximize)
+
+         print("=" * 80)
+         print("Pareto Frontier Analysis")
+         print("=" * 80)
+
+         if not pareto_ids:
+             print(
+                 "\n⚠️ No Pareto-optimal experiments found (all experiments have missing values)\n"
+             )
+             return 0
+
+         print(f"\n⭐ Found {len(pareto_ids)} Pareto-optimal experiment(s):\n")
+
+         # Show Pareto-optimal experiments
+         for run_id in pareto_ids:
+             exp = next(e for e in comparison.experiments if e.run_id == run_id)
+             print(f" • {run_id}")
+             for obj in objectives:
+                 val = exp.get_metric(obj)
+                 print(
+                     f" {obj}: {val:.4f}"
+                     if val is not None
+                     else f" {obj}: N/A"
+                 )
+
+         # Show dominated experiments
+         dominated = [
+             exp for exp in comparison.experiments if exp.run_id not in pareto_ids
+         ]
+         if dominated:
+             print(f"\n📊 Dominated experiments ({len(dominated)}):")
+             for exp in dominated:
+                 print(f" • {exp.run_id}")
+
+         print("\n" + "=" * 80)
+         return 0
+
+     except ValueError as e:
+         print(f"Error: {e}")
+         return 1
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ __all__ = ["compare_command", "diff_command", "pareto_command"]
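For readers unfamiliar with the Pareto terminology in `pareto_command`'s docstring, here is a minimal, self-contained sketch of the dominance rule it describes. It is illustrative only: the `dominates` helper and the sample run values are hypothetical and do not use `comparison.pareto_frontier` or any other Themis internals.

def dominates(a, b, maximize):
    """True if run `a` is at least as good as `b` on every objective and
    strictly better on at least one, respecting each objective's direction."""
    at_least_as_good = all((x >= y) if mx else (x <= y) for x, y, mx in zip(a, b, maximize))
    strictly_better = any((x > y) if mx else (x < y) for x, y, mx in zip(a, b, maximize))
    return at_least_as_good and strictly_better

# Objectives: (accuracy, cost) with maximize = (True, False), i.e. maximize accuracy, minimize cost.
runs = {"run-1": (0.82, 3.10), "run-2": (0.79, 1.20), "run-3": (0.78, 2.50)}
maximize = (True, False)
frontier = [
    name
    for name, point in runs.items()
    if not any(dominates(other, point, maximize) for key, other in runs.items() if key != name)
]
print(frontier)  # ['run-1', 'run-2'] -- run-3 is dominated by run-2 on both objectives

A run is Pareto-optimal exactly when no other run dominates it, which is what the CLI reports as the frontier and lists separately as dominated experiments.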