themis-eval: themis_eval-0.1.0-py3-none-any.whl → themis_eval-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/cli/commands/visualize.py (new file)
@@ -0,0 +1,299 @@
+ """Visualization commands for interactive charts."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.experiment.comparison import compare_experiments
+ from themis.experiment.visualization import (
+     PLOTLY_AVAILABLE,
+     InteractiveVisualizer,
+     export_interactive_html,
+ )
+
+
+ def visualize_comparison_command(
+     *,
+     run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric: Annotated[str | None, Parameter(help="Metric to visualize")] = None,
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "visualization.html"
+     ),
+     chart_type: Annotated[
+         str,
+         Parameter(help="Chart type: comparison, evolution, dashboard"),
+     ] = "comparison",
+ ) -> int:
+     """Generate interactive visualization for experiments.
+
+     Examples:
+         # Bar chart comparing accuracy across runs
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 \\
+             --metric accuracy \\
+             --output accuracy_comparison.html
+
+         # Evolution chart showing metric over time
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 run-4 \\
+             --metric accuracy \\
+             --chart-type evolution \\
+             --output accuracy_evolution.html
+
+         # Dashboard with multiple metrics
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 \\
+             --chart-type dashboard \\
+             --output dashboard.html
+
+         # Pareto frontier (use the visualize-pareto command with --metric1/--metric2)
+         uv run python -m themis.cli visualize-pareto \\
+             --run-ids run-1 run-2 run-3 \\
+             --metric1 accuracy \\
+             --metric2 cost \\
+             --output pareto.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         # Load experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             include_metadata=True,
+         )
+
+         print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+         # Create visualizer
+         visualizer = InteractiveVisualizer()
+
+         # Generate chart based on type
+         if chart_type == "comparison":
+             if not metric:
+                 metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                 print(f"Using default metric: {metric}")
+
+             print(f"Creating comparison chart for '{metric}'...")
+             fig = visualizer.plot_metric_comparison(comparison, metric)
+
+         elif chart_type == "evolution":
+             if not metric:
+                 metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                 print(f"Using default metric: {metric}")
+
+             print(f"Creating evolution chart for '{metric}'...")
+             fig = visualizer.plot_metric_evolution(comparison, metric)
+
+         elif chart_type == "dashboard":
+             print("Creating dashboard with multiple metrics...")
+             fig = visualizer.create_dashboard(comparison)
+
+         else:
+             print(f"Error: Unknown chart type '{chart_type}'")
+             print("Available: comparison, evolution, dashboard")
+             return 1
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+         print(" Open in browser to interact with chart")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return 1
+
+
+ def visualize_pareto_command(
+     *,
+     run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric1: Annotated[str, Parameter(help="First metric (x-axis)")],
+     metric2: Annotated[str, Parameter(help="Second metric (y-axis)")],
+     maximize1: Annotated[bool, Parameter(help="Maximize metric1")] = True,
+     maximize2: Annotated[bool, Parameter(help="Maximize metric2")] = True,
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "pareto.html"
+     ),
+ ) -> int:
+     """Generate Pareto frontier visualization.
+
+     Examples:
+         # Maximize accuracy, minimize cost
+         uv run python -m themis.cli visualize-pareto \\
+             --run-ids run-1 run-2 run-3 run-4 \\
+             --metric1 accuracy \\
+             --metric2 cost \\
+             --maximize1 true \\
+             --maximize2 false \\
+             --output pareto.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         # Load experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             include_metadata=True,
+         )
+
+         print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+         # Compute Pareto frontier
+         print(f"Computing Pareto frontier for {metric1} and {metric2}...")
+         pareto_ids = comparison.pareto_frontier(
+             objectives=[metric1, metric2],
+             maximize=[maximize1, maximize2],
+         )
+
+         print(f"✓ Found {len(pareto_ids)} Pareto-optimal experiments:")
+         for run_id in pareto_ids:
+             print(f" - {run_id}")
+
+         # Create visualization
+         visualizer = InteractiveVisualizer()
+         fig = visualizer.plot_pareto_frontier(
+             comparison, metric1, metric2, pareto_ids, maximize1, maximize2
+         )
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+         print(" Red points are Pareto-optimal")
+         print(" Blue points are dominated")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return 1
+
+
+ def visualize_distribution_command(
+     *,
+     run_id: Annotated[str, Parameter(help="Run ID to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric: Annotated[str, Parameter(help="Metric to visualize")],
+     plot_type: Annotated[
+         str, Parameter(help="Plot type: histogram, box, violin")
+     ] = "histogram",
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "distribution.html"
+     ),
+ ) -> int:
+     """Generate metric distribution visualization.
+
+     Shows the distribution of a metric across all samples in an experiment.
+
+     Examples:
+         # Histogram of accuracy scores
+         uv run python -m themis.cli visualize-distribution \\
+             --run-id my-run \\
+             --metric accuracy \\
+             --output accuracy_dist.html
+
+         # Violin plot
+         uv run python -m themis.cli visualize-distribution \\
+             --run-id my-run \\
+             --metric accuracy \\
+             --plot-type violin \\
+             --output accuracy_violin.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         import json
+
+         # Load report
+         print(f"Loading report from {storage / run_id}...")
+         report_path = storage / run_id / "report.json"
+
+         if not report_path.exists():
+             print(f"Error: Report not found at {report_path}")
+             return 1
+
+         with report_path.open("r", encoding="utf-8") as f:
+             report_data = json.load(f)
+
+         # Extract evaluation report
+         # Note: This is simplified - in production you'd deserialize properly
+         from themis.core.entities import EvaluationRecord, MetricScore
+         from themis.evaluation.reports import EvaluationReport, MetricAggregate
+
+         # Build evaluation report from JSON
+         records = []
+         for sample_data in report_data.get("samples", []):
+             scores = [
+                 MetricScore(
+                     metric_name=score["metric"],
+                     value=score["value"],
+                     details=score.get("details"),
+                     metadata=score.get("metadata", {}),
+                 )
+                 for score in sample_data["scores"]
+             ]
+             records.append(
+                 EvaluationRecord(
+                     sample_id=sample_data["sample_id"],
+                     scores=scores,
+                     failures=[],
+                 )
+             )
+
+         # Build metric aggregates
+         metrics = {}
+         for metric_data in report_data.get("metrics", []):
+             metrics[metric_data["name"]] = MetricAggregate(
+                 count=metric_data["count"],
+                 mean=metric_data["mean"],
+             )
+
+         eval_report = EvaluationReport(
+             records=records,
+             metrics=metrics,
+             failures=[],
+         )
+
+         print(f"✓ Loaded report with {len(records)} samples")
+
+         # Create visualization
+         visualizer = InteractiveVisualizer()
+         fig = visualizer.plot_metric_distribution(eval_report, metric, plot_type)
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ __all__ = [
+     "visualize_comparison_command",
+     "visualize_pareto_command",
+     "visualize_distribution_command",
+ ]