themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
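The headline change is a unified themis.evaluate() entry point (added in themis/api.py) that the rewritten CLI below delegates to. As orientation only, here is a minimal sketch of how that API is driven, inferred from the CLI code in this diff; the keyword arguments mirror the `themis eval` options, and the exact signature and report fields live in themis/api.py, not here.

    import themis

    # Inferred from the CLI below, not the authoritative signature:
    # benchmark name first, generation options as keywords.
    report = themis.evaluate(
        "math500",
        model="gpt-4",
        limit=100,
        temperature=0.0,
        max_tokens=512,
        resume=True,
    )

    # Fields the CLI reads when printing results.
    for agg in report.evaluation_report.aggregates:
        print(agg.metric_name, agg.mean, agg.std)
    print(len(report.generation_results), "samples,", len(report.failures), "failures")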
themis/cli/main.py CHANGED
@@ -1,25 +1,16 @@
-"""Cyclopts-powered CLI entrypoints for Themis."""
+"""Simplified CLI for Themis - Five core commands only.
+
+This is the new unified CLI that leverages the themis.evaluate() API.
+It replaces 20+ commands with 5 essential ones.
+"""
 
 from __future__ import annotations
 
-from typing import Sequence
-
-from cyclopts import App
-
-# Import command modules
-from themis.cli.commands import (
-    benchmarks,
-    comparison,
-    config_commands,
-    cost,
-    demo,
-    info,
-    leaderboard,
-    sample_run,
-    visualize,
-)
-from themis.cli.commands import math_benchmarks as math_cmds
-from themis.cli.commands import mcq_benchmarks as mcq_cmds
+import sys
+from pathlib import Path
+from typing import Annotated, Sequence
+
+from cyclopts import App, Parameter
 
 # Import provider modules to ensure they register themselves
 try:
@@ -29,63 +20,442 @@ try:
         vllm_provider,  # noqa: F401
     )
 except ImportError:
-    pass  # Some providers may not be available
+    pass
+
+app = App(
+    name="themis",
+    help="Dead simple LLM evaluation platform",
+    version="2.0.0-alpha.1",
+)
+
 
-app = App(help="Run Themis experiments from the command line")
+@app.command
+def eval(
+    benchmark_or_dataset: Annotated[str, Parameter(name="BENCHMARK_OR_DATASET", show_default=False)],
+    *,
+    model: Annotated[str, Parameter(help="Model identifier (e.g., 'gpt-4', 'claude-3-opus')")],
+    limit: Annotated[int | None, Parameter(help="Maximum number of samples")] = None,
+    prompt: Annotated[str | None, Parameter(help="Custom prompt template")] = None,
+    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+    max_tokens: Annotated[int, Parameter(help="Maximum tokens to generate")] = 512,
+    storage: Annotated[str | None, Parameter(help="Storage location (local path or s3://...)")] = None,
+    run_id: Annotated[str | None, Parameter(help="Unique run identifier")] = None,
+    resume: Annotated[bool, Parameter(help="Resume from cached results")] = True,
+    distributed: Annotated[bool, Parameter(help="Use distributed execution with Ray")] = False,
+    workers: Annotated[int, Parameter(help="Number of workers for distributed execution")] = 4,
+    output: Annotated[str | None, Parameter(help="Output file (CSV, JSON, or HTML)")] = None,
+) -> int:
+    """Run an evaluation on a benchmark or custom dataset.
+
+    Examples:
+        # Simple benchmark
+        themis eval math500 --model gpt-4 --limit 100
+
+        # Custom dataset
+        themis eval data.jsonl --model claude-3-opus --prompt "Q: {question}\\nA:"
+
+        # Distributed execution
+        themis eval gsm8k --model gpt-4 --distributed --workers 8
+    """
+    import themis
+    from themis.experiment import export as experiment_export
+
+    print(f"Running evaluation: {benchmark_or_dataset}")
+    print(f"Model: {model}")
+    if limit:
+        print(f"Limit: {limit} samples")
+    print()
+
+    # Check if it's a file (custom dataset)
+    if Path(benchmark_or_dataset).exists():
+        print(f"Loading custom dataset from: {benchmark_or_dataset}")
+        # TODO: Load dataset from file
+        print("Error: Custom dataset files not yet implemented")
+        return 1
+
+    try:
+        # Run evaluation using unified API
+        report = themis.evaluate(
+            benchmark_or_dataset,
+            model=model,
+            limit=limit,
+            prompt=prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            storage=storage,
+            run_id=run_id,
+            resume=resume,
+            distributed=distributed,
+            workers=workers,
+        )
+
+        # Print results
+        print("\n" + "=" * 80)
+        print("EVALUATION RESULTS")
+        print("=" * 80)
+
+        # Print metrics
+        eval_report = report.evaluation_report
+        if eval_report and eval_report.aggregates:
+            print("\nMetrics:")
+            for agg in eval_report.aggregates:
+                print(f" {agg.metric_name}: {agg.mean:.4f} (±{agg.std:.4f})")
+
+        # Print sample counts
+        total = len(report.generation_results)
+        failures = len(report.failures)
+        successful = total - failures
+        print(f"\nSamples: {successful}/{total} successful")
+
+        # Export if requested
+        if output:
+            output_path = Path(output)
+            suffix = output_path.suffix.lower()
+
+            if suffix == ".csv":
+                experiment_export.export_csv(report, output_path)
+                print(f"\nExported to CSV: {output_path}")
+            elif suffix == ".json":
+                experiment_export.export_json(report, output_path)
+                print(f"\nExported to JSON: {output_path}")
+            elif suffix in [".html", ".htm"]:
+                experiment_export.export_html(report, output_path)
+                print(f"\nExported to HTML: {output_path}")
+            else:
+                print(f"\nWarning: Unknown output format: {suffix}")
+
+        return 0
+
+    except Exception as e:
+        print(f"\nError: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+@app.command
+def compare(
+    run_ids: Annotated[list[str], Parameter(name="RUN_IDS", show_default=False)],
+    *,
+    metric: Annotated[str | None, Parameter(help="Metric to compare")] = None,
+    output: Annotated[str | None, Parameter(help="Output file (HTML or Markdown)")] = None,
+    show_diff: Annotated[bool, Parameter(help="Show examples where results differ")] = False,
+) -> int:
+    """Compare results from multiple runs with statistical tests.
+
+    Performs pairwise comparisons across all specified runs and metrics,
+    computing win/loss matrices and statistical significance.
+
+    Examples:
+        # Compare two runs
+        themis compare run-1 run-2
+
+        # Compare with specific metric
+        themis compare run-1 run-2 run-3 --metric ExactMatch
+
+        # Export to HTML
+        themis compare run-1 run-2 --output comparison.html --show-diff
+    """
+    try:
+        if len(run_ids) < 2:
+            print("Error: Need at least 2 runs to compare", file=sys.stderr)
+            return 1
+
+        # Determine storage path (default to .cache/experiments)
+        storage_path = Path(".cache/experiments")
+
+        if not storage_path.exists():
+            print(f"Error: Storage path not found: {storage_path}", file=sys.stderr)
+            print(f"Tip: Specify storage path with THEMIS_STORAGE env var", file=sys.stderr)
+            return 1
+
+        # Run comparison
+        print(f"Comparing {len(run_ids)} runs: {', '.join(run_ids)}")
+        print(f"Storage: {storage_path}")
+        print()
+
+        from themis.comparison import compare_runs
+        from themis.comparison.statistics import StatisticalTest
+
+        metrics_list = [metric] if metric else None
+
+        report = compare_runs(
+            run_ids=run_ids,
+            storage_path=storage_path,
+            metrics=metrics_list,
+            statistical_test=StatisticalTest.BOOTSTRAP,
+            alpha=0.05,
+        )
+
+        # Print summary
+        print(report.summary(include_details=show_diff))
+
+        # Export if requested
+        if output:
+            output_path = Path(output)
+            suffix = output_path.suffix.lower()
+
+            if suffix == ".json":
+                import json
+                output_path.write_text(json.dumps(report.to_dict(), indent=2))
+                print(f"\n✓ Exported to JSON: {output_path}")
+            elif suffix == ".html":
+                html = _generate_comparison_html(report)
+                output_path.write_text(html)
+                print(f"\n✓ Exported to HTML: {output_path}")
+            elif suffix == ".md":
+                md = _generate_comparison_markdown(report)
+                output_path.write_text(md)
+                print(f"\n✓ Exported to Markdown: {output_path}")
+            else:
+                print(f"\nWarning: Unknown output format: {suffix}", file=sys.stderr)
+
+        return 0
+
+    except Exception as e:
+        print(f"\nError: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
 
-# Register demo command
-app.command(name="demo")(demo.demo_command)
 
-# Register math benchmark commands
-app.command(name="math500")(math_cmds.math500_command)
-app.command(name="aime24")(math_cmds.aime24_command)
-app.command(name="aime25")(math_cmds.aime25_command)
-app.command(name="amc23")(math_cmds.amc23_command)
-app.command(name="olympiadbench")(math_cmds.olympiadbench_command)
-app.command(name="beyondaime")(math_cmds.beyond_aime_command)
+@app.command
+def serve(
+    *,
+    port: Annotated[int, Parameter(help="Port to run server on")] = 8080,
+    host: Annotated[str, Parameter(help="Host to bind to")] = "127.0.0.1",
+    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
+    reload: Annotated[bool, Parameter(help="Enable auto-reload (dev mode)")] = False,
+) -> int:
+    """Start the Themis API server with REST and WebSocket endpoints.
+
+    Provides:
+    - REST API for listing/viewing runs
+    - Comparison endpoints with statistical tests
+    - WebSocket for real-time monitoring
+    - Interactive API docs at /docs
+
+    Examples:
+        # Start server on default port
+        themis serve
+
+        # Custom port and storage
+        themis serve --port 3000 --storage ~/.themis/runs
+
+        # Development mode with auto-reload
+        themis serve --reload
+    """
+    try:
+        from themis.server import create_app
+        import uvicorn
+    except ImportError:
+        print("Error: FastAPI server dependencies not installed", file=sys.stderr)
+        print("\nInstall with: pip install themis[server]", file=sys.stderr)
+        print(" or: uv pip install themis[server]", file=sys.stderr)
+        return 1
+
+    # Determine storage path
+    storage_path = Path(storage) if storage else Path(".cache/experiments")
+
+    print(f"Starting Themis API server...")
+    print(f" URL: http://{host}:{port}")
+    print(f" Storage: {storage_path}")
+    print(f" Docs: http://{host}:{port}/docs")
+    print()
+
+    # Create app
+    app_instance = create_app(storage_path=storage_path)
+
+    # Run server
+    uvicorn.run(
+        app_instance,
+        host=host,
+        port=port,
+        reload=reload,
+        log_level="info",
+    )
+
+    return 0
 
-# Register MCQ benchmark commands
-app.command(name="supergpqa")(mcq_cmds.supergpqa_command)
-app.command(name="mmlu-pro")(mcq_cmds.mmlu_pro_command)
 
-# Register config commands
-app.command(name="run-config")(config_commands.run_configured_experiment)
-app.command(name="validate-config")(config_commands.validate_config)
-app.command(name="init")(config_commands.init_config)
+@app.command
+def list(
+    what: Annotated[str, Parameter(name="WHAT", show_default=False)],
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
+    limit: Annotated[int | None, Parameter(help="Limit number of results")] = None,
+) -> int:
+    """List runs, benchmarks, or available metrics.
+
+    Args:
+        what: What to list (runs, benchmarks, or metrics)
+
+    Examples:
+        # List all runs
+        themis list runs
+
+        # List available benchmarks
+        themis list benchmarks
+
+        # List available metrics
+        themis list metrics
+    """
+    # Validate input
+    if what not in ["runs", "benchmarks", "metrics"]:
+        print(f"Error: '{what}' is not valid. Choose from: runs, benchmarks, metrics")
+        return 1
+
+    if what == "benchmarks":
+        from themis.presets import list_benchmarks
+
+        benchmarks = list_benchmarks()
+        print("Available benchmarks:")
+        for benchmark in benchmarks:
+            print(f" - {benchmark}")
+        return 0
+
+    elif what == "metrics":
+        print("Available metrics:")
+        print(" Math:")
+        print(" - exact_match")
+        print(" - math_verify")
+        print(" General:")
+        print(" - response_length")
+        print("\n Note: NLP and code metrics will be added in Phase 2")
+        return 0
+
+    elif what == "runs":
+        print("Listing runs...")
+        print("Note: Run listing not yet fully implemented")
+        return 1
+
+    return 0
 
-# Register info and listing commands
-app.command(name="list-providers")(benchmarks.list_providers)
-app.command(name="list-benchmarks")(benchmarks.list_benchmarks)
-app.command(name="info")(info.show_info)
-app.command(name="new-project")(info.new_project)
 
-# Register comparison commands
-app.command(name="compare")(comparison.compare_command)
-app.command(name="diff")(comparison.diff_command)
-app.command(name="pareto")(comparison.pareto_command)
+@app.command
+def clean(
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage path to clean")] = None,
+    older_than: Annotated[int | None, Parameter(help="Remove runs older than N days")] = None,
+    dry_run: Annotated[bool, Parameter(help="Show what would be deleted")] = False,
+) -> int:
+    """Clean up old runs and cached data.
+
+    Examples:
+        # Dry run to see what would be deleted
+        themis clean --dry-run
+
+        # Remove runs older than 30 days
+        themis clean --older-than 30
+    """
+    print("Cleaning storage...")
+    print("Note: Storage cleanup not yet implemented")
+    print("This will be implemented in Phase 6")
+    return 1
 
-# Register cost commands
-app.command(name="estimate-cost")(cost.estimate_cost_command)
-app.command(name="show-pricing")(cost.show_pricing_command)
 
-# Register visualization commands
-app.command(name="visualize")(visualize.visualize_comparison_command)
-app.command(name="visualize-pareto")(visualize.visualize_pareto_command)
-app.command(name="visualize-distribution")(visualize.visualize_distribution_command)
+def _generate_comparison_html(report) -> str:
+    """Generate HTML report for comparison."""
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<title>Comparison Report</title>
+<style>
+body {{ font-family: Arial, sans-serif; margin: 20px; }}
+h1, h2, h3 {{ color: #333; }}
+table {{ border-collapse: collapse; margin: 20px 0; }}
+th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
+th {{ background-color: #f2f2f2; }}
+.win {{ background-color: #d4edda; }}
+.loss {{ background-color: #f8d7da; }}
+.tie {{ background-color: #fff3cd; }}
+.significant {{ font-weight: bold; color: #28a745; }}
+</style>
+</head>
+<body>
+<h1>Comparison Report</h1>
+<p><strong>Runs:</strong> {', '.join(report.run_ids)}</p>
+<p><strong>Metrics:</strong> {', '.join(report.metrics)}</p>
+<p><strong>Overall Best:</strong> {report.overall_best_run}</p>
+
+<h2>Best Run Per Metric</h2>
+<ul>
+"""
+
+    for metric, run_id in report.best_run_per_metric.items():
+        html += f" <li><strong>{metric}:</strong> {run_id}</li>\n"
+
+    html += """ </ul>
+
+<h2>Win/Loss Matrices</h2>
+"""
+
+    for metric, matrix in report.win_loss_matrices.items():
+        html += f" <h3>{metric}</h3>\n"
+        html += " <table>\n"
+        html += " <tr><th>Run</th>"
+        for rid in matrix.run_ids:
+            html += f"<th>{rid}</th>"
+        html += "</tr>\n"
+
+        for i, run_id in enumerate(matrix.run_ids):
+            html += f" <tr><td><strong>{run_id}</strong></td>"
+            for j in range(len(matrix.run_ids)):
+                result = matrix.matrix[i][j]
+                css_class = result if result in ["win", "loss", "tie"] else ""
+                html += f'<td class="{css_class}">{result}</td>'
+            html += "</tr>\n"
+
+        html += " </table>\n"
+
+    html += """</body>
+</html>"""
+
+    return html
 
-# Register leaderboard command
-app.command(name="leaderboard")(leaderboard.leaderboard_command)
 
-# Register sample-run command
-app.command(name="sample-run")(sample_run.sample_run_command)
+def _generate_comparison_markdown(report) -> str:
+    """Generate Markdown report for comparison."""
+    lines = []
+    lines.append("# Comparison Report")
+    lines.append("")
+    lines.append(f"**Runs:** {', '.join(report.run_ids)}")
+    lines.append(f"**Metrics:** {', '.join(report.metrics)}")
+    lines.append(f"**Overall Best:** {report.overall_best_run}")
+    lines.append("")
+
+    lines.append("## Best Run Per Metric")
+    lines.append("")
+    for metric, run_id in report.best_run_per_metric.items():
+        lines.append(f"- **{metric}:** {run_id}")
+    lines.append("")
+
+    lines.append("## Win/Loss Matrices")
+    lines.append("")
+    for metric, matrix in report.win_loss_matrices.items():
+        lines.append(f"### {metric}")
+        lines.append("")
+        lines.append(matrix.to_table())
+        lines.append("")
+
+    return "\n".join(lines)
 
 
 def main(argv: Sequence[str] | None = None) -> int:
+    """Main CLI entry point."""
     parsed_argv = list(argv) if argv is not None else None
     try:
         result = app(parsed_argv)
     except SystemExit as exc:  # pragma: no cover - CLI integration path
         return int(exc.code or 0)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user")
+        return 130
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
     return int(result) if isinstance(result, int) else 0
 
 
themis/comparison/__init__.py ADDED
@@ -0,0 +1,25 @@
+"""Comparison engine for analyzing multiple experiment runs.
+
+This module provides tools for comparing different models, prompts, or
+configurations across multiple runs with statistical rigor.
+"""
+
+from themis.comparison.engine import ComparisonEngine, compare_runs
+from themis.comparison.reports import ComparisonReport, ComparisonResult
+from themis.comparison.statistics import (
+    StatisticalTest,
+    bootstrap_confidence_interval,
+    permutation_test,
+    t_test,
+)
+
+__all__ = [
+    "ComparisonEngine",
+    "compare_runs",
+    "ComparisonReport",
+    "ComparisonResult",
+    "StatisticalTest",
+    "bootstrap_confidence_interval",
+    "permutation_test",
+    "t_test",
+]
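The new themis.comparison package also supports programmatic use. Below is a minimal sketch that mirrors how the `themis compare` command in this diff calls compare_runs; the run IDs and storage path are illustrative placeholders, and the report methods (summary(), to_dict()) are used here exactly as the CLI uses them, not independently documented.

    from pathlib import Path

    from themis.comparison import compare_runs
    from themis.comparison.statistics import StatisticalTest

    # Same call shape as the CLI's compare command above.
    report = compare_runs(
        run_ids=["run-1", "run-2"],                  # placeholder run identifiers
        storage_path=Path(".cache/experiments"),     # default storage used by the CLI
        metrics=None,                                # None compares all shared metrics
        statistical_test=StatisticalTest.BOOTSTRAP,  # bootstrap significance test
        alpha=0.05,
    )
    print(report.summary(include_details=True))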