themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/server/app.py CHANGED
@@ -17,7 +17,8 @@ from pydantic import BaseModel, Field
 
 from themis.comparison import compare_runs
 from themis.comparison.statistics import StatisticalTest
-from themis.experiment.storage import ExperimentStorage
+from themis.storage import ExperimentStorage
+from themis._version import __version__
 
 
 class RunSummary(BaseModel):
@@ -71,7 +72,7 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     app = FastAPI(
         title="Themis API",
         description="REST API for Themis experiment management",
-        version="2.0.0",
+        version=__version__,
     )
 
     # Enable CORS for web dashboard
@@ -117,13 +118,16 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
         return {
             "status": "ok",
             "service": "themis-api",
-            "version": "2.0.0",
+            "version": __version__,
         }
 
     @app.get("/api/runs", response_model=List[RunSummary], tags=["runs"])
     async def list_runs():
         """List all experiment runs."""
-        run_ids = storage.list_runs()
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
 
         summaries = []
         for run_id in run_ids:
@@ -133,15 +137,12 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
             # Calculate average metrics
             metrics_dict: Dict[str, List[float]] = {}
             for record in eval_records.values():
-                for metric_name, score_obj in record.scores.items():
+                for score_obj in record.scores:
+                    metric_name = score_obj.metric_name
                     if metric_name not in metrics_dict:
                         metrics_dict[metric_name] = []
-
-                    # Extract numeric score
-                    if hasattr(score_obj, 'value'):
-                        metrics_dict[metric_name].append(score_obj.value)
-                    elif isinstance(score_obj, (int, float)):
-                        metrics_dict[metric_name].append(float(score_obj))
+
+                    metrics_dict[metric_name].append(score_obj.value)
 
             # Average metrics
             avg_metrics = {
@@ -162,7 +163,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.get("/api/runs/{run_id}", response_model=RunDetail, tags=["runs"])
     async def get_run(run_id: str):
         """Get detailed information about a run."""
-        if run_id not in storage.list_runs():
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Load records
@@ -179,25 +184,25 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
 
             # Extract scores
             scores = {}
-            for metric_name, score_obj in eval_record.scores.items():
-                if hasattr(score_obj, 'value'):
-                    value = score_obj.value
-                elif isinstance(score_obj, (int, float)):
-                    value = float(score_obj)
-                else:
-                    continue
-
+            for score_obj in eval_record.scores:
+                metric_name = score_obj.metric_name
+                value = score_obj.value
+
                 scores[metric_name] = value
-
+
                 if metric_name not in metrics_dict:
                     metrics_dict[metric_name] = []
                 metrics_dict[metric_name].append(value)
 
             # Build sample
+            sample_id = eval_record.sample_id
+            if sample_id is None and gen_record is not None:
+                sample_id = gen_record.task.metadata.get("dataset_id")
+
             sample = {
-                "id": gen_record.id if gen_record else cache_key,
-                "prompt": gen_record.prompt if gen_record else "",
-                "response": gen_record.response if gen_record else "",
+                "id": sample_id or cache_key,
+                "prompt": gen_record.task.prompt.text if gen_record else "",
+                "response": gen_record.output.text if gen_record and gen_record.output else "",
                 "scores": scores,
             }
             samples.append(sample)
@@ -220,7 +225,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.delete("/api/runs/{run_id}", tags=["runs"])
     async def delete_run(run_id: str):
         """Delete a run."""
-        if run_id not in storage.list_runs():
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Note: Current storage doesn't implement delete
@@ -234,7 +243,10 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     async def compare_runs_api(request: ComparisonRequest):
         """Compare multiple runs."""
         # Validate runs exist
-        existing_runs = set(storage.list_runs())
+        run_entries = storage.list_runs()
+        existing_runs = set(
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        )
         for run_id in request.run_ids:
             if run_id not in existing_runs:
                 raise HTTPException(
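The endpoints above now normalize whatever `storage.list_runs()` returns (rich entries or plain run-id strings) and report the real package version from `themis._version`. A minimal smoke-test sketch, not part of the diff, assuming the health route is mounted at `/health` and using a hypothetical storage path:

```python
# Illustrative sketch only; the /health path and storage path are assumptions.
from fastapi.testclient import TestClient

from themis.server.app import create_app

app = create_app(storage_path=".cache/experiments")
client = TestClient(app)

response = client.get("/health")
assert response.status_code == 200
# "version" is now populated from themis._version.__version__ instead of a hard-coded "2.0.0".
print(response.json()["version"])
```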
themis/session.py ADDED
@@ -0,0 +1,125 @@
+"""Experiment session orchestration for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Sequence
+
+from themis.core.entities import ExperimentReport, ModelSpec, SamplingConfig
+from themis.evaluation.pipeline import EvaluationPipelineContract
+from themis.experiment.orchestrator import ExperimentOrchestrator
+from themis.generation.plan import GenerationPlan
+from themis.generation.router import ProviderRouter
+from themis.generation.runner import GenerationRunner
+from themis.generation.templates import PromptTemplate
+from themis.interfaces import DatasetAdapter
+from themis.presets import parse_model_name
+from themis.providers import create_provider
+from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec
+
+
+@dataclass
+class ExperimentSession:
+    """Main entry point for running experiments with vNext specs."""
+
+    def run(
+        self,
+        spec: ExperimentSpec,
+        *,
+        execution: ExecutionSpec | None = None,
+        storage: StorageSpec | None = None,
+    ) -> ExperimentReport:
+        execution = execution or ExecutionSpec()
+        storage = storage or StorageSpec()
+
+        pipeline = spec.pipeline
+        if not isinstance(pipeline, EvaluationPipelineContract):
+            raise TypeError(
+                "spec.pipeline must implement EvaluationPipelineContract."
+            )
+
+        dataset = _resolve_dataset(spec.dataset)
+
+        provider_name, model_id, provider_options = _parse_model(spec.model)
+        model_spec = ModelSpec(identifier=model_id, provider=provider_name)
+        sampling = _build_sampling(spec.sampling)
+
+        plan = GenerationPlan(
+            templates=[PromptTemplate(name="default", template=spec.prompt)],
+            models=[model_spec],
+            sampling_parameters=[sampling],
+            dataset_id_field="id",
+            reference_field="answer",
+        )
+
+        provider = create_provider(provider_name, **provider_options)
+        router = ProviderRouter({(provider_name, model_id): provider})
+
+        runner = GenerationRunner(
+            provider=router,
+            max_parallel=execution.workers,
+            max_retries=execution.max_retries,
+            retry_initial_delay=execution.retry_initial_delay,
+            retry_backoff_multiplier=execution.retry_backoff_multiplier,
+            retry_max_delay=execution.retry_max_delay,
+            execution_backend=execution.backend,
+        )
+
+        storage_backend = _resolve_storage(storage)
+
+        orchestrator = ExperimentOrchestrator(
+            generation_plan=plan,
+            generation_runner=runner,
+            evaluation_pipeline=pipeline,
+            storage=storage_backend,
+        )
+
+        return orchestrator.run(
+            dataset=dataset,
+            run_id=spec.run_id,
+            resume=storage.cache,
+            cache_results=storage.cache,
+        )
+
+
+def _parse_model(model: str) -> tuple[str, str, dict]:
+    if ":" in model:
+        provider_name, model_id = model.split(":", 1)
+        return provider_name, model_id, {}
+    return parse_model_name(model)
+
+
+def _build_sampling(data: dict) -> SamplingConfig:
+    return SamplingConfig(
+        temperature=float(data.get("temperature", 0.0)),
+        top_p=float(data.get("top_p", 0.95)),
+        max_tokens=int(data.get("max_tokens", 512)),
+    )
+
+
+def _resolve_dataset(dataset: object) -> list[dict]:
+    if isinstance(dataset, DatasetAdapter):
+        return list(dataset.iter_samples())
+    if isinstance(dataset, Iterable):
+        return list(dataset)  # type: ignore[arg-type]
+    raise TypeError("spec.dataset must be iterable or implement DatasetAdapter.")
+
+
+def _resolve_storage(storage: StorageSpec):
+    if storage.backend is not None:
+        backend = storage.backend
+        if hasattr(backend, "experiment_storage"):
+            return backend.experiment_storage
+        if not hasattr(backend, "start_run"):
+            raise TypeError(
+                "storage.backend must be ExperimentStorage-compatible."
+            )
+        return backend
+    root = Path(storage.path) if storage.path is not None else Path(".cache/experiments")
+    from themis.storage import ExperimentStorage
+
+    return ExperimentStorage(root)
+
+
+__all__ = ["ExperimentSession"]
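Together with the spec modules that follow, this module is the new single entry point for vNext runs. A usage sketch, not part of the diff: the dataset rows, prompt text, run id, and `"provider:model"` string are hypothetical, and `pipeline` is a placeholder for any object implementing `EvaluationPipelineContract`.

```python
# Illustrative sketch only; names marked below are assumptions, not package content.
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

pipeline = ...  # placeholder: an EvaluationPipelineContract implementation built elsewhere

spec = ExperimentSpec(
    dataset=[{"id": "1", "question": "What is 2 + 2?", "answer": "4"}],  # hypothetical rows
    prompt="Answer the question: {question}",  # hypothetical template text
    model="openai:gpt-4",  # "provider:model" strings are split on the first ":"
    sampling={"temperature": 0.0, "top_p": 0.95, "max_tokens": 512},
    pipeline=pipeline,
    run_id="demo-run",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(workers=4, max_retries=3),
    storage=StorageSpec(path=".cache/experiments", cache=True),
)
```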
themis/specs/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""Specification models for vNext workflows."""
+
+from themis.specs.experiment import ExperimentSpec
+from themis.specs.execution import ExecutionSpec
+from themis.specs.storage import StorageSpec
+
+__all__ = ["ExperimentSpec", "ExecutionSpec", "StorageSpec"]
themis/specs/execution.py ADDED
@@ -0,0 +1,26 @@
+"""Execution specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ExecutionSpec:
+    """Execution configuration for running experiments."""
+
+    backend: object | None = None
+    workers: int = 4
+    max_retries: int = 3
+    retry_initial_delay: float = 0.5
+    retry_backoff_multiplier: float = 2.0
+    retry_max_delay: float | None = 2.0
+
+    def __post_init__(self) -> None:
+        if self.workers < 1:
+            raise ValueError("ExecutionSpec.workers must be >= 1.")
+        if self.max_retries < 1:
+            raise ValueError("ExecutionSpec.max_retries must be >= 1.")
+
+
+__all__ = ["ExecutionSpec"]
themis/specs/experiment.py ADDED
@@ -0,0 +1,33 @@
+"""Experiment specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Mapping
+
+
+@dataclass(frozen=True)
+class ExperimentSpec:
+    """Canonical experiment specification.
+
+    This spec is the single source of truth for the experiment's
+    dataset, prompt, model, sampling config, and evaluation pipeline.
+    """
+
+    dataset: object
+    prompt: str
+    model: str
+    sampling: Mapping[str, Any] = field(default_factory=dict)
+    pipeline: object | None = None
+    run_id: str | None = None
+
+    def __post_init__(self) -> None:
+        if not self.prompt:
+            raise ValueError("ExperimentSpec.prompt must be a non-empty string.")
+        if not self.model:
+            raise ValueError("ExperimentSpec.model must be a non-empty string.")
+        if self.pipeline is None:
+            raise ValueError("ExperimentSpec.pipeline must be provided.")
+
+
+__all__ = ["ExperimentSpec"]
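Because the spec is a frozen dataclass with `__post_init__` validation, malformed specs fail fast at construction time. A small sketch of that behaviour, not part of the diff:

```python
# Illustrative sketch only, based on the validation rules shown above.
from themis.specs import ExperimentSpec

try:
    # Empty prompt: rejected before anything runs.
    ExperimentSpec(dataset=[], prompt="", model="gpt-4")
except ValueError as err:
    print(err)  # ExperimentSpec.prompt must be a non-empty string.
```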
themis/specs/storage.py ADDED
@@ -0,0 +1,18 @@
+"""Storage specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class StorageSpec:
+    """Storage configuration for experiment persistence and caching."""
+
+    backend: object | None = None
+    path: str | Path | None = None
+    cache: bool = True
+
+
+__all__ = ["StorageSpec"]
themis/storage/__init__.py ADDED
@@ -0,0 +1,6 @@
+"""Storage backends and adapters for vNext workflows."""
+
+from themis.backends.storage import LocalFileStorageBackend, StorageBackend
+from themis.storage.experiment_storage import ExperimentStorage
+
+__all__ = ["StorageBackend", "LocalFileStorageBackend", "ExperimentStorage"]
themis/storage/experiment_storage.py ADDED
@@ -0,0 +1,7 @@
+"""Storage adapter module for vNext workflows."""
+
+from __future__ import annotations
+
+from themis.experiment.storage import ExperimentStorage as ExperimentStorage
+
+__all__ = ["ExperimentStorage"]
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.3
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-result = evaluate(
-    benchmark="gsm8k",
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-print(f"Accuracy: {result.metrics['exact_match']:.2%}")
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
 
 ### CLI Usage
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes 6 popular benchmarks out-of-the-box:
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
-evaluate(benchmark="math500", model="gpt-4", limit=50)
-evaluate(benchmark="aime24", model="gpt-4")
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
-evaluate(benchmark="supergpqa", model="gpt-4")
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(benchmark="demo", model="fake-math-llm", limit=10)
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --test bootstrap --output comparison.html
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(benchmark="gsm8k", model="gpt-4")
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(benchmark="gsm8k", model="azure/gpt-4")
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(benchmark="gsm8k", model="ollama/llama3")
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.report)
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
        │              │
   ┌────▼─────┐   ┌────▼─────┐
   │Benchmarks│   │Evaluation│
-  │(6 built- │   │ Pipeline │
+  │(19 built-│   │ Pipeline │
   │   in)    │   └────┬─────┘
   └──────────┘        │
                  ┌────▼─────┐
@@ -359,7 +368,7 @@
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/customization/backends.md](docs/customization/backends.md) for details.
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME    Restrict to one metric
 #   --storage PATH   Storage directory
-#   --test STR       Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT    Significance level (default: 0.05)
 #   --output FILE    Export report (.json, .html, .md)
+#   --show-diff      Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---