themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""Local storage helpers for experiment datasets and cached records."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, Iterable, List
|
|
9
|
+
|
|
10
|
+
from themis.core import entities as core_entities
|
|
11
|
+
from themis.core import serialization as core_serialization
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def task_cache_key(task: core_entities.GenerationTask) -> str:
|
|
15
|
+
"""Derive a stable cache key for a generation task."""
|
|
16
|
+
|
|
17
|
+
dataset_raw = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
|
|
18
|
+
dataset_id = str(dataset_raw) if dataset_raw is not None else ""
|
|
19
|
+
prompt_hash = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
|
|
20
|
+
sampling = task.sampling
|
|
21
|
+
sampling_key = (
|
|
22
|
+
f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
|
|
23
|
+
)
|
|
24
|
+
template = task.prompt.spec.name
|
|
25
|
+
model = task.model.identifier
|
|
26
|
+
return "::".join(
|
|
27
|
+
filter(None, [dataset_id, template, model, sampling_key, prompt_hash])
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ExperimentStorage:
    """Persists datasets and generation records for resumability/caching.

    Each run gets its own directory under *root* containing JSONL files:
    ``dataset.jsonl`` (cached input rows), ``tasks.jsonl`` (de-duplicated
    serialized tasks), ``records.jsonl`` (generation outputs keyed by
    ``cache_key``) and ``evaluation.jsonl`` (evaluation results keyed by
    ``cache_key``).
    """

    def __init__(self, root: str | Path) -> None:
        """Initialize storage rooted at *root*, creating it if needed."""
        self._root = Path(root)
        self._root.mkdir(parents=True, exist_ok=True)
        # Per-run set of task keys already written to tasks.jsonl, so
        # _persist_task can skip duplicates without re-reading the file.
        self._task_index: dict[str, set[str]] = {}

    def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
        """Write *dataset* rows for *run_id* as JSONL, replacing any cache."""
        path = self._dataset_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as handle:
            for row in dataset:
                handle.write(json.dumps(row) + "\n")

    def load_dataset(self, run_id: str) -> List[dict[str, object]]:
        """Return the cached dataset rows for *run_id*.

        Raises:
            FileNotFoundError: if no dataset cache exists for the run.
        """
        path = self._dataset_path(run_id)
        if not path.exists():
            raise FileNotFoundError(f"Dataset cache not found for run '{run_id}'")
        rows: list[dict[str, object]] = []
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                # Tolerate blank lines, consistent with the other JSONL
                # readers in this class (load_cached_records, _load_tasks, ...).
                if not line.strip():
                    continue
                rows.append(json.loads(line))
        return rows

    def append_record(
        self,
        run_id: str,
        record: core_entities.GenerationRecord,
        *,
        cache_key: str | None = None,
    ) -> None:
        """Append a generation *record* to the run's records file.

        Args:
            run_id: Unique run identifier.
            record: Generation record to persist.
            cache_key: Optional explicit key; defaults to
                ``task_cache_key(record.task)``.
        """
        path = self._records_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = self._serialize_record(run_id, record)
        payload["cache_key"] = cache_key or task_cache_key(record.task)
        with path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(payload) + "\n")

    def load_cached_records(
        self, run_id: str
    ) -> Dict[str, core_entities.GenerationRecord]:
        """Return cached generation records keyed by ``cache_key``.

        Entries without a ``cache_key`` are skipped; for duplicate keys
        the last entry in the file wins.
        """
        path = self._records_path(run_id)
        if not path.exists():
            return {}
        tasks = self._load_tasks(run_id)
        records: dict[str, core_entities.GenerationRecord] = {}
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                data = json.loads(line)
                key = data.get("cache_key")
                if not key:
                    continue
                record = self._deserialize_record(data, tasks)
                records[key] = record
        return records

    def append_evaluation(
        self,
        run_id: str,
        record: core_entities.GenerationRecord,
        evaluation: core_entities.EvaluationRecord,
    ) -> None:
        """Append an *evaluation* for *record* to the run's evaluation file."""
        path = self._evaluation_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "cache_key": task_cache_key(record.task),
            "evaluation": core_serialization.serialize_evaluation_record(evaluation),
        }
        with path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(payload) + "\n")

    def load_cached_evaluations(
        self, run_id: str
    ) -> Dict[str, core_entities.EvaluationRecord]:
        """Return cached evaluation records keyed by ``cache_key``."""
        path = self._evaluation_path(run_id)
        if not path.exists():
            return {}
        evaluations: dict[str, core_entities.EvaluationRecord] = {}
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                data = json.loads(line)
                key = data.get("cache_key")
                if not key:
                    continue
                evaluations[key] = core_serialization.deserialize_evaluation_record(
                    data["evaluation"]
                )
        return evaluations

    def get_run_path(self, run_id: str) -> Path:
        """Get the filesystem path for a run's storage directory.

        Args:
            run_id: Unique run identifier

        Returns:
            Path to the run's storage directory
        """
        return self._run_dir(run_id)

    def _dataset_path(self, run_id: str) -> Path:
        return self._run_dir(run_id) / "dataset.jsonl"

    def _records_path(self, run_id: str) -> Path:
        return self._run_dir(run_id) / "records.jsonl"

    def _tasks_path(self, run_id: str) -> Path:
        return self._run_dir(run_id) / "tasks.jsonl"

    def _evaluation_path(self, run_id: str) -> Path:
        return self._run_dir(run_id) / "evaluation.jsonl"

    def _run_dir(self, run_id: str) -> Path:
        return self._root / run_id

    def _serialize_record(
        self, run_id: str, record: core_entities.GenerationRecord
    ) -> dict[str, object]:
        """Serialize *record* to a JSON-safe dict, persisting its task.

        The task itself is stored once in ``tasks.jsonl`` and referenced
        here by ``task_key``; nested retry attempts are serialized
        recursively.
        """
        task_key = self._persist_task(run_id, record.task)
        payload = {
            "task_key": task_key,
            "output": {
                "text": record.output.text,
                "raw": record.output.raw,
            }
            if record.output
            else None,
            "error": {
                "message": record.error.message,
                "kind": record.error.kind,
                "details": record.error.details,
            }
            if record.error
            else None,
            "metrics": record.metrics,
            "attempts": [
                self._serialize_record(run_id, attempt) for attempt in record.attempts
            ],
        }
        return payload

    def _deserialize_record(
        self, payload: dict[str, object], tasks: dict[str, core_entities.GenerationTask]
    ) -> core_entities.GenerationRecord:
        """Rebuild a GenerationRecord from *payload* using the *tasks* lookup."""
        task_key = payload["task_key"]
        task = tasks[task_key]
        output_data = payload.get("output")
        error_data = payload.get("error")
        record = core_entities.GenerationRecord(
            task=task,
            output=core_entities.ModelOutput(
                text=output_data["text"], raw=output_data.get("raw")
            )
            if output_data
            else None,
            error=core_entities.ModelError(
                message=error_data["message"],
                kind=error_data.get("kind", "model_error"),
                details=error_data.get("details", {}),
            )
            if error_data
            else None,
            metrics=payload.get("metrics", {}),
        )
        record.attempts = [
            self._deserialize_record(attempt, tasks)
            for attempt in payload.get("attempts", [])
        ]
        return record

    def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
        """Write *task* to tasks.jsonl once and return its cache key."""
        key = task_cache_key(task)
        index = self._load_task_index(run_id)
        if key in index:
            return key
        path = self._tasks_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "task_key": key,
            "task": core_serialization.serialize_generation_task(task),
        }
        with path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(payload) + "\n")
        index.add(key)
        return key

    def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
        """Load all persisted tasks for *run_id*, refreshing the key index."""
        path = self._tasks_path(run_id)
        tasks: dict[str, core_entities.GenerationTask] = {}
        if not path.exists():
            return tasks
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                data = json.loads(line)
                task_key = data["task_key"]
                tasks[task_key] = core_serialization.deserialize_generation_task(
                    data["task"]
                )
        self._task_index[run_id] = set(tasks.keys())
        return tasks

    def _load_task_index(self, run_id: str) -> set[str]:
        """Return (and memoize) the set of task keys stored for *run_id*."""
        if run_id in self._task_index:
            return self._task_index[run_id]
        path = self._tasks_path(run_id)
        index: set[str] = set()
        if path.exists():
            with path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    index.add(data["task_key"])
        self._task_index[run_id] = index
        return index
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# Public API of this module.
__all__ = ["ExperimentStorage", "task_cache_key"]
|