themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,255 @@
1
+ """Local storage helpers for experiment datasets and cached records."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Dict, Iterable, List
9
+
10
+ from themis.core import entities as core_entities
11
+ from themis.core import serialization as core_serialization
12
+
13
+
14
def task_cache_key(task: core_entities.GenerationTask) -> str:
    """Derive a stable cache key for a generation task.

    The key is the ``"::"``-joined sequence of: the sample identifier, the
    prompt template name, the model identifier, the sampling settings
    (temperature and top_p with three decimals, then max_tokens), and the
    first 12 hex digits of the SHA-256 of the UTF-8 encoded prompt text.
    Components that are empty are left out of the key.

    Args:
        task: Task whose ``metadata``, ``prompt``, ``sampling`` and ``model``
            attributes are read to build the key.

    Returns:
        The cache key string.
    """
    metadata = task.metadata

    # Prefer ``dataset_id`` and fall back to ``sample_id`` only when it is
    # really absent (None or empty string). A plain ``or`` would also discard
    # legitimate falsy identifiers such as the integer 0.
    identifier = metadata.get("dataset_id")
    if identifier is None or identifier == "":
        identifier = metadata.get("sample_id")
    dataset_id = "" if identifier is None else str(identifier)

    # A short digest keeps the key readable while still separating prompts.
    digest = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()
    prompt_hash = digest[:12]

    sampling = task.sampling
    sampling_key = (
        f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
    )

    components = [
        dataset_id,
        task.prompt.spec.name,
        task.model.identifier,
        sampling_key,
        prompt_hash,
    ]
    return "::".join(part for part in components if part)
29
+
30
+
31
class ExperimentStorage:
    """Persists datasets and generation records for resumability/caching.

    Each run is stored in its own directory ``<root>/<run_id>/`` as JSON Lines
    files: ``dataset.jsonl``, ``tasks.jsonl``, ``records.jsonl`` and
    ``evaluation.jsonl``. Blank lines in any of these files are ignored when
    reading.
    """

    def __init__(self, root: str | Path) -> None:
        """Create the storage rooted at ``root``, creating the directory if needed."""
        self._root = Path(root)
        self._root.mkdir(parents=True, exist_ok=True)
        # run_id -> task keys known to be present in that run's tasks.jsonl.
        self._task_index: dict[str, set[str]] = {}

    # ------------------------------------------------------------------
    # Datasets
    # ------------------------------------------------------------------

    def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
        """Write ``dataset`` to the run's dataset file, replacing any previous copy.

        Args:
            run_id: Identifier of the run the dataset belongs to.
            dataset: Rows to store; each row is written as one JSON line.
        """
        path = self._dataset_path(run_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as handle:
            for row in dataset:
                handle.write(json.dumps(row) + "\n")

    def load_dataset(self, run_id: str) -> List[dict[str, object]]:
        """Return the rows previously stored with :meth:`cache_dataset`.

        Raises:
            FileNotFoundError: If no dataset file exists for ``run_id``.
        """
        path = self._dataset_path(run_id)
        if not path.exists():
            raise FileNotFoundError(f"Dataset cache not found for run '{run_id}'")
        # Shares the blank-line-tolerant reader used by every other loader.
        return list(self._iter_jsonl(path))

    # ------------------------------------------------------------------
    # Generation records
    # ------------------------------------------------------------------

    def append_record(
        self,
        run_id: str,
        record: core_entities.GenerationRecord,
        *,
        cache_key: str | None = None,
    ) -> None:
        """Append ``record`` to the run's records file.

        Args:
            run_id: Identifier of the run the record belongs to.
            record: Generation record to persist; its task (and the tasks of
                its attempts) are written to the tasks file if not yet there.
            cache_key: Key to store the record under. When omitted or empty,
                ``task_cache_key(record.task)`` is used.
        """
        payload = self._serialize_record(run_id, record)
        payload["cache_key"] = cache_key or task_cache_key(record.task)
        self._append_jsonl(self._records_path(run_id), payload)

    def load_cached_records(
        self, run_id: str
    ) -> Dict[str, core_entities.GenerationRecord]:
        """Return the stored generation records of a run, keyed by cache key.

        Entries without a ``cache_key`` are skipped. When a key occurs more
        than once, the entry appearing last in the file wins. Returns an empty
        dict when the run has no records file.
        """
        path = self._records_path(run_id)
        if not path.exists():
            return {}
        tasks = self._load_tasks(run_id)
        records: dict[str, core_entities.GenerationRecord] = {}
        for data in self._iter_jsonl(path):
            key = data.get("cache_key")
            if not key:
                continue
            records[key] = self._deserialize_record(data, tasks)
        return records

    # ------------------------------------------------------------------
    # Evaluations
    # ------------------------------------------------------------------

    def append_evaluation(
        self,
        run_id: str,
        record: core_entities.GenerationRecord,
        evaluation: core_entities.EvaluationRecord,
    ) -> None:
        """Append ``evaluation`` to the run's evaluation file.

        The entry is keyed by ``task_cache_key(record.task)``.
        """
        payload = {
            "cache_key": task_cache_key(record.task),
            "evaluation": core_serialization.serialize_evaluation_record(evaluation),
        }
        self._append_jsonl(self._evaluation_path(run_id), payload)

    def load_cached_evaluations(
        self, run_id: str
    ) -> Dict[str, core_entities.EvaluationRecord]:
        """Return the stored evaluation records of a run, keyed by cache key.

        Entries without a ``cache_key`` are skipped; later entries replace
        earlier ones with the same key. Returns an empty dict when the run has
        no evaluation file.
        """
        path = self._evaluation_path(run_id)
        if not path.exists():
            return {}
        evaluations: dict[str, core_entities.EvaluationRecord] = {}
        for data in self._iter_jsonl(path):
            key = data.get("cache_key")
            if not key:
                continue
            evaluations[key] = core_serialization.deserialize_evaluation_record(
                data["evaluation"]
            )
        return evaluations

    # ------------------------------------------------------------------
    # Paths
    # ------------------------------------------------------------------

    def get_run_path(self, run_id: str) -> Path:
        """Get the filesystem path for a run's storage directory.

        Args:
            run_id: Unique run identifier

        Returns:
            Path to the run's storage directory
        """
        return self._run_dir(run_id)

    def _dataset_path(self, run_id: str) -> Path:
        """Path of the run's cached dataset file."""
        return self._run_dir(run_id) / "dataset.jsonl"

    def _records_path(self, run_id: str) -> Path:
        """Path of the run's generation records file."""
        return self._run_dir(run_id) / "records.jsonl"

    def _tasks_path(self, run_id: str) -> Path:
        """Path of the run's serialized tasks file."""
        return self._run_dir(run_id) / "tasks.jsonl"

    def _evaluation_path(self, run_id: str) -> Path:
        """Path of the run's evaluation records file."""
        return self._run_dir(run_id) / "evaluation.jsonl"

    def _run_dir(self, run_id: str) -> Path:
        """Directory holding every file of the given run."""
        return self._root / run_id

    # ------------------------------------------------------------------
    # JSON Lines helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _iter_jsonl(path: Path) -> Iterable[dict[str, object]]:
        """Yield the decoded JSON value of every non-blank line in ``path``."""
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if line.strip():
                    yield json.loads(line)

    @staticmethod
    def _append_jsonl(path: Path, payload: dict[str, object]) -> None:
        """Append ``payload`` to ``path`` as one JSON line, creating parent dirs."""
        # Serialize before touching the filesystem so a payload that cannot be
        # encoded does not leave an empty file behind.
        line = json.dumps(payload) + "\n"
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as handle:
            handle.write(line)

    # ------------------------------------------------------------------
    # Record (de)serialization
    # ------------------------------------------------------------------

    def _serialize_record(
        self, run_id: str, record: core_entities.GenerationRecord
    ) -> dict[str, object]:
        """Convert ``record`` (and, recursively, its attempts) to a JSON-ready dict.

        The task itself is not embedded: it is persisted through
        :meth:`_persist_task` and referenced by its ``task_key``.
        """
        output = record.output
        error = record.error
        return {
            "task_key": self._persist_task(run_id, record.task),
            "output": {"text": output.text, "raw": output.raw} if output else None,
            "error": {
                "message": error.message,
                "kind": error.kind,
                "details": error.details,
            }
            if error
            else None,
            "metrics": record.metrics,
            "attempts": [
                self._serialize_record(run_id, attempt) for attempt in record.attempts
            ],
        }

    def _deserialize_record(
        self, payload: dict[str, object], tasks: dict[str, core_entities.GenerationTask]
    ) -> core_entities.GenerationRecord:
        """Rebuild a generation record from a dict made by :meth:`_serialize_record`.

        Args:
            payload: Decoded JSON entry of the record.
            tasks: Mapping from task key to task, used to resolve ``task_key``.

        Raises:
            KeyError: If the payload has no ``task_key`` or the key is not in
                ``tasks``.
        """
        task = tasks[payload["task_key"]]
        output_data = payload.get("output")
        error_data = payload.get("error")

        output = None
        if output_data:
            output = core_entities.ModelOutput(
                text=output_data["text"], raw=output_data.get("raw")
            )

        error = None
        if error_data:
            error = core_entities.ModelError(
                message=error_data["message"],
                kind=error_data.get("kind", "model_error"),
                details=error_data.get("details", {}),
            )

        record = core_entities.GenerationRecord(
            task=task,
            output=output,
            error=error,
            metrics=payload.get("metrics", {}),
        )
        record.attempts = [
            self._deserialize_record(attempt, tasks)
            for attempt in payload.get("attempts", [])
        ]
        return record

    # ------------------------------------------------------------------
    # Task persistence
    # ------------------------------------------------------------------

    def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
        """Store ``task`` in the run's tasks file once and return its key."""
        key = task_cache_key(task)
        index = self._load_task_index(run_id)
        if key in index:
            # Already on disk; avoid writing a duplicate entry.
            return key
        payload = {
            "task_key": key,
            "task": core_serialization.serialize_generation_task(task),
        }
        self._append_jsonl(self._tasks_path(run_id), payload)
        index.add(key)
        return key

    def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
        """Read every task stored for the run, keyed by task key.

        Also refreshes the in-memory task index for the run when the tasks
        file exists.
        """
        path = self._tasks_path(run_id)
        tasks: dict[str, core_entities.GenerationTask] = {}
        if not path.exists():
            return tasks
        for data in self._iter_jsonl(path):
            tasks[data["task_key"]] = core_serialization.deserialize_generation_task(
                data["task"]
            )
        self._task_index[run_id] = set(tasks)
        return tasks

    def _load_task_index(self, run_id: str) -> set[str]:
        """Return the set of task keys already persisted for the run.

        The set is read from the tasks file on first use and cached in memory
        afterwards.
        """
        cached = self._task_index.get(run_id)
        if cached is not None:
            return cached
        path = self._tasks_path(run_id)
        index: set[str] = set()
        if path.exists():
            index.update(data["task_key"] for data in self._iter_jsonl(path))
        self._task_index[run_id] = index
        return index
253
+
254
+
255
+ __all__ = ["ExperimentStorage", "task_cache_key"]