themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,164 @@
1
+ """Shared dataclasses that represent Themis' internal world."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Dict, Generic, List, TypeVar
7
+
8
+ if TYPE_CHECKING:
9
+ from themis.evaluation.reports import EvaluationReport
10
+
11
# Type variable used to parameterize the generic ``Reference`` dataclass.
T = TypeVar("T")
13
+
14
+
15
@dataclass(frozen=True)
class SamplingConfig:
    """Frozen record of sampling parameters.

    All three fields are required; the dataclass is declared ``frozen`` so
    attributes cannot be reassigned after construction.
    """

    temperature: float
    top_p: float
    max_tokens: int
20
+
21
+
22
@dataclass(frozen=True)
class ModelSpec:
    """Frozen description of a model: identifier, provider and optional extras."""

    identifier: str
    provider: str
    # ``None`` when the spec carries no default sampling configuration.
    default_sampling: SamplingConfig | None = None
    # Free-form extra data; ``default_factory`` gives each instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
28
+
29
+
30
@dataclass(frozen=True)
class PromptSpec:
    """Frozen record naming a prompt template and holding its template string."""

    name: str
    template: str
    # Free-form extra data; ``default_factory`` gives each instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
35
+
36
+
37
@dataclass(frozen=True)
class PromptRender:
    """Frozen record holding a ``PromptSpec`` together with a prompt text.

    ``context`` and ``metadata`` are optional dictionaries that default to
    fresh empty dicts per instance.
    """

    spec: PromptSpec
    text: str
    # presumably the values used to fill the template -- TODO confirm with callers
    context: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def prompt_text(self) -> str:
        """Return ``text`` (read-only alias)."""
        return self.text

    @property
    def template_name(self) -> str:
        """Return the name of the underlying spec (``spec.name``)."""
        return self.spec.name
51
+
52
+
53
@dataclass(frozen=True)
class Reference(Generic[T]):
    """Reference value with optional type information.

    This is a generic, frozen dataclass that can hold typed reference values.
    For backward compatibility, it can be used without type parameters
    and will behave like Reference[Any].

    Nothing in this class validates ``value`` against ``schema``; the
    ``schema`` field is only stored.

    Examples:
        # Untyped (backward compatible)
        ref = Reference(kind="answer", value="42")

        # Typed
        ref: Reference[str] = Reference(kind="answer", value="42")
        ref: Reference[int] = Reference(kind="answer", value=42)
    """

    kind: str
    value: T
    schema: type[T] | None = None  # Optional runtime type information
73
+
74
+
75
@dataclass(frozen=True)
class ModelOutput:
    """Frozen record of a model response: its text plus optional extras."""

    text: str
    # Optional untyped payload stored alongside the text; defaults to ``None``.
    raw: Any | None = None
    # Token usage: {prompt_tokens, completion_tokens, total_tokens}
    usage: Dict[str, int] | None = None
80
+
81
+
82
@dataclass(frozen=True)
class ModelError:
    """Frozen record describing an error: message, kind and extra details."""

    message: str
    # Category label; defaults to the literal "model_error".
    kind: str = "model_error"
    # Free-form extra data; ``default_factory`` gives each instance its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
87
+
88
+
89
@dataclass
class GenerationTask:
    """Mutable record bundling a prompt, a model and a sampling configuration.

    Unlike most entities in this module, this dataclass is not frozen.
    """

    prompt: PromptRender
    model: ModelSpec
    sampling: SamplingConfig
    # Free-form extra data; ``default_factory`` gives each instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Optional reference attached to the task; ``None`` when absent.
    reference: Reference | None = None
96
+
97
+
98
@dataclass
class GenerationRecord:
    """Mutable record pairing a ``GenerationTask`` with its output or error.

    ``output`` and ``error`` have no defaults and must be passed explicitly
    (either may be ``None``).
    """

    task: GenerationTask
    output: ModelOutput | None
    error: ModelError | None
    metrics: Dict[str, Any] = field(default_factory=dict)
    # Nested records of the same type; defaults to a fresh empty list.
    attempts: List["GenerationRecord"] = field(default_factory=list)
105
+
106
+
107
@dataclass(frozen=True)
class EvaluationItem:
    """Frozen pairing of a ``GenerationRecord`` with an optional ``Reference``."""

    record: GenerationRecord
    # Required argument, but may be ``None``.
    reference: Reference | None
111
+
112
+
113
@dataclass(frozen=True)
class MetricScore:
    """Frozen record of a single metric result: metric name and float value."""

    metric_name: str
    value: float
    # Both dictionaries are free-form and default to fresh empty dicts.
    details: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
119
+
120
+
121
@dataclass
class EvaluationSummary:
    """Mutable record holding a list of ``MetricScore`` and failure strings."""

    scores: List[MetricScore]
    # Failure strings; defaults to a fresh empty list.
    failures: List[str] = field(default_factory=list)
125
+
126
+
127
@dataclass
class EvaluationRecord:
    """Mutable record of scores and failures tied to an optional sample id."""

    # Required argument, but may be ``None``.
    sample_id: str | None
    scores: List[MetricScore]
    # Failure strings; defaults to a fresh empty list.
    failures: List[str] = field(default_factory=list)
132
+
133
+
134
@dataclass
class ExperimentFailure:
    """Mutable record of a failure message tied to an optional sample id."""

    # Required argument, but may be ``None``.
    sample_id: str | None
    message: str
138
+
139
+
140
@dataclass
class ExperimentReport:
    """Mutable record bundling generation results, an evaluation report,
    failures and metadata. All four fields are required.
    """

    generation_results: list[GenerationRecord]
    # String forward reference: ``EvaluationReport`` is imported only under
    # ``TYPE_CHECKING`` in this module, so it is not available at runtime here.
    evaluation_report: "EvaluationReport"
    failures: list[ExperimentFailure]
    metadata: dict[str, object]
146
+
147
+
148
# Public names exported by ``from <this module> import *``.
__all__ = [
    "SamplingConfig",
    "ModelSpec",
    "PromptSpec",
    "PromptRender",
    "Reference",
    "ModelOutput",
    "ModelError",
    "GenerationTask",
    "GenerationRecord",
    "EvaluationItem",
    "EvaluationRecord",
    "MetricScore",
    "EvaluationSummary",
    "ExperimentFailure",
    "ExperimentReport",
]
@@ -0,0 +1,231 @@
1
+ """Serialization helpers for Themis core entities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ from typing import Any, Dict
7
+
8
+ from themis.core import entities as core_entities
9
+
10
+
11
def serialize_sampling(config: core_entities.SamplingConfig) -> Dict[str, Any]:
    """Flatten a sampling configuration into a plain dictionary.

    Args:
        config: Object exposing ``temperature``, ``top_p`` and ``max_tokens``.

    Returns:
        A new dict with exactly those three keys, in that order.
    """
    payload: Dict[str, Any] = {}
    for attr in ("temperature", "top_p", "max_tokens"):
        payload[attr] = getattr(config, attr)
    return payload
17
+
18
+
19
def deserialize_sampling(data: Dict[str, Any]) -> core_entities.SamplingConfig:
    """Rebuild a ``SamplingConfig`` from its dictionary form.

    Args:
        data: Mapping that must contain ``temperature``, ``top_p`` and
            ``max_tokens``.

    Returns:
        The reconstructed ``SamplingConfig``.

    Raises:
        KeyError: If any of the three keys is missing.
    """
    temperature = data["temperature"]
    top_p = data["top_p"]
    max_tokens = data["max_tokens"]
    return core_entities.SamplingConfig(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
25
+
26
+
27
def serialize_model_spec(spec: core_entities.ModelSpec) -> Dict[str, Any]:
    """Convert a model spec into a plain dictionary.

    The ``metadata`` dict is deep-copied so the result does not alias the
    spec. ``default_sampling`` is serialized via ``serialize_sampling`` when
    truthy, otherwise stored as ``None``.
    """
    sampling = spec.default_sampling
    sampling_payload = serialize_sampling(sampling) if sampling else None
    return {
        "identifier": spec.identifier,
        "provider": spec.provider,
        "metadata": copy.deepcopy(spec.metadata),
        "default_sampling": sampling_payload,
    }
36
+
37
+
38
def deserialize_model_spec(data: Dict[str, Any]) -> core_entities.ModelSpec:
    """Rebuild a ``ModelSpec`` from its dictionary form.

    ``identifier`` and ``provider`` are required keys (``KeyError`` if
    missing). ``metadata`` defaults to an empty dict and is deep-copied.
    A missing or falsy ``default_sampling`` entry yields ``None``.
    """
    sampling_payload = data.get("default_sampling")
    default_sampling = None
    if sampling_payload:
        default_sampling = deserialize_sampling(sampling_payload)

    metadata = copy.deepcopy(data.get("metadata", {}))
    return core_entities.ModelSpec(
        identifier=data["identifier"],
        provider=data["provider"],
        metadata=metadata,
        default_sampling=default_sampling,
    )
50
+
51
+
52
def serialize_prompt_spec(spec: core_entities.PromptSpec) -> Dict[str, Any]:
    """Convert a prompt spec into a plain dictionary.

    Returns a dict with ``name``, ``template`` and a deep copy of
    ``metadata``.
    """
    metadata_copy = copy.deepcopy(spec.metadata)
    payload: Dict[str, Any] = {"name": spec.name, "template": spec.template}
    payload["metadata"] = metadata_copy
    return payload
58
+
59
+
60
def deserialize_prompt_spec(data: Dict[str, Any]) -> core_entities.PromptSpec:
    """Rebuild a ``PromptSpec`` from its dictionary form.

    ``name`` and ``template`` are required keys (``KeyError`` if missing);
    ``metadata`` defaults to an empty dict and is deep-copied.
    """
    metadata = copy.deepcopy(data.get("metadata", {}))
    name = data["name"]
    template = data["template"]
    return core_entities.PromptSpec(name=name, template=template, metadata=metadata)
66
+
67
+
68
def serialize_prompt_render(render: core_entities.PromptRender) -> Dict[str, Any]:
    """Convert a prompt render into a plain dictionary.

    The nested spec goes through ``serialize_prompt_spec``; ``context`` and
    ``metadata`` are deep-copied so the result does not alias the render.
    """
    spec_payload = serialize_prompt_spec(render.spec)
    context_copy = copy.deepcopy(render.context)
    metadata_copy = copy.deepcopy(render.metadata)
    return {
        "spec": spec_payload,
        "text": render.text,
        "context": context_copy,
        "metadata": metadata_copy,
    }
75
+
76
+
77
def deserialize_prompt_render(data: Dict[str, Any]) -> core_entities.PromptRender:
    """Rebuild a ``PromptRender`` from its dictionary form.

    ``spec`` and ``text`` are required keys (``KeyError`` if missing);
    ``context`` and ``metadata`` default to empty dicts and are deep-copied.
    """
    spec = deserialize_prompt_spec(data["spec"])
    text = data["text"]
    context = copy.deepcopy(data.get("context", {}))
    metadata = copy.deepcopy(data.get("metadata", {}))
    return core_entities.PromptRender(
        spec=spec,
        text=text,
        context=context,
        metadata=metadata,
    )
84
+
85
+
86
def serialize_reference(
    reference: core_entities.Reference | None,
) -> Dict[str, Any] | None:
    """Convert an optional reference into a plain dictionary.

    Args:
        reference: The reference to serialize, or ``None``.

    Returns:
        ``None`` when *reference* is ``None``; otherwise a dict with the keys
        ``kind`` and ``value``. The reference's ``schema`` attribute is not
        included in the output.
    """
    if reference is None:
        return None
    # Deep-copy the value, like every other mutable payload in this module,
    # so mutating the serialized dict cannot alter the source reference.
    return {"kind": reference.kind, "value": copy.deepcopy(reference.value)}
92
+
93
+
94
def deserialize_reference(
    data: Dict[str, Any] | None,
) -> core_entities.Reference | None:
    """Rebuild an optional ``Reference`` from its dictionary form.

    Args:
        data: Mapping with a required ``kind`` key and an optional ``value``
            key, or ``None``.

    Returns:
        ``None`` when *data* is ``None``; otherwise a ``Reference`` whose
        ``value`` is ``None`` if the key is absent. ``schema`` is left at its
        default because it is not read from *data*.

    Raises:
        KeyError: If *data* lacks the ``kind`` key.
    """
    if data is None:
        return None
    # Deep-copy the value, like every other mutable payload in this module,
    # so the rebuilt reference does not share state with the input dict.
    value = copy.deepcopy(data.get("value"))
    return core_entities.Reference(kind=data["kind"], value=value)
100
+
101
+
102
def serialize_generation_task(task: core_entities.GenerationTask) -> Dict[str, Any]:
    """Convert a generation task into a plain dictionary.

    Each nested entity is delegated to its own serializer; ``metadata`` is
    deep-copied so the result does not alias the task.
    """
    payload: Dict[str, Any] = {}
    payload["prompt"] = serialize_prompt_render(task.prompt)
    payload["model"] = serialize_model_spec(task.model)
    payload["sampling"] = serialize_sampling(task.sampling)
    payload["metadata"] = copy.deepcopy(task.metadata)
    payload["reference"] = serialize_reference(task.reference)
    return payload
110
+
111
+
112
def deserialize_generation_task(data: Dict[str, Any]) -> core_entities.GenerationTask:
    """Rebuild a ``GenerationTask`` from its dictionary form.

    ``prompt``, ``model`` and ``sampling`` are required keys (``KeyError`` if
    missing). ``metadata`` defaults to an empty dict and is deep-copied;
    a missing ``reference`` entry yields ``None``.
    """
    prompt = deserialize_prompt_render(data["prompt"])
    model = deserialize_model_spec(data["model"])
    sampling = deserialize_sampling(data["sampling"])
    metadata = copy.deepcopy(data.get("metadata", {}))
    reference = deserialize_reference(data.get("reference"))
    return core_entities.GenerationTask(
        prompt=prompt,
        model=model,
        sampling=sampling,
        metadata=metadata,
        reference=reference,
    )
120
+
121
+
122
def serialize_generation_record(
    record: core_entities.GenerationRecord,
) -> Dict[str, Any]:
    """Convert a generation record, including nested attempts, to a plain dict.

    Args:
        record: The record to serialize.

    Returns:
        A dict with the keys ``task``, ``output``, ``error``, ``metrics`` and
        ``attempts``. ``output`` and ``error`` are ``None`` when the record
        has none. ``attempts`` is built by calling this function recursively
        on every entry of ``record.attempts``.
    """
    output = record.output
    error = record.error

    output_payload = None
    if output is not None:
        output_payload = {
            "text": output.text,
            "raw": output.raw,
            # Token usage used to be dropped here, so it was lost whenever a
            # record was written out; keep it in the serialized output.
            "usage": copy.deepcopy(output.usage),
        }

    error_payload = None
    if error is not None:
        error_payload = {
            "message": error.message,
            "kind": error.kind,
            "details": copy.deepcopy(error.details),
        }

    return {
        "task": serialize_generation_task(record.task),
        "output": output_payload,
        "error": error_payload,
        "metrics": copy.deepcopy(record.metrics),
        "attempts": [
            serialize_generation_record(attempt) for attempt in record.attempts
        ],
    }
145
+
146
+
147
def deserialize_generation_record(
    data: Dict[str, Any],
) -> core_entities.GenerationRecord:
    """Rebuild a ``GenerationRecord``, including nested attempts, from a dict.

    Args:
        data: Mapping with a required ``task`` key and optional ``output``,
            ``error``, ``metrics`` and ``attempts`` keys.

    Returns:
        The reconstructed record. A missing or falsy ``output`` / ``error``
        entry yields ``None`` for that field. ``attempts`` entries are rebuilt
        by calling this function recursively.

    Raises:
        KeyError: If ``task`` is missing, if a present output lacks ``text``,
            or if a present error lacks ``message``.
    """
    output_data = data.get("output")
    error_data = data.get("error")

    output = None
    if output_data:
        output = core_entities.ModelOutput(
            text=output_data["text"],
            raw=output_data.get("raw"),
            # Restore token usage when present. ``.get`` keeps payloads that
            # have no ``usage`` key loadable (usage stays ``None``).
            usage=copy.deepcopy(output_data.get("usage")),
        )

    error = None
    if error_data:
        error = core_entities.ModelError(
            message=error_data["message"],
            kind=error_data.get("kind", "model_error"),
            details=copy.deepcopy(error_data.get("details", {})),
        )

    return core_entities.GenerationRecord(
        task=deserialize_generation_task(data["task"]),
        output=output,
        error=error,
        metrics=copy.deepcopy(data.get("metrics", {})),
        attempts=[
            deserialize_generation_record(attempt)
            for attempt in data.get("attempts", [])
        ],
    )
172
+
173
+
174
def serialize_metric_score(score: core_entities.MetricScore) -> Dict[str, Any]:
    """Convert a metric score into a plain dictionary.

    Returns a dict with ``metric_name``, ``value`` and deep copies of
    ``details`` and ``metadata``.
    """
    details_copy = copy.deepcopy(score.details)
    metadata_copy = copy.deepcopy(score.metadata)
    payload: Dict[str, Any] = {
        "metric_name": score.metric_name,
        "value": score.value,
    }
    payload["details"] = details_copy
    payload["metadata"] = metadata_copy
    return payload
181
+
182
+
183
def deserialize_metric_score(data: Dict[str, Any]) -> core_entities.MetricScore:
    """Rebuild a ``MetricScore`` from its dictionary form.

    ``metric_name`` and ``value`` are required keys (``KeyError`` if
    missing); ``details`` and ``metadata`` default to empty dicts and are
    deep-copied.
    """
    metric_name = data["metric_name"]
    value = data["value"]
    details = copy.deepcopy(data.get("details", {}))
    metadata = copy.deepcopy(data.get("metadata", {}))
    return core_entities.MetricScore(
        metric_name=metric_name,
        value=value,
        details=details,
        metadata=metadata,
    )
190
+
191
+
192
def serialize_evaluation_record(
    record: core_entities.EvaluationRecord,
) -> Dict[str, Any]:
    """Convert an evaluation record into a plain dictionary.

    Each score goes through ``serialize_metric_score``; ``failures`` is
    copied into a new list so the result does not alias the record.
    """
    serialized_scores = []
    for score in record.scores:
        serialized_scores.append(serialize_metric_score(score))
    failures_copy = [*record.failures]
    return {
        "sample_id": record.sample_id,
        "scores": serialized_scores,
        "failures": failures_copy,
    }
200
+
201
+
202
def deserialize_evaluation_record(
    data: Dict[str, Any],
) -> core_entities.EvaluationRecord:
    """Rebuild an ``EvaluationRecord`` from its dictionary form.

    Every key is optional: ``sample_id`` defaults to ``None``, while
    ``scores`` and ``failures`` default to empty lists. Scores are rebuilt
    with ``deserialize_metric_score``; failures are copied into a new list.
    """
    scores = []
    for score_payload in data.get("scores", []):
        scores.append(deserialize_metric_score(score_payload))
    failures = [*data.get("failures", [])]
    return core_entities.EvaluationRecord(
        sample_id=data.get("sample_id"),
        scores=scores,
        failures=failures,
    )
210
+
211
+
212
# Public names exported by ``from <this module> import *``; each entity has a
# serialize_* / deserialize_* pair.
__all__ = [
    "serialize_generation_record",
    "deserialize_generation_record",
    "serialize_generation_task",
    "deserialize_generation_task",
    "serialize_evaluation_record",
    "deserialize_evaluation_record",
    "serialize_metric_score",
    "deserialize_metric_score",
    "serialize_sampling",
    "deserialize_sampling",
    "serialize_model_spec",
    "deserialize_model_spec",
    "serialize_prompt_spec",
    "deserialize_prompt_spec",
    "serialize_prompt_render",
    "deserialize_prompt_render",
    "serialize_reference",
    "deserialize_reference",
]