themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/utils/tracing.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""Distributed tracing and observability utilities.
|
|
2
|
+
|
|
3
|
+
This module provides lightweight span-based tracing for understanding
|
|
4
|
+
experiment execution performance and behavior. Tracing is opt-in and
|
|
5
|
+
disabled by default for minimal performance impact.
|
|
6
|
+
|
|
7
|
+
Examples:
|
|
8
|
+
# Enable tracing
|
|
9
|
+
tracing.enable()
|
|
10
|
+
|
|
11
|
+
# Use context manager for automatic span management
|
|
12
|
+
with tracing.span("my_operation", task_id="123"):
|
|
13
|
+
# ... do work ...
|
|
14
|
+
with tracing.span("subprocess"):
|
|
15
|
+
# ... nested work ...
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
# Get trace for analysis
|
|
19
|
+
trace = tracing.get_trace()
|
|
20
|
+
print(f"Total time: {trace.duration_ms()}ms")
|
|
21
|
+
|
|
22
|
+
# Export trace
|
|
23
|
+
tracing.export_json("trace.json")
|
|
24
|
+
tracing.disable()
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
import threading
|
|
31
|
+
import time
|
|
32
|
+
from contextlib import contextmanager
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from typing import Any, Iterator
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _new_span_id() -> str:
    """Return a practically collision-free identifier for a new span."""
    # Imported locally so this helper does not rely on a module-level import.
    import uuid

    return uuid.uuid4().hex


@dataclass
class Span:
    """Represents a traced operation with timing and metadata.

    Spans can be nested to create a tree of operations showing
    how time is spent during experiment execution.

    Attributes:
        name: Label of the traced operation.
        start_time: Clock reading taken when the span was opened. Open spans
            are measured against ``time.perf_counter()``, so this is expected
            to come from the same clock.
        end_time: Clock reading taken when the span was closed, or ``None``
            while the span is still open.
        metadata: Arbitrary key/value pairs attached to the span.
        parent: Enclosing span, or ``None`` for a root span. Excluded from
            equality and ``repr`` because parent and children reference each
            other; comparing through the back-reference would recurse forever.
        children: Spans opened while this span was the current one.
        span_id: Unique identifier of the span.
    """

    name: str
    start_time: float
    end_time: float | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    parent: Span | None = field(default=None, repr=False, compare=False)
    children: list[Span] = field(default_factory=list)
    span_id: str = field(default_factory=_new_span_id)

    def duration_ms(self) -> float:
        """Calculate span duration in milliseconds.

        Returns:
            Duration in milliseconds, or time elapsed so far if span not closed
        """
        if self.end_time is None:
            return (time.perf_counter() - self.start_time) * 1000
        return (self.end_time - self.start_time) * 1000

    def is_complete(self) -> bool:
        """Check if span has been closed."""
        return self.end_time is not None

    def to_dict(self) -> dict[str, Any]:
        """Convert span to dictionary for serialization.

        The parent link is intentionally omitted so the result is a plain
        tree; ``duration_ms`` is ``None`` for spans that are still open.
        """
        return {
            "name": self.name,
            "span_id": self.span_id,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration_ms": self.duration_ms() if self.is_complete() else None,
            "metadata": self.metadata,
            "children": [child.to_dict() for child in self.children],
        }

    def find_spans(self, name: str) -> list[Span]:
        """Find all spans with given name in this span and descendants.

        Args:
            name: Span name to search for

        Returns:
            List of matching spans in depth-first, pre-order sequence
        """
        matches: list[Span] = []
        if self.name == name:
            matches.append(self)

        for child in self.children:
            matches.extend(child.find_spans(name))

        return matches

    def get_summary(self) -> dict[str, Any]:
        """Get summary statistics for this span tree.

        Only completed spans contribute to the timings. Every entry always
        carries the keys ``count``, ``total_ms``, ``min_ms``, ``max_ms`` and
        ``avg_ms``; a name that only occurs on still-open spans is reported
        with ``count == 0`` and all timings set to ``0.0``.

        Returns:
            Dictionary with timing statistics by span name
        """
        summary: dict[str, dict[str, Any]] = {}

        def collect(span: Span) -> None:
            stats = summary.setdefault(
                span.name,
                {
                    "count": 0,
                    "total_ms": 0.0,
                    "min_ms": float("inf"),
                    "max_ms": 0.0,
                },
            )
            if span.is_complete():
                duration = span.duration_ms()
                stats["count"] += 1
                stats["total_ms"] += duration
                stats["min_ms"] = min(stats["min_ms"], duration)
                stats["max_ms"] = max(stats["max_ms"], duration)

            for child in span.children:
                collect(child)

        collect(self)

        for stats in summary.values():
            if stats["count"] > 0:
                stats["avg_ms"] = stats["total_ms"] / stats["count"]
            else:
                # No completed span under this name: replace the "inf"
                # sentinel and still provide avg_ms so consumers can rely on
                # a fixed, JSON-safe set of keys.
                stats["min_ms"] = 0.0
                stats["avg_ms"] = 0.0

        return summary
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class TracingContext:
    """Per-thread span bookkeeping behind a shared on/off switch.

    The root span and the currently open span are stored in a
    ``threading.local`` object, so each thread builds its own span tree.
    The enabled flag is a plain instance attribute shared by all threads.
    """

    def __init__(self):
        self._local = threading.local()
        self._enabled = False

    def enable(self) -> None:
        """Start recording spans."""
        self._enabled = True

    def disable(self) -> None:
        """Stop recording spans."""
        self._enabled = False

    def is_enabled(self) -> bool:
        """Return True while span recording is switched on."""
        return self._enabled

    def _get_state(self):
        """Return this thread's state holder, creating its slots on first use."""
        local = self._local
        try:
            local.root
        except AttributeError:
            # First access from this thread: no trace has been recorded yet.
            local.root = None
            local.current = None
        return local

    def get_root(self) -> Span | None:
        """Return the calling thread's root span, if any."""
        return self._get_state().root

    def get_current(self) -> Span | None:
        """Return the span currently open on the calling thread, if any."""
        return self._get_state().current

    @contextmanager
    def span(self, name: str, **metadata) -> Iterator[Span | None]:
        """Open a span for the duration of a ``with`` block.

        Args:
            name: Name of the span
            **metadata: Additional metadata to attach to span

        Yields:
            The new Span while tracing is enabled, otherwise None
        """
        if self._enabled:
            state = self._get_state()
            enclosing = state.current
            opened = Span(
                name=name,
                start_time=time.perf_counter(),
                metadata=metadata,
                parent=enclosing,
            )

            if enclosing is None:
                # Nothing is open on this thread, so this span becomes the root.
                state.root = opened
            else:
                enclosing.children.append(opened)

            state.current = opened
            try:
                yield opened
            finally:
                # Always stamp the end time and hand control back to the
                # enclosing span, even when the body raised.
                opened.end_time = time.perf_counter()
                state.current = enclosing
        else:
            yield None

    def reset(self) -> None:
        """Forget the calling thread's root and current span."""
        state = self._get_state()
        state.current = None
        state.root = None
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# Module-wide tracing context used by the public helper functions below.
_global_context = TracingContext()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# Public API functions
|
|
228
|
+
def enable() -> None:
    """Switch span recording on for the module-wide tracing context."""
    context = _global_context
    context.enable()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def disable() -> None:
    """Switch span recording off for the module-wide tracing context."""
    context = _global_context
    context.disable()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def is_enabled() -> bool:
    """Return True while the module-wide tracing context is recording spans."""
    enabled = _global_context.is_enabled()
    return enabled
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@contextmanager
def span(name: str, **metadata) -> Iterator[Span | None]:
    """Open a span on the module-wide tracing context.

    Intended to be used as a context manager:

        with tracing.span("my_operation", task_id="123"):
            # ... do work ...
            pass

    Args:
        name: Name of the span
        **metadata: Additional metadata to attach to span

    Yields:
        The new Span while tracing is enabled, otherwise None
    """
    manager = _global_context.span(name, **metadata)
    with manager as active:
        yield active
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def get_trace() -> Span | None:
    """Fetch the root span recorded for the calling thread.

    Returns:
        The root span, or None when no trace has been recorded
    """
    root = _global_context.get_root()
    return root
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def reset() -> None:
    """Discard the trace recorded so far for the calling thread."""
    context = _global_context
    context.reset()
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def export_json(filepath: str, indent: int = 2) -> None:
    """Export current trace to JSON file.

    Args:
        filepath: Path to write JSON file
        indent: JSON indentation level (default: 2)

    Raises:
        ValueError: If no trace exists for the current thread
        TypeError: If span metadata contains a value that is not JSON
            serializable; the target file is left untouched in that case
    """
    trace = get_trace()
    if trace is None:
        raise ValueError("No trace to export")

    # Serialize before opening the file: opening in "w" mode truncates it, so
    # a serialization failure would otherwise leave a partial or empty file.
    payload = json.dumps(trace.to_dict(), indent=indent)

    # Explicit encoding keeps the output independent of the platform locale.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(payload)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def get_summary() -> dict[str, Any]:
    """Summarize the calling thread's trace by span name.

    Returns:
        Dictionary with timing statistics by span name

    Raises:
        ValueError: If no trace exists
    """
    root = get_trace()
    if root is not None:
        return root.get_summary()
    raise ValueError("No trace to summarize")
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# Names exported by ``from ... import *``: the data types first, then the
# module-level helper functions.
__all__ = [
    "Span",
    "TracingContext",
    "enable",
    "disable",
    "is_enabled",
    "span",
    "get_trace",
    "reset",
    "export_json",
    "get_summary",
]
|