themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
1
+ """Distributed tracing and observability utilities.
2
+
3
+ This module provides lightweight span-based tracing for understanding
4
+ experiment execution performance and behavior. Tracing is opt-in and
5
+ disabled by default for minimal performance impact.
6
+
7
+ Examples:
8
+ # Enable tracing
9
+ tracing.enable()
10
+
11
+ # Use context manager for automatic span management
12
+ with tracing.span("my_operation", task_id="123"):
13
+ # ... do work ...
14
+ with tracing.span("subprocess"):
15
+ # ... nested work ...
16
+ pass
17
+
18
+ # Get trace for analysis
19
+ trace = tracing.get_trace()
20
+ print(f"Total time: {trace.duration_ms()}ms")
21
+
22
+ # Export trace
23
+ tracing.export_json("trace.json")
24
+ tracing.disable()
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ import threading
31
+ import time
32
+ from contextlib import contextmanager
33
+ from dataclasses import dataclass, field
34
+ from typing import Any, Iterator
35
+
36
+
37
@dataclass
class Span:
    """A single traced operation with timing, metadata, and nested children.

    Spans form a tree: every span holds a back-reference to its parent and a
    list of the spans opened while it was active. The tree can be searched
    by name, summarized per name, or converted to plain dictionaries for
    serialization.
    """

    # Operation name; also the grouping key used by get_summary().
    name: str
    # Start timestamp in seconds. duration_ms() measures open spans against
    # time.perf_counter(), so this should come from the same clock.
    start_time: float
    # End timestamp in seconds, or None while the span is still open.
    end_time: float | None = None
    # Arbitrary key/value annotations attached when the span was created.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Back-reference to the enclosing span. It is excluded from the generated
    # __repr__ and __eq__: parent -> children -> parent is a cycle, so
    # comparing two equal trees would otherwise recurse until RecursionError,
    # and the repr of a leaf would print its whole ancestry.
    parent: Span | None = field(default=None, repr=False, compare=False)
    # Spans opened while this one was the active span, in creation order.
    children: list[Span] = field(default_factory=list)
    # Identifier derived from the wall-clock nanosecond timestamp at
    # construction. NOTE(review): two spans created within one clock tick
    # would share an id -- confirm whether callers rely on uniqueness.
    span_id: str = field(default_factory=lambda: str(time.time_ns()))

    def duration_ms(self) -> float:
        """Calculate span duration in milliseconds.

        Returns:
            Duration in milliseconds for a closed span, or the time elapsed
            so far (measured with ``time.perf_counter()``) if it is still open.
        """
        end = time.perf_counter() if self.end_time is None else self.end_time
        return (end - self.start_time) * 1000

    def is_complete(self) -> bool:
        """Check if span has been closed (i.e. ``end_time`` is set)."""
        return self.end_time is not None

    def to_dict(self) -> dict[str, Any]:
        """Convert this span and all descendants to nested dictionaries.

        Returns:
            Dictionary with ``name``, ``span_id``, ``start_time``,
            ``end_time``, ``duration_ms`` (``None`` while the span is open),
            ``metadata`` and a ``children`` list of the same shape. The
            parent reference is not included.
        """
        return {
            "name": self.name,
            "span_id": self.span_id,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration_ms": self.duration_ms() if self.is_complete() else None,
            "metadata": self.metadata,
            "children": [child.to_dict() for child in self.children],
        }

    def find_spans(self, name: str) -> list[Span]:
        """Find all spans with given name in this span and descendants.

        Args:
            name: Span name to search for

        Returns:
            List of matching spans in depth-first, pre-order sequence
        """
        matches = [self] if self.name == name else []
        for child in self.children:
            matches.extend(child.find_spans(name))
        return matches

    def get_summary(self) -> dict[str, Any]:
        """Get summary statistics for this span tree.

        Only completed spans contribute timings. Every entry always carries
        the same keys; a name that has no completed span yet (for example a
        root span that is still open) reports zeros rather than ``inf`` or a
        missing ``avg_ms``.

        Returns:
            Dictionary mapping each span name to ``count``, ``total_ms``,
            ``min_ms``, ``max_ms`` and ``avg_ms``
        """
        summary: dict[str, dict[str, Any]] = {}

        # Explicit stack instead of recursion; children are pushed reversed
        # so names are first seen in the same pre-order as a recursive walk.
        pending = [self]
        while pending:
            node = pending.pop()
            stats = summary.get(node.name)
            if stats is None:
                stats = summary[node.name] = {
                    "count": 0,
                    "total_ms": 0.0,
                    "min_ms": float("inf"),
                    "max_ms": 0.0,
                }

            if node.is_complete():
                elapsed = node.duration_ms()
                stats["count"] += 1
                stats["total_ms"] += elapsed
                stats["min_ms"] = min(stats["min_ms"], elapsed)
                stats["max_ms"] = max(stats["max_ms"], elapsed)

            pending.extend(reversed(node.children))

        for stats in summary.values():
            if stats["count"]:
                stats["avg_ms"] = stats["total_ms"] / stats["count"]
            else:
                # No completed span under this name: replace the inf
                # sentinel and keep the schema uniform for consumers.
                stats["min_ms"] = 0.0
                stats["avg_ms"] = 0.0

        return summary
133
+
134
+
135
class TracingContext:
    """Per-thread span bookkeeping behind a single on/off switch.

    Each thread gets its own ``root`` and ``current`` span (stored in a
    ``threading.local``), so spans opened on one thread never become
    children of spans on another. The enabled flag itself is a plain
    attribute shared by all threads using this context.
    """

    def __init__(self):
        self._local = threading.local()
        self._enabled = False

    def enable(self) -> None:
        """Turn span recording on."""
        self._enabled = True

    def disable(self) -> None:
        """Turn span recording off."""
        self._enabled = False

    def is_enabled(self) -> bool:
        """Report whether span recording is currently on."""
        return self._enabled

    def _get_state(self):
        """Return this thread's state object, creating its slots on first use."""
        local = self._local
        if not hasattr(local, "root"):
            local.root = None
            local.current = None
        return local

    def get_root(self) -> Span | None:
        """Return the most recent top-level span recorded on this thread."""
        return self._get_state().root

    def get_current(self) -> Span | None:
        """Return the innermost span that is still open on this thread."""
        return self._get_state().current

    @contextmanager
    def span(self, name: str, **metadata) -> Iterator[Span | None]:
        """Open a span for the duration of a ``with`` block.

        Args:
            name: Name of the span
            **metadata: Additional metadata to attach to span

        Yields:
            The new span when tracing is enabled, ``None`` otherwise
        """
        if self._enabled:
            state = self._get_state()
            enclosing = state.current
            opened = Span(
                name=name,
                start_time=time.perf_counter(),
                metadata=metadata,
                parent=enclosing,
            )

            if enclosing is None:
                # Nothing is open on this thread: the new span becomes the root.
                state.root = opened
            else:
                enclosing.children.append(opened)

            state.current = opened
            try:
                yield opened
            finally:
                # Always stamp the end time and step back out to the
                # enclosing span, even when the body raised.
                opened.end_time = time.perf_counter()
                state.current = enclosing
        else:
            yield None

    def reset(self) -> None:
        """Forget the root and current span for the calling thread."""
        state = self._get_state()
        state.current = None
        state.root = None
221
+
222
+
223
# Single module-wide context shared by the public helper functions.
_global_context = TracingContext()
225
+
226
+
227
+ # Public API functions
228
def enable() -> None:
    """Turn tracing on for the module-wide context."""
    _global_context.enable()
231
+
232
+
233
def disable() -> None:
    """Turn tracing off for the module-wide context."""
    _global_context.disable()
236
+
237
+
238
def is_enabled() -> bool:
    """Report whether the module-wide context is currently tracing."""
    return _global_context.is_enabled()
241
+
242
+
243
@contextmanager
def span(name: str, **metadata) -> Iterator[Span | None]:
    """Open a traced span on the module-wide context.

    Main entry point for instrumenting code. Use it as a context manager:

        with tracing.span("my_operation", task_id="123"):
            # ... do work ...
            pass

    Args:
        name: Name of the span
        **metadata: Additional metadata to attach to span

    Yields:
        Span object if tracing is enabled, None otherwise
    """
    with _global_context.span(name, **metadata) as active_span:
        yield active_span
262
+
263
+
264
def get_trace() -> Span | None:
    """Return the root span recorded for the calling thread.

    Returns:
        Root span, or None if no trace exists
    """
    root = _global_context.get_root()
    return root
271
+
272
+
273
def reset() -> None:
    """Discard the calling thread's recorded trace on the module-wide context."""
    _global_context.reset()
276
+
277
+
278
def export_json(filepath: str, indent: int = 2) -> None:
    """Export current trace to JSON file.

    Args:
        filepath: Path to write JSON file
        indent: JSON indentation level (default: 2)

    Raises:
        ValueError: If no trace exists for the current thread
        TypeError: If span metadata contains a value that is not JSON
            serializable; the target file is left untouched in that case
    """
    trace = get_trace()
    if trace is None:
        raise ValueError("No trace to export")

    # Serialize before opening the file: opening with "w" truncates at once,
    # so a serialization failure must not destroy an existing export or
    # leave a partial document behind.
    payload = json.dumps(trace.to_dict(), indent=indent)

    # Explicit encoding keeps the output independent of the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(payload)
291
+
292
+
293
def get_summary() -> dict[str, Any]:
    """Summarize the calling thread's trace by span name.

    Returns:
        Dictionary with timing statistics by span name

    Raises:
        ValueError: If no trace exists
    """
    root = get_trace()
    if root is not None:
        return root.get_summary()

    raise ValueError("No trace to summarize")
307
+
308
+
309
# Names exported by ``from ... import *``.
__all__ = [
    # Types
    "Span",
    "TracingContext",
    # Switches
    "enable",
    "disable",
    "is_enabled",
    # Recording and inspection
    "span",
    "get_trace",
    "reset",
    # Reporting
    "export_json",
    "get_summary",
]