themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
1
+ """Distributed tracing and observability utilities.
2
+
3
+ This module provides lightweight span-based tracing for understanding
4
+ experiment execution performance and behavior. Tracing is opt-in and
5
+ disabled by default for minimal performance impact.
6
+
7
+ Examples:
8
+ # Enable tracing
9
+ tracing.enable()
10
+
11
+ # Use context manager for automatic span management
12
+ with tracing.span("my_operation", task_id="123"):
13
+ # ... do work ...
14
+ with tracing.span("subprocess"):
15
+ # ... nested work ...
16
+ pass
17
+
18
+ # Get trace for analysis
19
+ trace = tracing.get_trace()
20
+ print(f"Total time: {trace.duration_ms()}ms")
21
+
22
+ # Export trace
23
+ tracing.export_json("trace.json")
24
+ tracing.disable()
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ import threading
31
+ import time
32
+ from contextlib import contextmanager
33
+ from dataclasses import dataclass, field
34
+ from typing import Any, Iterator
35
+
36
+
37
@dataclass
class Span:
    """A single traced operation with timing data and free-form metadata.

    Spans nest: every span keeps a reference to its parent and a list of
    its children, so a root span describes a whole tree of operations.

    Attributes:
        name: Label of the operation.
        start_time: Start timestamp. ``duration_ms`` compares it against
            ``time.perf_counter()`` for open spans, so it is expected to be
            a ``perf_counter`` reading.
        end_time: End timestamp on the same clock, or ``None`` while the
            span is still open.
        metadata: Arbitrary key/value data attached to the span.
        parent: Enclosing span, or ``None`` for a root span.
        children: Spans opened while this span was the active one.
        span_id: Identifier derived from ``time.time_ns()`` at creation.
    """

    name: str
    start_time: float
    end_time: float | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    # The parent link is a back-reference into the same tree that
    # ``children`` already describes. Leaving it in the generated
    # ``__eq__`` makes the comparison of two structurally equal child spans
    # bounce parent -> children -> child forever (RecursionError), and
    # leaving it in ``__repr__`` prints the whole ancestor chain for every
    # child. Equality and repr therefore walk the tree downwards only.
    parent: Span | None = field(default=None, repr=False, compare=False)
    children: list[Span] = field(default_factory=list)
    span_id: str = field(default_factory=lambda: str(time.time_ns()))

    def duration_ms(self) -> float:
        """Calculate span duration in milliseconds.

        Returns:
            Duration in milliseconds, or the time elapsed so far if the
            span has not been closed yet.
        """
        if self.end_time is None:
            return (time.perf_counter() - self.start_time) * 1000
        return (self.end_time - self.start_time) * 1000

    def is_complete(self) -> bool:
        """Check if span has been closed."""
        return self.end_time is not None

    def to_dict(self) -> dict[str, Any]:
        """Convert the span and all its descendants to plain dictionaries.

        Returns:
            Dictionary with the span's fields and a nested ``children``
            list. ``duration_ms`` is ``None`` for spans that are still
            open. The parent link is not included.
        """
        return {
            "name": self.name,
            "span_id": self.span_id,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration_ms": self.duration_ms() if self.is_complete() else None,
            "metadata": self.metadata,
            "children": [child.to_dict() for child in self.children],
        }

    def find_spans(self, name: str) -> list[Span]:
        """Find all spans with given name in this span and descendants.

        Args:
            name: Span name to search for

        Returns:
            List of matching spans in depth-first, pre-order sequence
        """
        matches: list[Span] = [self] if self.name == name else []
        for child in self.children:
            matches.extend(child.find_spans(name))
        return matches

    def get_summary(self) -> dict[str, Any]:
        """Get summary statistics for this span tree.

        Only completed spans contribute to the numbers. A name that occurs
        solely on still-open spans is reported with a count of zero and all
        timings set to ``0.0``.

        Returns:
            Dictionary mapping each span name to ``count``, ``total_ms``,
            ``min_ms``, ``max_ms`` and ``avg_ms``.
        """
        summary: dict[str, dict[str, Any]] = {}

        def collect(span: Span) -> None:
            stats = summary.setdefault(
                span.name,
                {
                    "count": 0,
                    "total_ms": 0.0,
                    "min_ms": float("inf"),
                    "max_ms": 0.0,
                },
            )
            if span.is_complete():
                duration = span.duration_ms()
                stats["count"] += 1
                stats["total_ms"] += duration
                stats["min_ms"] = min(stats["min_ms"], duration)
                stats["max_ms"] = max(stats["max_ms"], duration)

            for child in span.children:
                collect(child)

        collect(self)

        for stats in summary.values():
            if stats["count"] > 0:
                stats["avg_ms"] = stats["total_ms"] / stats["count"]
            else:
                # Nothing finished under this name: replace the ``inf``
                # placeholder (which is not valid JSON) and still provide
                # ``avg_ms`` so every entry has the same keys.
                stats["min_ms"] = 0.0
                stats["avg_ms"] = 0.0

        return summary
133
+
134
+
135
class TracingContext:
    """Per-thread span bookkeeping behind a single on/off switch.

    The root span and the currently open span are kept on a
    ``threading.local`` object, so each thread builds its own tree. The
    enabled flag is an ordinary instance attribute.
    """

    def __init__(self):
        self._enabled = False
        self._local = threading.local()

    def enable(self) -> None:
        """Switch tracing on."""
        self._enabled = True

    def disable(self) -> None:
        """Switch tracing off."""
        self._enabled = False

    def is_enabled(self) -> bool:
        """Return the current value of the enabled flag."""
        return self._enabled

    def _get_state(self):
        """Return the thread-local state, creating its slots on first use."""
        local_state = self._local
        try:
            local_state.root
        except AttributeError:
            # First access from this thread: no trace recorded yet.
            local_state.root = None
            local_state.current = None
        return local_state

    def get_root(self) -> Span | None:
        """Return the root span recorded for the calling thread, if any."""
        state = self._get_state()
        return state.root

    def get_current(self) -> Span | None:
        """Return the span that is currently open on the calling thread."""
        state = self._get_state()
        return state.current

    @contextmanager
    def span(self, name: str, **metadata) -> Iterator[Span | None]:
        """Open a span for the duration of a ``with`` block.

        Args:
            name: Name of the span
            **metadata: Additional metadata to attach to span

        Yields:
            The new span while tracing is enabled, otherwise None
        """
        if self._enabled:
            state = self._get_state()
            enclosing = state.current
            opened = Span(
                name=name,
                start_time=time.perf_counter(),
                metadata=metadata,
                parent=enclosing,
            )

            if enclosing is None:
                # Nothing is open on this thread, so this span becomes the root.
                state.root = opened
            else:
                enclosing.children.append(opened)

            state.current = opened
            try:
                yield opened
            finally:
                # Runs on normal exit and on exceptions alike: stamp the
                # end time, then hand "current" back to the enclosing span.
                opened.end_time = time.perf_counter()
                state.current = enclosing
        else:
            yield None

    def reset(self) -> None:
        """Forget the root and current span of the calling thread."""
        state = self._get_state()
        state.current = None
        state.root = None
221
+
222
+
223
# Single module-wide context used by the public functions of this module
_global_context: TracingContext = TracingContext()
225
+
226
+
227
+ # Public API functions
228
def enable() -> None:
    """Switch tracing on for the module-level context."""
    _global_context.enable()
231
+
232
+
233
def disable() -> None:
    """Switch tracing off for the module-level context."""
    _global_context.disable()
236
+
237
+
238
def is_enabled() -> bool:
    """Report whether the module-level context has tracing switched on."""
    enabled = _global_context.is_enabled()
    return enabled
241
+
242
+
243
@contextmanager
def span(name: str, **metadata) -> Iterator[Span | None]:
    """Open a span on the module-level tracing context.

    Intended to be used as a context manager:

        with tracing.span("my_operation", task_id="123"):
            # ... do work ...
            pass

    Args:
        name: Name of the span
        **metadata: Additional metadata to attach to span

    Yields:
        The value yielded by the module-level context's ``span``: a Span
        object if tracing is enabled, None otherwise
    """
    with _global_context.span(name, **metadata) as active_span:
        yield active_span
262
+
263
+
264
def get_trace() -> Span | None:
    """Fetch the root span of the calling thread's trace.

    Returns:
        The root span held by the module-level context, or None when no
        trace exists
    """
    root = _global_context.get_root()
    return root
271
+
272
+
273
def reset() -> None:
    """Clear the calling thread's tracing state on the module-level context."""
    _global_context.reset()
276
+
277
+
278
def export_json(filepath: str, indent: int = 2) -> None:
    """Export current trace to JSON file.

    Args:
        filepath: Path to write JSON file; an existing file is overwritten
        indent: JSON indentation level (default: 2)

    Raises:
        ValueError: If no trace exists
        TypeError: If the trace contains values (for example in span
            metadata) that the JSON encoder cannot serialize
    """
    trace = get_trace()
    if trace is None:
        raise ValueError("No trace to export")

    # Serialize before touching the file: if encoding fails, the destination
    # is neither truncated nor left holding half a document.
    payload = json.dumps(trace.to_dict(), indent=indent)

    # Pin the encoding so the output does not depend on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(payload)
291
+
292
+
293
def get_summary() -> dict[str, Any]:
    """Summarize the timing of the current trace.

    Returns:
        The result of calling ``get_summary()`` on the root span: timing
        statistics keyed by span name

    Raises:
        ValueError: If no trace exists
    """
    root = get_trace()
    if root is not None:
        return root.get_summary()
    raise ValueError("No trace to summarize")
307
+
308
+
309
# Names exported by ``from <module> import *``.
__all__ = [
    # Classes
    "Span",
    "TracingContext",
    # On/off switch
    "enable",
    "disable",
    "is_enabled",
    # Recording and inspecting a trace
    "span",
    "get_trace",
    "reset",
    # Reporting
    "export_json",
    "get_summary",
]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: themis-eval
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Lightweight evaluation platform for LLM experiments
5
5
  Author: Pittawat Taveekitworachai
6
6
  License: MIT
@@ -0,0 +1,134 @@
1
+ themis/__init__.py,sha256=3bKi1PneI5PaTaDPXsArCVvfinkLFDRU91lvZIeg7V0,281
2
+ themis/_version.py,sha256=tc4TJqWVv2dx4UzItLqneMPaG7vM8CQFDNW5pJgNoKg,345
3
+ themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ themis/cli/__init__.py,sha256=An2DrMHRfmiee5BYJ6TGqvbG7sXWECjjyvEgcoGJ7cE,99
5
+ themis/cli/__main__.py,sha256=df2pOghoSuq18hZmVVikmGhaFSaRe-jeDOnrsu-1QDM,135
6
+ themis/cli/main.py,sha256=NSayD8zwH5NJjC_qaTwqhDR7_8PO3I2n1dPX-fBYa-A,3131
7
+ themis/cli/new_project.py,sha256=D8asV4QbjgQNYvmXt_WhK4nPM-wKHe_K0VJiBdgtO_E,1121
8
+ themis/cli/utils.py,sha256=NAPyFiXspfpx5vBxA8aEcOMmWEDyt-R8ywoHo_8Nr4A,1307
9
+ themis/cli/commands/__init__.py,sha256=CTx7su3qTtq96qxLNclDsE6UM_86NhaS01M9-x9wFiw,287
10
+ themis/cli/commands/benchmarks.py,sha256=HjCfmhu1FYUEGlaxUZGIZs0I-2gNG4tn_kF29NWPGcc,7885
11
+ themis/cli/commands/comparison.py,sha256=Ki1_MMFFR4vBJkZTeIMWLh-_zdjbtJZurI3YyrEs4vw,12364
12
+ themis/cli/commands/config_commands.py,sha256=eL6GtdIllOIHo8GbNN2jOqLn5VUPBuqhnro9ooPxDog,7387
13
+ themis/cli/commands/cost.py,sha256=fFdF6hKIzsbPsyrJ1nt6-2m43PpVGUj8jx5T90tBTNo,7233
14
+ themis/cli/commands/demo.py,sha256=akQqjG-hbUDfeB3bI8K4F5-S0ibJqhflGBFQ5nvdUgE,2135
15
+ themis/cli/commands/info.py,sha256=9maOaw-TFiBpuVhaqlMKukGuZ_zgESetqbMQ1Qdvjxs,2515
16
+ themis/cli/commands/leaderboard.py,sha256=AVvsYIwZAY18jn3sOq3QD45yNtfdHUEl7eixM4aMCKw,10615
17
+ themis/cli/commands/math_benchmarks.py,sha256=nQ4TcPB7T9O3piAy4_TgrOQOQxh2Q8OyBreK_HoPCeQ,9946
18
+ themis/cli/commands/mcq_benchmarks.py,sha256=Cls5W1jGd7TKizmw07CnZWY5N6ywR8VhJ6jKDnY_cRk,7026
19
+ themis/cli/commands/sample_run.py,sha256=r3Ymg5dVHg4IAVJvzoP0ZWUWWUE4Dia1t0062Yhdk9Q,9445
20
+ themis/cli/commands/visualize.py,sha256=ZECkB0NjIltuOeBE-Q1JnndZEMXVzc8KgcrbaP-GSXo,9740
21
+ themis/config/__init__.py,sha256=YMdFG1iLvOQUnSPlc_ZJVn5zCCTbIozML64b4qUtGR8,476
22
+ themis/config/loader.py,sha256=t_wcIDwekuy3EaLprQgWILSKH2h5lFkF7quvNfAHddM,746
23
+ themis/config/registry.py,sha256=sSrL1mTjUG86s30o-dhuRInY4YeumEICtmxdjS-PiiQ,1055
24
+ themis/config/runtime.py,sha256=hU69_oND7fJfAOIBJONENmsuf7Y8roO7n-w9OwxzoT8,7475
25
+ themis/config/schema.py,sha256=SMR9QHp8OBkSnb1dHyOgg-IJWSqpXfyAqywnBeMy46M,3196
26
+ themis/core/__init__.py,sha256=S8G1x-39sZ3_NQ5DJ6R1yBTWXp_gO0WxOtVjeB9sTwY,113
27
+ themis/core/conversation.py,sha256=wwO8RS4t4plDR0Sf1KjYv_ejonlvKe0ZwAD-4sfGak8,10155
28
+ themis/core/entities.py,sha256=S1Kw3qrx0vWWdOmskZnu3GuKGOGf0WDpHBazmwhEnmM,3770
29
+ themis/core/serialization.py,sha256=cxfoSKwcZiNsnR8g_SAJAq1ZLrfLXM4S9_rVEDUT8qs,7071
30
+ themis/core/tools.py,sha256=v0_ctsBCtinZGNC_I4C-h0GUPNM5ZeTi7z-U4iCtyp4,11035
31
+ themis/core/types.py,sha256=I5rr9MMS0irX4lo-xlqGjosx-FjPgT64RzQAraM223A,3652
32
+ themis/datasets/__init__.py,sha256=r1FobxkALtaXuphz1wU6LSbXhDezpmh6AUvoT-Hv1As,9013
33
+ themis/datasets/base.py,sha256=HtgIIiCgaog9aRnjZ4kJV3Ta2Dhl-_wrD8PB2lvK810,8417
34
+ themis/datasets/commonsense_qa.py,sha256=1eT_SaehbHC8F1HOJDpMi4RnzsKznGuJrdoL8RLuHu4,5842
35
+ themis/datasets/competition_math.py,sha256=XWsKMAPICb66RMXZNA8AvtUJwWQav3HVDuvAby8QjZU,7346
36
+ themis/datasets/coqa.py,sha256=bsh1AUYtagYIa7d-eM30JftGBQ7_0xUZq9dW9Pkz3Ks,4575
37
+ themis/datasets/gpqa.py,sha256=VQEDbDywDGhRE69M-mXwY5AxUO5WQw2RF6ND7hOlv50,7110
38
+ themis/datasets/gsm8k.py,sha256=wYbCACnF560RTAJTU-SMHqVvwzj8QT0bbzfQGTuRKl0,3887
39
+ themis/datasets/gsm_symbolic.py,sha256=PeWRBJIt8zEcsWIIj4xw5iT2drBotv8jB3DrHkN-PDw,4055
40
+ themis/datasets/math500.py,sha256=F1xBSadcCdnKWs-WKoFVHBeOt1tu6Rb6U2Yi4Ien578,3840
41
+ themis/datasets/med_qa.py,sha256=Z6FET8UrDsQnsbuvPoVlO1thJ6jeMcmSnqUvnHY0GtI,6019
42
+ themis/datasets/medmcqa.py,sha256=9MjkhjIqNaP_sslPnid4GHiqaGjXZkWGvwLJGaUeHzg,5532
43
+ themis/datasets/mmlu_pro.py,sha256=51cZGLRe1BVuXhQ5vFzIk4TCq4p8iYUow9E9hSSNFAE,8384
44
+ themis/datasets/piqa.py,sha256=YV-c56ZUrvIqS0wpBq4LdC683hDqEJaZCfhh4Bn5HwU,4420
45
+ themis/datasets/registry.py,sha256=GCgfdr7dp0LpDiK0DLhOFg8tS18m1Rx6t7bOJlh10rU,5530
46
+ themis/datasets/schema.py,sha256=9nfM0ygdtvh1PPNLLz1OF2P8Z4sjTVk7gT5GIffJsf0,7100
47
+ themis/datasets/sciq.py,sha256=VXxO2_cDPhs08AnUBtT5NE6aKvnGMeNa8FXdQZqas2U,4677
48
+ themis/datasets/social_i_qa.py,sha256=y2uT2mWwo7ArZwjEUSo5841oITDcFII77OCjNVTkwwk,4765
49
+ themis/datasets/super_gpqa.py,sha256=Mr1ag_FyAk1haxg6_ONX5F84wQYtbSVjV-MlMNmaHlI,8452
50
+ themis/evaluation/__init__.py,sha256=2Jl8tcVxYAsmHNAZev2mPS_mEwZcRzebqSM3QDc2cyY,36
51
+ themis/evaluation/conditional.py,sha256=ayndI7FcwxdIMR8B4ddgcKZd5Jl5NQcBJUp7eXI6Djk,13881
52
+ themis/evaluation/math_verify_utils.py,sha256=vXMvL11-IH16UHZ-mbi_r5hOFz7aUfR1J1laa6qmLMk,2213
53
+ themis/evaluation/pipeline.py,sha256=OOowN59UdOMF2Hwy_G4ky5yzR4ajAnohil6xteWBHqE,1503
54
+ themis/evaluation/reports.py,sha256=9om7jzZUtmlMH7EeteXp_98gfHct4x09AyTFy3FSAdQ,8715
55
+ themis/evaluation/extractors/__init__.py,sha256=BanoC_8e0iam-VU7l7uhvhac_6w_JJZYoYE4xXPUrGk,566
56
+ themis/evaluation/extractors/error_taxonomy_extractor.py,sha256=RrRx-23l3LwTdG89kvSQJng438cfYI-IdtOGUD6gEDw,2462
57
+ themis/evaluation/extractors/exceptions.py,sha256=lI4HOU98FEUwqhBq2uEzY0ym0evK85O5V3t6ULoANtM,167
58
+ themis/evaluation/extractors/identity_extractor.py,sha256=bBRgcry94AtUrchzBdHK-a9fnWAQT0eJDaPXjBmgBis,675
59
+ themis/evaluation/extractors/json_field_extractor.py,sha256=5_ndU7NBw9CkfP8oXmeue3wkGZZauryvFt_GVjO7X44,1232
60
+ themis/evaluation/extractors/math_verify_extractor.py,sha256=f8B_IeexS2QHCf-UUhqfk7xpqWcm8SgobLQ7GXYrM58,1155
61
+ themis/evaluation/extractors/regex_extractor.py,sha256=xNo8YRy7JOnF9ohbK_xUHcbLGMUmE1qooUlS0ucrw44,1148
62
+ themis/evaluation/metrics/__init__.py,sha256=HrrTgEwdCmXiib0ohUOArpUC0qHii2A_mgPX0m4LRoU,652
63
+ themis/evaluation/metrics/composite_metric.py,sha256=nF4FXMWwEMt1Gfq4NhwEuNVD28d97cgV30kFgt1LdEg,1474
64
+ themis/evaluation/metrics/consistency_metric.py,sha256=ITu4qFwgVmEiRD_tuVOhe5B9qtdmRZI3Pj-p0NlXizE,2404
65
+ themis/evaluation/metrics/exact_match.py,sha256=bAH3QCddbD7s0Mp3-4VGvzwnOv7MB8nexwcFpO0yJuo,1596
66
+ themis/evaluation/metrics/length_difference_tolerance.py,sha256=_YjDNGEMlodLogcw64RshG1-i0pKH27S3UiMeK6SdQw,966
67
+ themis/evaluation/metrics/math_verify_accuracy.py,sha256=YhBhpONLmouLELfpcjNHiVSlwpxkE2wkStF6__du3_0,1329
68
+ themis/evaluation/metrics/pairwise_judge_metric.py,sha256=DEYKwt3smzXiSUhDV4lWxDFXWoHz-JMg3z5bMjlLPKo,4890
69
+ themis/evaluation/metrics/response_length.py,sha256=Xn2PQi4pMLhC_3bMmSbLEf-QVFOrMNm2ZJr0PiCDH-E,910
70
+ themis/evaluation/metrics/rubric_judge_metric.py,sha256=KSSqwpMHaXCK6krbb_A93nppZ_0xk6Or30u7csnw7rM,4796
71
+ themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
72
+ themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
73
+ themis/evaluation/pipelines/standard_pipeline.py,sha256=5ub_7zfQVj9YbDQxYTdxNIVijlMktR0rrfmN0GSndnI,11610
74
+ themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
75
+ themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
76
+ themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
77
+ themis/evaluation/statistics/distributions.py,sha256=h-sJgtpB-KnTwgN1B2TXrugjCJp8fl9DmGy-ZxZVEek,5538
78
+ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcNIEzGOvf931OYPww,3283
79
+ themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
80
+ themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
81
+ themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
82
+ themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=O3dlsQ2F0Ucv2Dhjz2Qf-jpPhwaVs3zrdQDRRu9du5w,1714
83
+ themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
84
+ themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
85
+ themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=58pDB30y1VpM_1KPB6sGS0JImGZk5WTgnK9CKDF8N5k,2304
86
+ themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
87
+ themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
88
+ themis/experiment/cache_manager.py,sha256=KlSMPy5CICX3XmqnYwg6hP0TB4zzJu2nb-nLcZAFzy0,4012
89
+ themis/experiment/comparison.py,sha256=Mr1L5Zj7i87xk9XUQ_UueLTsC-sDZH8YGwLwg_gG0VI,21562
90
+ themis/experiment/cost.py,sha256=flhENfB5WKvyNWwPMDtygNZAv6y_yv4RoClsRz714Hc,10159
91
+ themis/experiment/definitions.py,sha256=oOZBFfEQkSBiZd9CMutCQ5luH6oeUT9yAZFd7fpVjnw,2015
92
+ themis/experiment/export.py,sha256=wsUICG9XG6HET2OBdQisHPXVv5pjF3HWCvcpv1A9igM,26012
93
+ themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91AKw,6030
94
+ themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
95
+ themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
96
+ themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
97
+ themis/experiment/orchestrator.py,sha256=D2ANvg2s4Dyo0ridZ3alDVl1dTW4kazLoMeG0knQ6-M,14244
98
+ themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
99
+ themis/experiment/storage.py,sha256=58tSwHn3J36UMMCWblkbs00ZjCyIqLkoR2Vib9c-zgE,9156
100
+ themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
101
+ themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
102
+ themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
103
+ themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
104
+ themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
105
+ themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
106
+ themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
107
+ themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
108
+ themis/generation/runner.py,sha256=iHTE5vSMWMYRrv4PEWMaZflF939nv1wWccK8V0e092c,8009
109
+ themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
110
+ themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
111
+ themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
112
+ themis/generation/types.py,sha256=MkJnZk6lMHmHzlJVEsuIC9ioRW8XhWcSk9AdDeb_aLE,338
113
+ themis/generation/providers/litellm_provider.py,sha256=rlTuglIwhcvSakCo5G-ffgQtEHbCEX0ZeKk6M1MaWmU,8155
114
+ themis/generation/providers/vllm_provider.py,sha256=0K4we6xDrRXlBXseC1ixLq2sJpRF4T8Ikv45dw-zNk4,4625
115
+ themis/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
+ themis/integrations/huggingface.py,sha256=oUkLVyacmukDwHrMUVayHJ4fWnVCA77ZyuEkC7aogmY,2032
117
+ themis/integrations/wandb.py,sha256=VVVReLM-qjw4DyLVJcoaSH1rSTMNf3ReOKMxOFwJKu4,2247
118
+ themis/interfaces/__init__.py,sha256=OSHATlp1fNBJrVq0lNM85MGjWmSj0NKtnLnp3fmYn1k,2330
119
+ themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
120
+ themis/project/definitions.py,sha256=vHARw0IjFOWE4RL4mGRwvke36A6GWQGep6cQFIRcpJg,3329
121
+ themis/project/patterns.py,sha256=2J51Q9Jq7X-2N57uexvR191gaZKwusef5vIuIVUQY-E,7743
122
+ themis/providers/__init__.py,sha256=K5nG0DsK_YPY0cT9MBLk5BLcLbBo0wBP0vQvLjpAw_Y,189
123
+ themis/providers/registry.py,sha256=Za5Kg3-A-35wS_jiGpPXV2q1k6he_dRIWVqt36dKN-4,1056
124
+ themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
125
+ themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
126
+ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
127
+ themis/utils/logging_utils.py,sha256=YNSiDfO4LsciSzUhHF1aTVI5rkfnWiVbn1NcGjjmJuQ,1019
128
+ themis/utils/progress.py,sha256=b3YwHKV5x3Cvr5rBukqifJimK3Si4CGY2fpN6a_ZySI,1434
129
+ themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
130
+ themis_eval-0.1.1.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
131
+ themis_eval-0.1.1.dist-info/METADATA,sha256=7-oAglt5HH_AWi7yCzvTq_RUE07xTIVmZG02IcxHRjM,23516
132
+ themis_eval-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
133
+ themis_eval-0.1.1.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
134
+ themis_eval-0.1.1.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- themis/__init__.py,sha256=3bKi1PneI5PaTaDPXsArCVvfinkLFDRU91lvZIeg7V0,281
2
- themis/_version.py,sha256=tc4TJqWVv2dx4UzItLqneMPaG7vM8CQFDNW5pJgNoKg,345
3
- themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- themis_eval-0.1.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
5
- themis_eval-0.1.0.dist-info/METADATA,sha256=bRdc6UhSKYmptIJVhp4cEK8K2-Vvc77rZnVfYav0uS4,23516
6
- themis_eval-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- themis_eval-0.1.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
8
- themis_eval-0.1.0.dist-info/RECORD,,