themis-eval 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +23 -1
- themis/_version.py +1 -1
- themis/api.py +57 -4
- themis/evaluation/pipelines/standard_pipeline.py +2 -1
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis/generation/plan.py +28 -6
- themis/presets/__init__.py +13 -2
- themis/utils/logging_utils.py +8 -3
- themis/utils/progress.py +32 -13
- {themis_eval-0.2.1.dist-info → themis_eval-0.2.3.dist-info}/METADATA +6 -5
- {themis_eval-0.2.1.dist-info → themis_eval-0.2.3.dist-info}/RECORD +15 -15
- {themis_eval-0.2.1.dist-info → themis_eval-0.2.3.dist-info}/WHEEL +0 -0
- {themis_eval-0.2.1.dist-info → themis_eval-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.1.dist-info → themis_eval-0.2.3.dist-info}/top_level.txt +0 -0
themis/__init__.py
CHANGED
|
@@ -4,15 +4,37 @@ The primary interface is the `evaluate()` function:
|
|
|
4
4
|
|
|
5
5
|
import themis
|
|
6
6
|
report = themis.evaluate("math500", model="gpt-4", limit=100)
|
|
7
|
+
|
|
8
|
+
Extension APIs for registering custom components:
|
|
9
|
+
- themis.register_metric() - Register custom metrics
|
|
10
|
+
- themis.register_dataset() - Register custom datasets
|
|
11
|
+
- themis.register_provider() - Register custom model providers
|
|
12
|
+
- themis.register_benchmark() - Register custom benchmark presets
|
|
7
13
|
"""
|
|
8
14
|
|
|
9
15
|
from themis import config, core, evaluation, experiment, generation, project
|
|
10
16
|
from themis._version import __version__
|
|
11
|
-
from themis.api import evaluate
|
|
17
|
+
from themis.api import evaluate, get_registered_metrics, register_metric
|
|
18
|
+
from themis.datasets import register_dataset, list_datasets, is_dataset_registered
|
|
19
|
+
from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
|
|
20
|
+
from themis.providers import register_provider
|
|
12
21
|
|
|
13
22
|
__all__ = [
|
|
14
23
|
# Main API
|
|
15
24
|
"evaluate",
|
|
25
|
+
# Metrics
|
|
26
|
+
"register_metric",
|
|
27
|
+
"get_registered_metrics",
|
|
28
|
+
# Datasets
|
|
29
|
+
"register_dataset",
|
|
30
|
+
"list_datasets",
|
|
31
|
+
"is_dataset_registered",
|
|
32
|
+
# Benchmarks
|
|
33
|
+
"register_benchmark",
|
|
34
|
+
"list_benchmarks",
|
|
35
|
+
"get_benchmark_preset",
|
|
36
|
+
# Providers
|
|
37
|
+
"register_provider",
|
|
16
38
|
# Submodules
|
|
17
39
|
"config",
|
|
18
40
|
"core",
|
themis/_version.py
CHANGED
|
@@ -9,7 +9,7 @@ def _detect_version() -> str:
|
|
|
9
9
|
try:
|
|
10
10
|
return metadata.version("themis-eval")
|
|
11
11
|
except metadata.PackageNotFoundError: # pragma: no cover - local dev only
|
|
12
|
-
return "0.2.
|
|
12
|
+
return "0.2.3" # Fallback for development
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
__version__ = _detect_version()
|
themis/api.py
CHANGED
|
@@ -66,6 +66,55 @@ except ImportError:
|
|
|
66
66
|
logger = logging.getLogger(__name__)
|
|
67
67
|
|
|
68
68
|
|
|
69
|
+
# Module-level metrics registry for custom metrics
|
|
70
|
+
_METRICS_REGISTRY: dict[str, type] = {}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def register_metric(name: str, metric_cls: type) -> None:
|
|
74
|
+
"""Register a custom metric for use in evaluate().
|
|
75
|
+
|
|
76
|
+
This allows users to add their own metrics to Themis without modifying
|
|
77
|
+
the source code. Registered metrics can be used by passing their names
|
|
78
|
+
to the `metrics` parameter in evaluate().
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
name: Metric name (used in evaluate(metrics=[name]))
|
|
82
|
+
metric_cls: Metric class implementing the Metric interface.
|
|
83
|
+
Must have a compute() method that takes prediction, references,
|
|
84
|
+
and metadata parameters.
|
|
85
|
+
|
|
86
|
+
Raises:
|
|
87
|
+
TypeError: If metric_cls is not a class
|
|
88
|
+
ValueError: If metric_cls doesn't implement the required interface
|
|
89
|
+
|
|
90
|
+
Example:
|
|
91
|
+
>>> from themis.evaluation.metrics import MyCustomMetric
|
|
92
|
+
>>> themis.register_metric("my_metric", MyCustomMetric)
|
|
93
|
+
>>> report = themis.evaluate("math500", model="gpt-4", metrics=["my_metric"])
|
|
94
|
+
"""
|
|
95
|
+
if not isinstance(metric_cls, type):
|
|
96
|
+
raise TypeError(f"metric_cls must be a class, got {type(metric_cls)}")
|
|
97
|
+
|
|
98
|
+
# Validate that it implements the Metric interface
|
|
99
|
+
if not hasattr(metric_cls, "compute"):
|
|
100
|
+
raise ValueError(
|
|
101
|
+
f"{metric_cls.__name__} must implement compute() method. "
|
|
102
|
+
f"See themis.evaluation.metrics for examples."
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
_METRICS_REGISTRY[name] = metric_cls
|
|
106
|
+
logger.info(f"Registered custom metric: {name} -> {metric_cls.__name__}")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_registered_metrics() -> dict[str, type]:
|
|
110
|
+
"""Get all currently registered custom metrics.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
Dictionary mapping metric names to their classes
|
|
114
|
+
"""
|
|
115
|
+
return _METRICS_REGISTRY.copy()
|
|
116
|
+
|
|
117
|
+
|
|
69
118
|
def evaluate(
|
|
70
119
|
benchmark_or_dataset: str | Sequence[dict[str, Any]],
|
|
71
120
|
*,
|
|
@@ -384,8 +433,8 @@ def _resolve_metrics(metric_names: list[str]) -> list:
|
|
|
384
433
|
except ImportError:
|
|
385
434
|
nlp_available = False
|
|
386
435
|
|
|
387
|
-
#
|
|
388
|
-
|
|
436
|
+
# Built-in metrics registry
|
|
437
|
+
BUILTIN_METRICS = {
|
|
389
438
|
# Core metrics
|
|
390
439
|
"exact_match": ExactMatch,
|
|
391
440
|
"math_verify": MathVerifyAccuracy,
|
|
@@ -394,7 +443,7 @@ def _resolve_metrics(metric_names: list[str]) -> list:
|
|
|
394
443
|
|
|
395
444
|
# Add NLP metrics if available
|
|
396
445
|
if nlp_available:
|
|
397
|
-
|
|
446
|
+
BUILTIN_METRICS.update({
|
|
398
447
|
"bleu": BLEU,
|
|
399
448
|
"rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
|
|
400
449
|
"rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
|
|
@@ -407,6 +456,10 @@ def _resolve_metrics(metric_names: list[str]) -> list:
|
|
|
407
456
|
# "pass_at_k": PassAtK,
|
|
408
457
|
# "codebleu": CodeBLEU,
|
|
409
458
|
|
|
459
|
+
# Merge built-in and custom metrics
|
|
460
|
+
# Custom metrics can override built-in metrics
|
|
461
|
+
METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
|
|
462
|
+
|
|
410
463
|
metrics = []
|
|
411
464
|
for name in metric_names:
|
|
412
465
|
if name not in METRICS_REGISTRY:
|
|
@@ -426,4 +479,4 @@ def _resolve_metrics(metric_names: list[str]) -> list:
|
|
|
426
479
|
return metrics
|
|
427
480
|
|
|
428
481
|
|
|
429
|
-
__all__ = ["evaluate"]
|
|
482
|
+
__all__ = ["evaluate", "register_metric", "get_registered_metrics"]
|
|
@@ -233,7 +233,8 @@ class EvaluationPipeline:
|
|
|
233
233
|
if reference is not None
|
|
234
234
|
else []
|
|
235
235
|
)
|
|
236
|
-
metadata
|
|
236
|
+
# Preserve all task metadata for metrics, add sample_id
|
|
237
|
+
metadata = {**record.task.metadata, "sample_id": sample_id}
|
|
237
238
|
extract_start = time.perf_counter()
|
|
238
239
|
item_scores_for_item: list[core_entities.MetricScore] = []
|
|
239
240
|
for metric in self._metrics:
|
|
@@ -37,13 +37,16 @@ class AttemptAwareEvaluationStrategy:
|
|
|
37
37
|
grouped.setdefault(score.metric_name, []).append(score)
|
|
38
38
|
for metric_name, group in grouped.items():
|
|
39
39
|
value = sum(item.value for item in group) / len(group)
|
|
40
|
+
# Preserve original metadata from first score
|
|
41
|
+
base_metadata = group[0].metadata.copy() if group[0].metadata else {}
|
|
40
42
|
aggregated.append(
|
|
41
43
|
core_entities.MetricScore(
|
|
42
44
|
metric_name=metric_name,
|
|
43
45
|
value=value,
|
|
44
46
|
metadata={
|
|
45
|
-
|
|
46
|
-
"
|
|
47
|
+
**base_metadata, # Preserve all original metadata
|
|
48
|
+
"attempts": len(group), # Add aggregation-specific field
|
|
49
|
+
"sample_id": base_metadata.get("sample_id"),
|
|
47
50
|
},
|
|
48
51
|
details={},
|
|
49
52
|
)
|
|
@@ -49,6 +49,8 @@ class JudgeEvaluationStrategy:
|
|
|
49
49
|
counts = Counter(labels)
|
|
50
50
|
agreement = max(counts.values()) / max(1, len(labels))
|
|
51
51
|
|
|
52
|
+
# Preserve original metadata from first score
|
|
53
|
+
base_metadata = group[0].metadata.copy() if group[0].metadata else {}
|
|
52
54
|
aggregated.append(
|
|
53
55
|
core_entities.MetricScore(
|
|
54
56
|
metric_name=metric_name,
|
|
@@ -58,7 +60,10 @@ class JudgeEvaluationStrategy:
|
|
|
58
60
|
"agreement": agreement,
|
|
59
61
|
"labels": labels,
|
|
60
62
|
},
|
|
61
|
-
metadata={
|
|
63
|
+
metadata={
|
|
64
|
+
**base_metadata, # Preserve all original metadata
|
|
65
|
+
"sample_id": base_metadata.get("sample_id"),
|
|
66
|
+
},
|
|
62
67
|
)
|
|
63
68
|
)
|
|
64
69
|
return aggregated
|
themis/generation/plan.py
CHANGED
|
@@ -88,9 +88,20 @@ class GenerationPlan:
|
|
|
88
88
|
}
|
|
89
89
|
if dataset_id is not None:
|
|
90
90
|
metadata["dataset_id"] = dataset_id
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
91
|
+
|
|
92
|
+
# If metadata_fields is explicitly specified, use it as a filter (existing behavior)
|
|
93
|
+
# Otherwise, include all fields by default (new behavior for custom metrics)
|
|
94
|
+
if self.metadata_fields:
|
|
95
|
+
# Explicit filter - only include specified fields
|
|
96
|
+
for field_name in self.metadata_fields:
|
|
97
|
+
if field_name in row:
|
|
98
|
+
metadata[field_name] = row[field_name]
|
|
99
|
+
else:
|
|
100
|
+
# No filter - include all fields except those used for other purposes
|
|
101
|
+
for field_name, field_value in row.items():
|
|
102
|
+
if field_name not in (self.dataset_id_field, self.reference_field):
|
|
103
|
+
metadata[field_name] = field_value
|
|
104
|
+
|
|
94
105
|
return metadata
|
|
95
106
|
|
|
96
107
|
def _build_reference(
|
|
@@ -209,9 +220,20 @@ class CartesianExpansionStrategy:
|
|
|
209
220
|
}
|
|
210
221
|
if dataset_id is not None:
|
|
211
222
|
metadata["dataset_id"] = dataset_id
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
223
|
+
|
|
224
|
+
# If metadata_fields is explicitly specified, use it as a filter (existing behavior)
|
|
225
|
+
# Otherwise, include all fields by default (new behavior for custom metrics)
|
|
226
|
+
if context.metadata_fields:
|
|
227
|
+
# Explicit filter - only include specified fields
|
|
228
|
+
for field_name in context.metadata_fields:
|
|
229
|
+
if field_name in row:
|
|
230
|
+
metadata[field_name] = row[field_name]
|
|
231
|
+
else:
|
|
232
|
+
# No filter - include all fields except those used for other purposes
|
|
233
|
+
for field_name, field_value in row.items():
|
|
234
|
+
if field_name not in (context.dataset_id_field, context.reference_field):
|
|
235
|
+
metadata[field_name] = field_value
|
|
236
|
+
|
|
215
237
|
return metadata
|
|
216
238
|
|
|
217
239
|
def _build_reference(
|
themis/presets/__init__.py
CHANGED
|
@@ -4,7 +4,18 @@ This module provides automatic configuration for popular benchmarks,
|
|
|
4
4
|
eliminating the need for manual setup of prompts, metrics, and extractors.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from themis.presets.benchmarks import
|
|
7
|
+
from themis.presets.benchmarks import (
|
|
8
|
+
BenchmarkPreset,
|
|
9
|
+
get_benchmark_preset,
|
|
10
|
+
list_benchmarks,
|
|
11
|
+
register_benchmark,
|
|
12
|
+
)
|
|
8
13
|
from themis.presets.models import parse_model_name
|
|
9
14
|
|
|
10
|
-
__all__ = [
|
|
15
|
+
__all__ = [
|
|
16
|
+
"BenchmarkPreset",
|
|
17
|
+
"register_benchmark",
|
|
18
|
+
"get_benchmark_preset",
|
|
19
|
+
"list_benchmarks",
|
|
20
|
+
"parse_model_name",
|
|
21
|
+
]
|
themis/utils/logging_utils.py
CHANGED
|
@@ -5,6 +5,9 @@ from __future__ import annotations
|
|
|
5
5
|
import logging
|
|
6
6
|
from typing import Mapping
|
|
7
7
|
|
|
8
|
+
from rich.logging import RichHandler
|
|
9
|
+
from rich.traceback import install as install_rich_traceback
|
|
10
|
+
|
|
8
11
|
TRACE_LEVEL = 5
|
|
9
12
|
logging.addLevelName(TRACE_LEVEL, "TRACE")
|
|
10
13
|
|
|
@@ -28,12 +31,14 @@ _LEVELS: Mapping[str, int] = {
|
|
|
28
31
|
|
|
29
32
|
def configure_logging(level: str = "info") -> None:
|
|
30
33
|
"""Configure root logging with human-friendly formatting."""
|
|
31
|
-
|
|
34
|
+
install_rich_traceback()
|
|
32
35
|
numeric_level = _LEVELS.get(level.lower(), logging.INFO)
|
|
36
|
+
|
|
33
37
|
logging.basicConfig(
|
|
34
38
|
level=numeric_level,
|
|
35
|
-
format="%(
|
|
36
|
-
datefmt="%
|
|
39
|
+
format="%(message)s",
|
|
40
|
+
datefmt="[%X]",
|
|
41
|
+
handlers=[RichHandler(rich_tracebacks=True, markup=True)],
|
|
37
42
|
force=True,
|
|
38
43
|
)
|
|
39
44
|
|
themis/utils/progress.py
CHANGED
|
@@ -5,7 +5,16 @@ from __future__ import annotations
|
|
|
5
5
|
from contextlib import AbstractContextManager
|
|
6
6
|
from typing import Any, Callable
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from rich.progress import (
|
|
9
|
+
BarColumn,
|
|
10
|
+
MofNCompleteColumn,
|
|
11
|
+
Progress,
|
|
12
|
+
SpinnerColumn,
|
|
13
|
+
TaskProgressColumn,
|
|
14
|
+
TextColumn,
|
|
15
|
+
TimeElapsedColumn,
|
|
16
|
+
TimeRemainingColumn,
|
|
17
|
+
)
|
|
9
18
|
|
|
10
19
|
|
|
11
20
|
class ProgressReporter(AbstractContextManager["ProgressReporter"]):
|
|
@@ -21,7 +30,8 @@ class ProgressReporter(AbstractContextManager["ProgressReporter"]):
|
|
|
21
30
|
self._description = description
|
|
22
31
|
self._unit = unit
|
|
23
32
|
self._leave = leave
|
|
24
|
-
self.
|
|
33
|
+
self._progress: Progress | None = None
|
|
34
|
+
self._task_id = None
|
|
25
35
|
|
|
26
36
|
def __enter__(self) -> "ProgressReporter":
|
|
27
37
|
self.start()
|
|
@@ -31,22 +41,31 @@ class ProgressReporter(AbstractContextManager["ProgressReporter"]):
|
|
|
31
41
|
self.close()
|
|
32
42
|
|
|
33
43
|
def start(self) -> None:
|
|
34
|
-
if self.
|
|
35
|
-
self.
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
44
|
+
if self._progress is None:
|
|
45
|
+
self._progress = Progress(
|
|
46
|
+
SpinnerColumn(),
|
|
47
|
+
TextColumn("[progress.description]{task.description}"),
|
|
48
|
+
BarColumn(),
|
|
49
|
+
TaskProgressColumn(),
|
|
50
|
+
MofNCompleteColumn(),
|
|
51
|
+
TimeElapsedColumn(),
|
|
52
|
+
TimeRemainingColumn(),
|
|
53
|
+
transient=not self._leave,
|
|
54
|
+
)
|
|
55
|
+
self._progress.start()
|
|
56
|
+
self._task_id = self._progress.add_task(
|
|
57
|
+
self._description, total=self._total
|
|
40
58
|
)
|
|
41
59
|
|
|
42
60
|
def close(self) -> None:
|
|
43
|
-
if self.
|
|
44
|
-
self.
|
|
45
|
-
self.
|
|
61
|
+
if self._progress is not None:
|
|
62
|
+
self._progress.stop()
|
|
63
|
+
self._progress = None
|
|
64
|
+
self._task_id = None
|
|
46
65
|
|
|
47
66
|
def increment(self, step: int = 1) -> None:
|
|
48
|
-
if self.
|
|
49
|
-
self.
|
|
67
|
+
if self._progress is not None and self._task_id is not None:
|
|
68
|
+
self._progress.update(self._task_id, advance=step)
|
|
50
69
|
|
|
51
70
|
def on_result(self, _record: Any) -> None:
|
|
52
71
|
self.increment()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: themis-eval
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Lightweight evaluation platform for LLM experiments
|
|
5
5
|
Author: Pittawat Taveekitworachai
|
|
6
6
|
License: MIT
|
|
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
|
|
|
25
25
|
Requires-Dist: tenacity>=9.1.2
|
|
26
26
|
Requires-Dist: plotly>=6.5.0
|
|
27
27
|
Requires-Dist: math-verify>=0.8.0
|
|
28
|
+
Requires-Dist: rich>=14.2.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
|
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
|
|
|
358
359
|
|
|
359
360
|
- **[API Reference](docs/index.md)** - Detailed API documentation
|
|
360
361
|
- **[Examples](examples-simple/)** - Runnable code examples
|
|
361
|
-
- **[Extending Backends](docs/
|
|
362
|
-
- **[API Server](docs/
|
|
363
|
-
- **[Comparison Engine](docs/
|
|
362
|
+
- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
|
|
363
|
+
- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
|
|
364
|
+
- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
|
|
364
365
|
|
|
365
366
|
---
|
|
366
367
|
|
|
@@ -388,7 +389,7 @@ result = evaluate(
|
|
|
388
389
|
)
|
|
389
390
|
```
|
|
390
391
|
|
|
391
|
-
See [
|
|
392
|
+
See [docs/customization/backends.md](docs/customization/backends.md) for details.
|
|
392
393
|
|
|
393
394
|
### Distributed Execution
|
|
394
395
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
themis/__init__.py,sha256=
|
|
2
|
-
themis/_version.py,sha256=
|
|
3
|
-
themis/api.py,sha256=
|
|
1
|
+
themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
|
|
2
|
+
themis/_version.py,sha256=Tk4OCTQHYoZ61gm9JnkdgajR0vkBHbVm5OUjInzyJug,378
|
|
3
|
+
themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
|
|
4
4
|
themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
|
|
6
6
|
themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
|
|
@@ -88,7 +88,7 @@ themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2
|
|
|
88
88
|
themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
|
|
89
89
|
themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
|
|
90
90
|
themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
|
|
91
|
-
themis/evaluation/pipelines/standard_pipeline.py,sha256=
|
|
91
|
+
themis/evaluation/pipelines/standard_pipeline.py,sha256=GI5_ImebBuM6D8GpGKLoNq4p3JhTq-ocOThlah8RxME,14754
|
|
92
92
|
themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
|
|
93
93
|
themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
|
|
94
94
|
themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
|
|
@@ -97,10 +97,10 @@ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcN
|
|
|
97
97
|
themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
|
|
98
98
|
themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
|
|
99
99
|
themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
|
|
100
|
-
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=
|
|
100
|
+
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK8rBeDXPFD2YWPSprez2iwSB-8yfyWhAlylug,1959
|
|
101
101
|
themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
|
|
102
102
|
themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
|
|
103
|
-
themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=
|
|
103
|
+
themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
|
|
104
104
|
themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
|
|
105
105
|
themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
|
|
106
106
|
themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
|
|
@@ -121,7 +121,7 @@ themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJ
|
|
|
121
121
|
themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
|
|
122
122
|
themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
|
|
123
123
|
themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
|
|
124
|
-
themis/generation/plan.py,sha256=
|
|
124
|
+
themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
|
|
125
125
|
themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
|
|
126
126
|
themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
|
|
127
127
|
themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
|
|
@@ -134,7 +134,7 @@ themis/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
|
|
|
134
134
|
themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFsclB3E,2370
|
|
135
135
|
themis/integrations/wandb.py,sha256=LJOPojjlqG05EIPxcjy3QmA15Gxgs1db3encDWVzYYw,2545
|
|
136
136
|
themis/interfaces/__init__.py,sha256=78dNE_eHfFmb9hXNy5sLZ1jOTGWS8TzdVE_eiYQPFVc,5967
|
|
137
|
-
themis/presets/__init__.py,sha256=
|
|
137
|
+
themis/presets/__init__.py,sha256=w58fJcy4eNiE034qHO2xE5pp-H-4LNLXo5hLMuC7wIQ,533
|
|
138
138
|
themis/presets/benchmarks.py,sha256=s9JxRogHwZs8oiuiI7Z7uiUBZXEp3gg7AQZnBvdGieA,12026
|
|
139
139
|
themis/presets/models.py,sha256=c6-I_drHa4vMLIajSkCcrFbsJOsauFjY8fU1leBxZLg,5173
|
|
140
140
|
themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
|
|
@@ -147,11 +147,11 @@ themis/server/app.py,sha256=OZ39gCC47AXVqZxroC_4KtIYBYx_rfpde7C25AF3EI0,11166
|
|
|
147
147
|
themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
|
|
148
148
|
themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
|
|
149
149
|
themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
|
|
150
|
-
themis/utils/logging_utils.py,sha256=
|
|
151
|
-
themis/utils/progress.py,sha256=
|
|
150
|
+
themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
|
|
151
|
+
themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
|
|
152
152
|
themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
|
|
153
|
-
themis_eval-0.2.
|
|
154
|
-
themis_eval-0.2.
|
|
155
|
-
themis_eval-0.2.
|
|
156
|
-
themis_eval-0.2.
|
|
157
|
-
themis_eval-0.2.
|
|
153
|
+
themis_eval-0.2.3.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
|
|
154
|
+
themis_eval-0.2.3.dist-info/METADATA,sha256=4N7tBOyUi8PAlFT2qJseKIABjHOzkFmLtfqVVUSFz84,15235
|
|
155
|
+
themis_eval-0.2.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
156
|
+
themis_eval-0.2.3.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
|
|
157
|
+
themis_eval-0.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|