themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Utilities for assembling experiments from reusable components."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable, Mapping, Sequence, Type
|
|
7
|
+
|
|
8
|
+
from themis.config import schema as config
|
|
9
|
+
from themis.core import entities as core_entities
|
|
10
|
+
from themis.evaluation import pipeline as evaluation_pipeline
|
|
11
|
+
from themis.evaluation import strategies as evaluation_strategies
|
|
12
|
+
from themis.experiment import orchestrator
|
|
13
|
+
from themis.experiment import storage as experiment_storage
|
|
14
|
+
from themis.experiment.cache_manager import CacheManager
|
|
15
|
+
from themis.experiment.definitions import (
|
|
16
|
+
BuiltExperiment,
|
|
17
|
+
ExperimentDefinition,
|
|
18
|
+
ModelBinding,
|
|
19
|
+
)
|
|
20
|
+
from themis.experiment.integration_manager import IntegrationManager
|
|
21
|
+
from themis.generation import plan as generation_plan
|
|
22
|
+
from themis.generation import router as generation_router
|
|
23
|
+
from themis.generation import runner as generation_runner
|
|
24
|
+
from themis.generation import strategies as generation_strategies
|
|
25
|
+
from themis.interfaces import ModelProvider
|
|
26
|
+
from themis.providers import create_provider
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ExperimentBuilder:
    """Assemble runnable experiments from pluggable building blocks.

    The builder is configured once with an extractor, a collection of
    metrics, and the classes (plus keyword arguments) used to create the
    generation runner, the evaluation pipeline and the provider router.
    Every call to :meth:`build` then turns an ``ExperimentDefinition`` into
    a ``BuiltExperiment``.
    """

    def __init__(
        self,
        *,
        extractor,
        metrics,
        runner_cls: Type[
            generation_runner.GenerationRunner
        ] = generation_runner.GenerationRunner,
        runner_kwargs: Mapping[str, Any] | None = None,
        pipeline_cls: Type[
            evaluation_pipeline.EvaluationPipeline
        ] = evaluation_pipeline.EvaluationPipeline,
        pipeline_kwargs: Mapping[str, Any] | None = None,
        router_cls: Type[ModelProvider] = generation_router.ProviderRouter,
        router_kwargs: Mapping[str, Any] | None = None,
        strategy_resolver: Callable[
            [core_entities.GenerationTask], generation_strategies.GenerationStrategy
        ]
        | None = None,
        evaluation_strategy_resolver: Callable[
            [core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
        ]
        | None = None,
    ) -> None:
        """Store the components and factories used by :meth:`build`.

        Args:
            extractor: Passed to the pipeline class as ``extractor``.
            metrics: Iterable of metrics; copied into a list.
            runner_cls: Class instantiated as the generation runner.
            runner_kwargs: Extra keyword arguments for ``runner_cls``.
            pipeline_cls: Class instantiated as the evaluation pipeline.
            pipeline_kwargs: Extra keyword arguments for ``pipeline_cls``.
            router_cls: Class instantiated with the provider mapping.
            router_kwargs: Extra keyword arguments for ``router_cls``.
            strategy_resolver: Optional default for the runner's
                ``strategy_resolver`` keyword argument.
            evaluation_strategy_resolver: Optional default for the
                pipeline's ``strategy_resolver`` keyword argument.
        """
        self._extractor = extractor
        self._metrics = [*metrics]

        # Factories and private copies of their keyword arguments, so later
        # mutation of the caller's mappings cannot leak into the builder.
        self._runner_cls = runner_cls
        self._runner_kwargs = dict(runner_kwargs) if runner_kwargs else {}
        self._pipeline_cls = pipeline_cls
        self._pipeline_kwargs = dict(pipeline_kwargs) if pipeline_kwargs else {}
        self._router_cls = router_cls
        self._router_kwargs = dict(router_kwargs) if router_kwargs else {}

        self._strategy_resolver = strategy_resolver
        self._evaluation_strategy_resolver = evaluation_strategy_resolver

    def build(
        self,
        definition: ExperimentDefinition,
        *,
        storage_dir: str | Path | None = None,
    ) -> BuiltExperiment:
        """Create all experiment components for ``definition``.

        Args:
            definition: Describes templates, model bindings, sampling
                parameters and dataset field names.
            storage_dir: Directory handed to ``ExperimentStorage``. When it
                is ``None`` no storage object is created.

        Returns:
            A ``BuiltExperiment`` bundling the orchestrator, plan, runner,
            pipeline, storage (possibly ``None``) and router.
        """
        plan = self._build_plan(definition)
        provider_router = self._build_router(definition.model_bindings)

        # Explicit runner kwargs win over the builder-level resolver.
        runner_options = {**self._runner_kwargs}
        if (
            self._strategy_resolver is not None
            and "strategy_resolver" not in runner_options
        ):
            runner_options["strategy_resolver"] = self._strategy_resolver
        runner = self._runner_cls(provider=provider_router, **runner_options)

        # Same precedence rule for the evaluation pipeline.
        pipeline_options = {**self._pipeline_kwargs}
        if (
            self._evaluation_strategy_resolver is not None
            and "strategy_resolver" not in pipeline_options
        ):
            pipeline_options["strategy_resolver"] = self._evaluation_strategy_resolver
        pipeline = self._pipeline_cls(
            extractor=self._extractor,
            metrics=self._metrics,
            **pipeline_options,
        )

        # Storage backend is optional.
        if storage_dir is None:
            storage = None
        else:
            storage = experiment_storage.ExperimentStorage(storage_dir)

        # Resume and caching are always switched on here; the integration
        # manager receives a default-constructed IntegrationsConfig.
        cache_manager = CacheManager(
            storage=storage,
            enable_resume=True,
            enable_cache=True,
        )
        integration_manager = IntegrationManager(config=config.IntegrationsConfig())

        experiment_orchestrator = orchestrator.ExperimentOrchestrator(
            generation_plan=plan,
            generation_runner=runner,
            evaluation_pipeline=pipeline,
            cache_manager=cache_manager,
            integration_manager=integration_manager,
        )

        return BuiltExperiment(
            orchestrator=experiment_orchestrator,
            plan=plan,
            runner=runner,
            pipeline=pipeline,
            storage=storage,
            router=provider_router,
        )

    def _build_plan(
        self, definition: ExperimentDefinition
    ) -> generation_plan.GenerationPlan:
        """Translate ``definition`` into a ``GenerationPlan``."""
        templates = list(definition.templates)
        model_specs = [binding.spec for binding in definition.model_bindings]
        sampling = list(definition.sampling_parameters)
        return generation_plan.GenerationPlan(
            templates=templates,
            models=model_specs,
            sampling_parameters=sampling,
            dataset_id_field=definition.dataset_id_field,
            reference_field=definition.reference_field,
            metadata_fields=tuple(definition.metadata_fields),
            context_builder=definition.context_builder,
        )

    def _build_router(self, bindings: Sequence[ModelBinding]) -> ModelProvider:
        """Create one provider per binding and wrap them in the router class.

        Providers are keyed by ``binding.spec.identifier``; a later binding
        with the same identifier replaces an earlier one.
        """
        provider_by_model: dict[str, ModelProvider] = {}
        for binding in bindings:
            provider = create_provider(
                binding.provider_name,
                **binding.provider_options,
            )
            provider_by_model[binding.spec.identifier] = provider
        return self._router_cls(provider_by_model, **self._router_kwargs)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Public names of this module: the builder defined here plus the definition
# types imported from themis.experiment.definitions and re-exported.
__all__ = [
    "ExperimentBuilder",
    "ExperimentDefinition",
    "ModelBinding",
    "BuiltExperiment",
]
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Cache management for experiment resumability and storage."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
7
|
+
from themis.core.entities import EvaluationRecord, GenerationRecord
|
|
8
|
+
from themis.experiment import storage as experiment_storage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CacheManager:
    """Thin gatekeeper in front of an optional experiment storage backend.

    Responsibilities:
    - Loading cached generation records
    - Loading cached evaluations
    - Saving datasets for resumability
    - Saving generation records and evaluations

    Reads are skipped when resume is disabled, writes are skipped when
    caching is disabled, and everything is a no-op without a storage backend.
    """

    def __init__(
        self,
        storage: experiment_storage.ExperimentStorage | None,
        enable_resume: bool = True,
        enable_cache: bool = True,
    ) -> None:
        """Create a cache manager.

        Args:
            storage: Storage backend, or None to run without persistence
            enable_resume: Allow previously cached results to be loaded
            enable_cache: Allow new results to be written to storage
        """
        self._storage = storage
        self._enable_resume = enable_resume
        self._enable_cache = enable_cache

    @property
    def has_storage(self) -> bool:
        """Whether a storage backend was supplied."""
        return self._storage is not None

    def cache_dataset(self, run_id: str, dataset: Sequence[dict[str, object]]) -> None:
        """Persist the dataset of a run so the run can be resumed later.

        Args:
            run_id: Unique run identifier
            dataset: Dataset samples; handed to storage as a new list
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.cache_dataset(run_id, list(dataset))

    def load_cached_records(self, run_id: str) -> dict[str, GenerationRecord]:
        """Fetch previously stored generation records of a run.

        Args:
            run_id: Unique run identifier

        Returns:
            Whatever the storage backend returns for the run, or an empty
            dict when resume is disabled or there is no storage
        """
        if self._enable_resume and self._storage is not None:
            return self._storage.load_cached_records(run_id)
        return {}

    def load_cached_evaluations(
        self, run_id: str, evaluation_config: dict | None = None
    ) -> dict[str, EvaluationRecord]:
        """Fetch previously stored evaluation records of a run.

        Args:
            run_id: Unique run identifier
            evaluation_config: Evaluation configuration forwarded to storage

        Returns:
            Whatever the storage backend returns for the run, or an empty
            dict when resume is disabled or there is no storage
        """
        if self._enable_resume and self._storage is not None:
            return self._storage.load_cached_evaluations(
                run_id, evaluation_config=evaluation_config
            )
        return {}

    def save_generation_record(
        self,
        run_id: str,
        record: GenerationRecord,
        cache_key: str,
    ) -> None:
        """Append one generation record to storage.

        Args:
            run_id: Unique run identifier
            record: Generation record to store
            cache_key: Key under which the record is stored
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.append_record(run_id, record, cache_key=cache_key)

    def save_evaluation_record(
        self,
        run_id: str,
        generation_record: GenerationRecord,
        evaluation_record: EvaluationRecord,
        evaluation_config: dict | None = None,
    ) -> None:
        """Append one evaluation record to storage.

        Args:
            run_id: Unique run identifier
            generation_record: Generation record the evaluation belongs to
            evaluation_record: Evaluation record to store
            evaluation_config: Evaluation configuration forwarded to storage
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.append_evaluation(
            run_id,
            generation_record,
            evaluation_record,
            evaluation_config=evaluation_config,
        )

    def get_run_path(self, run_id: str) -> str | None:
        """Return the storage path of a run as a string.

        Args:
            run_id: Unique run identifier

        Returns:
            ``str()`` of the path reported by storage, or None without storage
        """
        storage = self._storage
        return None if storage is None else str(storage.get_run_path(run_id))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Only the manager class is part of this module's public API.
__all__ = ["CacheManager"]
|