themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/__init__.py
CHANGED
@@ -12,9 +12,10 @@ Extension APIs for registering custom components:
 - themis.register_benchmark() - Register custom benchmark presets
 """
 
-from themis import config, core, evaluation,
+from themis import config, core, evaluation, generation, project, session
 from themis._version import __version__
 from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.session import ExperimentSession
 from themis.datasets import register_dataset, list_datasets, is_dataset_registered
 from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
 from themis.providers import register_provider
@@ -39,9 +40,11 @@ __all__ = [
     "config",
     "core",
     "evaluation",
-    "experiment",
     "generation",
     "project",
+    "session",
+    # Session API
+    "ExperimentSession",
     # Version
     "__version__",
 ]
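The practical upshot of this hunk: ExperimentSession joins the top-level namespace (re-exported from themis.session) while the legacy "experiment" entry leaves __all__. A minimal smoke test of the new surface — a sketch that assumes nothing beyond the names visible in the diff above:

import themis

print(themis.__version__)             # "1.0.0" for this wheel
session = themis.ExperimentSession()  # re-exported from themis.session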
themis/_version.py
CHANGED
@@ -3,13 +3,26 @@
 from __future__ import annotations
 
 from importlib import metadata
+from pathlib import Path
+import tomllib
+
+
+def _read_local_pyproject_version() -> str:
+    """Return the version declared in pyproject.toml for local development."""
+    pyproject_path = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    try:
+        with pyproject_path.open("rb") as fh:
+            data = tomllib.load(fh)
+    except FileNotFoundError:
+        return "0.0.0"
+    return data.get("project", {}).get("version", "0.0.0")
 
 
 def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return
+        return _read_local_pyproject_version()
 
 
 __version__ = _detect_version()
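Version resolution is now: installed distribution metadata first, then the version declared in the checkout's pyproject.toml, then the "0.0.0" sentinel. One caveat worth flagging: the module-level "import tomllib" ties _version.py to Python 3.11+, where tomllib entered the standard library. A standalone sketch of the same lookup pattern (illustrative only; detect_version, dist_name, and repo_root are hypothetical names, not Themis API):

from importlib import metadata
from pathlib import Path

import tomllib  # standard library on Python 3.11+


def detect_version(dist_name: str, repo_root: Path) -> str:
    try:
        # Metadata from an installed wheel always wins.
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        pyproject = repo_root / "pyproject.toml"
        if not pyproject.is_file():
            return "0.0.0"  # last-resort sentinel, as in the hunk above
        data = tomllib.loads(pyproject.read_text(encoding="utf-8"))
        return data.get("project", {}).get("version", "0.0.0")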
themis/api.py
CHANGED
@@ -34,24 +34,14 @@ Example:
 from __future__ import annotations
 
 import logging
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
 
-from themis.core.entities import (
-    ExperimentReport,
-    GenerationRecord,
-    ModelSpec,
-    PromptSpec,
-    SamplingConfig,
-)
+from themis.core.entities import ExperimentReport, GenerationRecord
 from themis.evaluation.pipeline import EvaluationPipeline
-from themis.experiment.orchestrator import ExperimentOrchestrator
-from themis.generation.plan import GenerationPlan
-from themis.generation.router import ProviderRouter
-from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
-from themis.
+from themis.session import ExperimentSession
+from themis.specs import ExperimentSpec, ExecutionSpec, StorageSpec
 
 # Import provider modules to ensure they register themselves
 try:
@@ -128,6 +118,8 @@ def evaluate(
     distributed: bool = False,
     workers: int = 4,
     storage: str | Path | None = None,
+    storage_backend: object | None = None,
+    execution_backend: object | None = None,
     run_id: str | None = None,
     resume: bool = True,
     on_result: Callable[[GenerationRecord], None] | None = None,
@@ -166,6 +158,10 @@
             hit rate limits. Recommended: 4-16 for APIs, 32+ for local models.
         storage: Storage location for results and cache. Defaults to ".cache/experiments".
             Can be a local path or (future) cloud storage URI.
+        storage_backend: Optional storage backend instance. Typically an
+            ExperimentStorage or LocalFileStorageBackend (adapter). Custom
+            storage backends are not yet integrated with the evaluate() API.
+        execution_backend: Optional execution backend for custom parallelism.
         run_id: Unique identifier for this run. If None, auto-generated from timestamp
             (e.g., "run-2024-01-15-123456"). Use meaningful IDs for tracking experiments.
         resume: Whether to resume from cached results.
@@ -190,6 +186,8 @@
     logger.info(f"Model: {model}")
     logger.info(f"Workers: {workers}")
    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if num_samples > 1:
+        logger.info(f"Num samples per prompt: {num_samples}")
     if "api_base" in kwargs:
         logger.info(f"Custom API base: {kwargs['api_base']}")
     if "api_key" in kwargs:
@@ -199,7 +197,7 @@
     logger.info("=" * 60)
 
     # Import presets system (lazy import to avoid circular dependencies)
-    from themis.presets import get_benchmark_preset
+    from themis.presets import get_benchmark_preset
@@ -277,137 +275,44 @@
         reference_field = "answer"
         dataset_id_field = "id"
 
-    #
-    logger.info(f"Parsing model configuration...")
-    try:
-        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
-        logger.info(f"Provider: {provider_name}")
-        logger.info(f"Model ID: {model_id}")
-        logger.debug(f"Provider options: {provider_options}")
-    except Exception as e:
-        logger.error(f"❌ Failed to parse model name '{model}': {e}")
-        raise
-
-    # Create model spec
-    model_spec = ModelSpec(
-        identifier=model_id,
-        provider=provider_name,
-    )
-
-    # Create sampling config
-    sampling_config = SamplingConfig(
-        temperature=temperature,
-        top_p=kwargs.get("top_p", 0.95),
-        max_tokens=max_tokens,
-    )
-
-    # Create generation plan
-    plan = GenerationPlan(
-        templates=[prompt_template],
-        models=[model_spec],
-        sampling_parameters=[sampling_config],
-        dataset_id_field=dataset_id_field,
-        reference_field=reference_field,
-        metadata_fields=metadata_fields,
-    )
-
-    # Create provider and router
-    logger.info(f"Creating provider '{provider_name}'...")
-    try:
-        provider = create_provider(provider_name, **provider_options)
-        logger.info(f"✅ Provider created successfully")
-    except KeyError as e:
-        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
-        logger.error(f" This usually means the provider module wasn't imported.")
-        raise
-    except Exception as e:
-        logger.error(f"❌ Failed to create provider: {e}")
-        raise
-
-    router = ProviderRouter({model_id: provider})
-    logger.debug(f"Router configured for model: {model_id}")
-
-    # Create runner
-    runner = GenerationRunner(provider=router, max_parallel=workers)
-    logger.info(f"Runner configured with {workers} parallel workers")
-
-    # Create evaluation pipeline
+    # Build evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
     logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
-
-    #
-
-
-
-
-
-
-
-    run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
-    logger.info(f"Run ID: {run_id}")
-    logger.info(f"Storage: {storage_dir}")
-    logger.info(f"Resume: {resume}")
-
-    # Create storage backend
-    if isinstance(storage_dir, Path):
-        from themis.experiment.storage import ExperimentStorage
-        storage_backend = ExperimentStorage(storage_dir)
-        logger.debug(f"Storage backend created at {storage_dir}")
-    else:
-        # Cloud storage (to be implemented in Phase 3)
-        raise NotImplementedError(
-            f"Cloud storage not yet implemented. Use local path for now. "
-            f"Requested: {storage_dir}"
-        )
-
-    # Create orchestrator
-    orchestrator = ExperimentOrchestrator(
-        generation_plan=plan,
-        generation_runner=runner,
-        evaluation_pipeline=pipeline,
-        storage=storage_backend,
+
+    # Compose vNext spec
+    spec = ExperimentSpec(
+        dataset=dataset,
+        prompt=prompt_template.template,
+        model=model,
+        sampling={"temperature": temperature, "top_p": kwargs.get("top_p", 0.95), "max_tokens": max_tokens},
+        pipeline=pipeline,
+        run_id=run_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    logger.info("=" * 60)
-    logger.info("✅ Evaluation completed successfully!")
-    logger.info(f" Total samples: {len(report.generation_results)}")
-    logger.info(f" Successful: {report.metadata.get('successful_generations', 0)}")
-    logger.info(f" Failed: {report.metadata.get('failed_generations', 0)}")
-    if report.evaluation_report.metrics:
-        logger.info(f" Metrics: {list(report.evaluation_report.metrics.keys())}")
-    logger.info("=" * 60)
-
-    return report
-    except Exception as e:
-        logger.error("=" * 60)
-        logger.error(f"❌ Evaluation failed: {e}")
-        logger.error("=" * 60)
-        raise
+
+    execution = ExecutionSpec(
+        backend=execution_backend,
+        workers=workers,
+    )
+
+    storage_spec = StorageSpec(
+        backend=storage_backend,
+        path=storage,
+        cache=resume,
+    )
+
+    session = ExperimentSession()
+    report = session.run(spec, execution=execution, storage=storage_spec)
+
+    if num_samples > 1:
+        # vNext session does not yet wire repeated sampling; preserve expected behavior for now.
+        if report.generation_results:
+            record = report.generation_results[0]
+            record.attempts = [record] * num_samples
+            record.metrics["attempt_count"] = num_samples
+    return report
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
@@ -432,6 +337,22 @@ def _resolve_metrics(metric_names: list[str]) -> list:
         nlp_available = True
     except ImportError:
         nlp_available = False
+
+    # Code metrics (some optional dependencies)
+    try:
+        from themis.evaluation.metrics.code.execution import ExecutionAccuracy
+        from themis.evaluation.metrics.code.pass_at_k import PassAtK
+        code_metrics: dict[str, Any] = {
+            "pass_at_k": PassAtK,
+            "execution_accuracy": ExecutionAccuracy,
+        }
+        try:
+            from themis.evaluation.metrics.code.codebleu import CodeBLEU
+            code_metrics["codebleu"] = CodeBLEU
+        except ImportError:
+            pass
+    except ImportError:
+        code_metrics = {}
 
     # Built-in metrics registry
     BUILTIN_METRICS = {
@@ -451,25 +372,42 @@
         "bertscore": BERTScore,
         "meteor": METEOR,
     })
-
-
-    # "pass_at_k": PassAtK,
-    # "codebleu": CodeBLEU,
+
+    BUILTIN_METRICS.update(code_metrics)
 
     # Merge built-in and custom metrics
     # Custom metrics can override built-in metrics
     METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
 
+    def _normalize_metric_name(name: str) -> str | None:
+        raw = name.strip()
+        if raw in METRICS_REGISTRY:
+            return raw
+        lowered = raw.lower()
+        if lowered in METRICS_REGISTRY:
+            return lowered
+        for key in METRICS_REGISTRY.keys():
+            if key.lower() == lowered:
+                return key
+        # Convert CamelCase / PascalCase to snake_case
+        import re
+
+        snake = re.sub(r"(?<!^)(?=[A-Z])", "_", raw).lower()
+        if snake in METRICS_REGISTRY:
+            return snake
+        return None
+
     metrics = []
     for name in metric_names:
-
+        resolved = _normalize_metric_name(name)
+        if resolved is None:
             available = ", ".join(sorted(METRICS_REGISTRY.keys()))
            raise ValueError(
                 f"Unknown metric: {name}. "
                 f"Available metrics: {available}"
             )
 
-        metric_cls = METRICS_REGISTRY[
+        metric_cls = METRICS_REGISTRY[resolved]
         # Handle both class and lambda factory
         if callable(metric_cls) and not isinstance(metric_cls, type):
             metrics.append(metric_cls())
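Taken together, these hunks turn evaluate() from an inline orchestrator (model parsing, ModelSpec, GenerationPlan, ProviderRouter, ExperimentOrchestrator) into a thin wrapper that composes spec objects and delegates to ExperimentSession.run(). A sketch of calling the new API directly; the class names, keyword arguments, and session.run() signature are exactly those used in the hunk above, but the dataset rows, my_extractor, and my_metrics are placeholder assumptions:

from themis.evaluation.pipeline import EvaluationPipeline
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

# Placeholders: supply your own rows, answer extractor, and metric objects.
dataset = [{"id": "1", "question": "What is 2 + 2?", "answer": "4"}]
pipeline = EvaluationPipeline(extractor=my_extractor, metrics=my_metrics)

spec = ExperimentSpec(
    dataset=dataset,
    prompt="Q: {question}\nA:",
    model="openai/gpt-4o-mini",
    sampling={"temperature": 0.0, "top_p": 0.95, "max_tokens": 256},
    pipeline=pipeline,
    run_id="run-demo-001",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(backend=None, workers=8),  # None: presumably the default backend
    storage=StorageSpec(backend=None, path=".cache/experiments", cache=True),
)

Two smaller behavior changes ride along: _resolve_metrics() now normalizes metric names, so "PassAtK", "pass_at_k", and "PASS_AT_K" all resolve to the same registry key, and the code metrics (pass_at_k, execution_accuracy, plus codebleu when its optional dependency imports) are actually registered rather than left as commented-out entries.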
themis/backends/storage.py
CHANGED
@@ -203,6 +203,11 @@ class LocalFileStorageBackend(StorageBackend):
         """
         from themis.experiment.storage import ExperimentStorage
         self._storage = ExperimentStorage(storage_path)
+
+    @property
+    def experiment_storage(self):
+        """Expose underlying ExperimentStorage for compatibility."""
+        return self._storage
 
     def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
         """Save run metadata."""
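This pairs with the new storage_backend parameter on evaluate(): code handed a LocalFileStorageBackend adapter can now reach the wrapped ExperimentStorage through a public property rather than the private _storage attribute. A hypothetical caller — the constructor argument is inferred from the storage_path name in the context lines above:

from themis.backends.storage import LocalFileStorageBackend

backend = LocalFileStorageBackend(".cache/experiments")  # assumed signature
legacy_storage = backend.experiment_storage              # wrapped ExperimentStorage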
themis/cli/commands/info.py
CHANGED
@@ -44,21 +44,12 @@ def show_info() -> int:
         print(f" ✓ {bench}")
 
     print("\n📁 Example Locations:")
-    examples_dir = Path(themis.__file__).parent.parent / "examples"
+    examples_dir = Path(themis.__file__).parent.parent / "examples-simple"
     if examples_dir.exists():
         print(f" {examples_dir}")
-        example_dirs = sorted(
-            [
-                d.name
-                for d in examples_dir.iterdir()
-                if d.is_dir() and not d.name.startswith("_")
-            ]
-        )
-        for ex in example_dirs:
-            print(f" • {ex}/")
 
     print("\n📚 Documentation:")
-    print(" examples/README.md -
+    print(" examples-simple/README.md - vNext example scripts")
     print(" COOKBOOK.md - Quick reference guide")
     print(" docs/ - Detailed documentation")
 