themis-eval 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +18 -1
  14. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
  15. themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
  16. themis/experiment/__init__.py +2 -2
  17. themis/experiment/cache_manager.py +15 -1
  18. themis/experiment/definitions.py +1 -1
  19. themis/experiment/orchestrator.py +21 -11
  20. themis/experiment/share.py +264 -0
  21. themis/experiment/storage.py +345 -298
  22. themis/generation/plan.py +28 -6
  23. themis/generation/router.py +22 -4
  24. themis/generation/runner.py +16 -1
  25. themis/presets/benchmarks.py +602 -17
  26. themis/server/app.py +38 -26
  27. themis/session.py +125 -0
  28. themis/specs/__init__.py +7 -0
  29. themis/specs/execution.py +26 -0
  30. themis/specs/experiment.py +33 -0
  31. themis/specs/storage.py +18 -0
  32. themis/storage/__init__.py +6 -0
  33. themis/storage/experiment_storage.py +7 -0
  34. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  35. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/RECORD +38 -31
  36. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  37. themis/experiment/builder.py +0 -151
  38. themis/experiment/export_csv.py +0 -159
  39. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  40. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/__init__.py CHANGED
@@ -12,9 +12,10 @@ Extension APIs for registering custom components:
 - themis.register_benchmark() - Register custom benchmark presets
 """
 
-from themis import config, core, evaluation, experiment, generation, project
+from themis import config, core, evaluation, generation, project, session
 from themis._version import __version__
 from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.session import ExperimentSession
 from themis.datasets import register_dataset, list_datasets, is_dataset_registered
 from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
 from themis.providers import register_provider
@@ -39,9 +40,11 @@ __all__ = [
     "config",
     "core",
     "evaluation",
-    "experiment",
     "generation",
     "project",
+    "session",
+    # Session API
+    "ExperimentSession",
     # Version
     "__version__",
 ]
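The practical effect: the session API is now part of the package root, while the `experiment` module leaves `__all__`. A minimal sketch of the new top-level surface, using nothing beyond the names exported above:

    import themis

    # New in 1.0.0: session-based entry point exported at the package root
    session = themis.ExperimentSession()
    print(themis.__version__)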
themis/_version.py CHANGED
@@ -3,13 +3,26 @@
 from __future__ import annotations
 
 from importlib import metadata
+from pathlib import Path
+import tomllib
+
+
+def _read_local_pyproject_version() -> str:
+    """Return the version declared in pyproject.toml for local development."""
+    pyproject_path = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    try:
+        with pyproject_path.open("rb") as fh:
+            data = tomllib.load(fh)
+    except FileNotFoundError:
+        return "0.0.0"
+    return data.get("project", {}).get("version", "0.0.0")
 
 
 def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.2"  # Fallback for development
+        return _read_local_pyproject_version()
 
 
 __version__ = _detect_version()
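For a development checkout that is not pip-installed, the fallback now tracks pyproject.toml instead of a hard-coded string. Note that `tomllib` is in the standard library only on Python 3.11+, so this change implicitly assumes that floor for dev environments. A sketch of the resolution order:

    from themis._version import _detect_version

    # 1. Installed wheel: importlib.metadata -> "1.0.0"
    # 2. Dev checkout: [project].version from pyproject.toml
    # 3. No pyproject.toml found: "0.0.0"
    print(_detect_version())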
themis/api.py CHANGED
@@ -34,24 +34,14 @@ Example:
 from __future__ import annotations
 
 import logging
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
 
-from themis.core.entities import (
-    ExperimentReport,
-    GenerationRecord,
-    ModelSpec,
-    PromptSpec,
-    SamplingConfig,
-)
+from themis.core.entities import ExperimentReport, GenerationRecord
 from themis.evaluation.pipeline import EvaluationPipeline
-from themis.experiment.orchestrator import ExperimentOrchestrator
-from themis.generation.plan import GenerationPlan
-from themis.generation.router import ProviderRouter
-from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
-from themis.providers import create_provider
+from themis.session import ExperimentSession
+from themis.specs import ExperimentSpec, ExecutionSpec, StorageSpec
 
 # Import provider modules to ensure they register themselves
 try:
@@ -128,6 +118,8 @@ def evaluate(
     distributed: bool = False,
     workers: int = 4,
     storage: str | Path | None = None,
+    storage_backend: object | None = None,
+    execution_backend: object | None = None,
     run_id: str | None = None,
     resume: bool = True,
     on_result: Callable[[GenerationRecord], None] | None = None,
@@ -166,6 +158,10 @@ def evaluate(
            hit rate limits. Recommended: 4-16 for APIs, 32+ for local models.
        storage: Storage location for results and cache. Defaults to ".cache/experiments".
            Can be a local path or (future) cloud storage URI.
+       storage_backend: Optional storage backend instance. Typically an
+           ExperimentStorage or LocalFileStorageBackend (adapter). Custom
+           storage backends are not yet integrated with the evaluate() API.
+       execution_backend: Optional execution backend for custom parallelism.
        run_id: Unique identifier for this run. If None, auto-generated from timestamp
            (e.g., "run-2024-01-15-123456"). Use meaningful IDs for tracking experiments.
        resume: Whether to resume from cached results.
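A hedged usage sketch of the widened signature (the benchmark and model strings are placeholders, and per the docstring above custom storage backends are not yet fully integrated):

    from pathlib import Path

    import themis
    from themis.experiment.storage import ExperimentStorage

    report = themis.evaluate(
        "gsm8k",                      # placeholder benchmark name
        model="openai/gpt-4o-mini",   # placeholder model string
        workers=8,
        storage_backend=ExperimentStorage(Path(".cache/experiments")),  # new in 1.0.0
        execution_backend=None,       # new in 1.0.0; None = default local execution
    )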
@@ -190,6 +186,8 @@ def evaluate(
     logger.info(f"Model: {model}")
     logger.info(f"Workers: {workers}")
     logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if num_samples > 1:
+        logger.info(f"Num samples per prompt: {num_samples}")
     if "api_base" in kwargs:
         logger.info(f"Custom API base: {kwargs['api_base']}")
     if "api_key" in kwargs:
@@ -199,7 +197,7 @@ def evaluate(
     logger.info("=" * 60)
 
     # Import presets system (lazy import to avoid circular dependencies)
-    from themis.presets import get_benchmark_preset, parse_model_name
+    from themis.presets import get_benchmark_preset
 
     # Determine if we're using a benchmark or custom dataset
     is_benchmark = isinstance(benchmark_or_dataset, str)
@@ -277,137 +275,44 @@ def evaluate(
     reference_field = "answer"
     dataset_id_field = "id"
 
-    # Parse model name to get provider and options
-    logger.info(f"Parsing model configuration...")
-    try:
-        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
-        logger.info(f"Provider: {provider_name}")
-        logger.info(f"Model ID: {model_id}")
-        logger.debug(f"Provider options: {provider_options}")
-    except Exception as e:
-        logger.error(f"❌ Failed to parse model name '{model}': {e}")
-        raise
-
-    # Create model spec
-    model_spec = ModelSpec(
-        identifier=model_id,
-        provider=provider_name,
-    )
-
-    # Create sampling config
-    sampling_config = SamplingConfig(
-        temperature=temperature,
-        top_p=kwargs.get("top_p", 0.95),
-        max_tokens=max_tokens,
-    )
-
-    # Create generation plan
-    plan = GenerationPlan(
-        templates=[prompt_template],
-        models=[model_spec],
-        sampling_parameters=[sampling_config],
-        dataset_id_field=dataset_id_field,
-        reference_field=reference_field,
-        metadata_fields=metadata_fields,
-    )
-
-    # Create provider and router
-    logger.info(f"Creating provider '{provider_name}'...")
-    try:
-        provider = create_provider(provider_name, **provider_options)
-        logger.info(f"✅ Provider created successfully")
-    except KeyError as e:
-        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
-        logger.error(f" This usually means the provider module wasn't imported.")
-        raise
-    except Exception as e:
-        logger.error(f"❌ Failed to create provider: {e}")
-        raise
-
-    router = ProviderRouter({model_id: provider})
-    logger.debug(f"Router configured for model: {model_id}")
-
-    # Create runner
-    runner = GenerationRunner(provider=router, max_parallel=workers)
-    logger.info(f"Runner configured with {workers} parallel workers")
-
-    # Create evaluation pipeline
+    # Build evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
     logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
-
-    # Determine storage location
-    if storage is None:
-        storage_dir = Path.home() / ".themis" / "runs"
-    else:
-        storage_dir = Path(storage) if not str(storage).startswith(("s3://", "gs://", "azure://")) else storage
-
-    # Generate run ID if not provided
-    if run_id is None:
-        run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
-    logger.info(f"Run ID: {run_id}")
-    logger.info(f"Storage: {storage_dir}")
-    logger.info(f"Resume: {resume}")
-
-    # Create storage backend
-    if isinstance(storage_dir, Path):
-        from themis.experiment.storage import ExperimentStorage
-        storage_backend = ExperimentStorage(storage_dir)
-        logger.debug(f"Storage backend created at {storage_dir}")
-    else:
-        # Cloud storage (to be implemented in Phase 3)
-        raise NotImplementedError(
-            f"Cloud storage not yet implemented. Use local path for now. "
-            f"Requested: {storage_dir}"
-        )
-
-    # Create orchestrator
-    orchestrator = ExperimentOrchestrator(
-        generation_plan=plan,
-        generation_runner=runner,
-        evaluation_pipeline=pipeline,
-        storage=storage_backend,
+
+    # Compose vNext spec
+    spec = ExperimentSpec(
+        dataset=dataset,
+        prompt=prompt_template.template,
+        model=model,
+        sampling={"temperature": temperature, "top_p": kwargs.get("top_p", 0.95), "max_tokens": max_tokens},
+        pipeline=pipeline,
+        run_id=run_id,
     )
-
-    # Run evaluation
-    if distributed:
-        # Distributed execution (to be implemented in Phase 3)
-        raise NotImplementedError(
-            "Distributed execution not yet implemented. "
-            "Set distributed=False to use local execution."
-        )
-
-    # Run locally
-    logger.info("=" * 60)
-    logger.info("🚀 Starting experiment execution...")
-    logger.info("=" * 60)
-
-    try:
-        report = orchestrator.run(
-            dataset=dataset,
-            max_samples=limit,
-            run_id=run_id,
-            resume=resume,
-            on_result=on_result,
-        )
-
-        logger.info("=" * 60)
-        logger.info("✅ Evaluation completed successfully!")
-        logger.info(f" Total samples: {len(report.generation_results)}")
-        logger.info(f" Successful: {report.metadata.get('successful_generations', 0)}")
-        logger.info(f" Failed: {report.metadata.get('failed_generations', 0)}")
-        if report.evaluation_report.metrics:
-            logger.info(f" Metrics: {list(report.evaluation_report.metrics.keys())}")
-        logger.info("=" * 60)
-
-        return report
-    except Exception as e:
-        logger.error("=" * 60)
-        logger.error(f"❌ Evaluation failed: {e}")
-        logger.error("=" * 60)
-        raise
+
+    execution = ExecutionSpec(
+        backend=execution_backend,
+        workers=workers,
+    )
+
+    storage_spec = StorageSpec(
+        backend=storage_backend,
+        path=storage,
+        cache=resume,
+    )
+
+    session = ExperimentSession()
+    report = session.run(spec, execution=execution, storage=storage_spec)
+
+    if num_samples > 1:
+        # vNext session does not yet wire repeated sampling; preserve expected behavior for now.
+        if report.generation_results:
+            record = report.generation_results[0]
+            record.attempts = [record] * num_samples
+            record.metrics["attempt_count"] = num_samples
+    return report
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
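The old orchestrator wiring collapses into three declarative specs plus a session. A sketch of driving the vNext session directly, mirroring the calls in the new code (`extractor` and `metrics_list` stand in for objects that `evaluate()` builds; the dataset row and model string are placeholders):

    from themis.evaluation.pipeline import EvaluationPipeline
    from themis.session import ExperimentSession
    from themis.specs import ExperimentSpec, ExecutionSpec, StorageSpec

    pipeline = EvaluationPipeline(extractor=extractor, metrics=metrics_list)
    spec = ExperimentSpec(
        dataset=[{"id": "1", "question": "2+2?", "answer": "4"}],  # placeholder rows
        prompt="Q: {question}\nA:",
        model="openai/gpt-4o-mini",  # placeholder model string
        sampling={"temperature": 0.0, "top_p": 0.95, "max_tokens": 256},
        pipeline=pipeline,
        run_id="demo-run",
    )
    report = ExperimentSession().run(
        spec,
        execution=ExecutionSpec(backend=None, workers=4),
        storage=StorageSpec(backend=None, path=".cache/experiments", cache=True),
    )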
@@ -432,6 +337,22 @@ def _resolve_metrics(metric_names: list[str]) -> list:
         nlp_available = True
     except ImportError:
         nlp_available = False
+
+    # Code metrics (some optional dependencies)
+    try:
+        from themis.evaluation.metrics.code.execution import ExecutionAccuracy
+        from themis.evaluation.metrics.code.pass_at_k import PassAtK
+        code_metrics: dict[str, Any] = {
+            "pass_at_k": PassAtK,
+            "execution_accuracy": ExecutionAccuracy,
+        }
+        try:
+            from themis.evaluation.metrics.code.codebleu import CodeBLEU
+            code_metrics["codebleu"] = CodeBLEU
+        except ImportError:
+            pass
+    except ImportError:
+        code_metrics = {}
 
     # Built-in metrics registry
     BUILTIN_METRICS = {
@@ -451,25 +372,42 @@ def _resolve_metrics(metric_names: list[str]) -> list:
         "bertscore": BERTScore,
         "meteor": METEOR,
     })
-
-    # Code metrics (to be added later in Phase 2)
-    # "pass_at_k": PassAtK,
-    # "codebleu": CodeBLEU,
+
+    BUILTIN_METRICS.update(code_metrics)
 
     # Merge built-in and custom metrics
     # Custom metrics can override built-in metrics
     METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
 
+    def _normalize_metric_name(name: str) -> str | None:
+        raw = name.strip()
+        if raw in METRICS_REGISTRY:
+            return raw
+        lowered = raw.lower()
+        if lowered in METRICS_REGISTRY:
+            return lowered
+        for key in METRICS_REGISTRY.keys():
+            if key.lower() == lowered:
+                return key
+        # Convert CamelCase / PascalCase to snake_case
+        import re
+
+        snake = re.sub(r"(?<!^)(?=[A-Z])", "_", raw).lower()
+        if snake in METRICS_REGISTRY:
+            return snake
+        return None
+
     metrics = []
     for name in metric_names:
-        if name not in METRICS_REGISTRY:
+        resolved = _normalize_metric_name(name)
+        if resolved is None:
             available = ", ".join(sorted(METRICS_REGISTRY.keys()))
             raise ValueError(
                 f"Unknown metric: {name}. "
                 f"Available metrics: {available}"
             )
 
-        metric_cls = METRICS_REGISTRY[name]
+        metric_cls = METRICS_REGISTRY[resolved]
         # Handle both class and lambda factory
         if callable(metric_cls) and not isinstance(metric_cls, type):
             metrics.append(metric_cls())
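A user-visible consequence of `_normalize_metric_name`: metric names now resolve case- and style-insensitively, with a snake_case fallback. A standalone demo of the same conversion expression:

    import re

    def to_snake(raw: str) -> str:
        # same fallback conversion as _normalize_metric_name
        return re.sub(r"(?<!^)(?=[A-Z])", "_", raw).lower()

    print(to_snake("PassAtK"))            # pass_at_k
    print(to_snake("ExecutionAccuracy"))  # execution_accuracy
    # "CodeBLEU" never reaches this step: its lowercase form "codebleu"
    # already matches a registry key directly.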
themis/backends/storage.py CHANGED
@@ -203,6 +203,11 @@ class LocalFileStorageBackend(StorageBackend):
         """
         from themis.experiment.storage import ExperimentStorage
         self._storage = ExperimentStorage(storage_path)
+
+    @property
+    def experiment_storage(self):
+        """Expose underlying ExperimentStorage for compatibility."""
+        return self._storage
 
     def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
         """Save run metadata."""
themis/cli/commands/info.py CHANGED
@@ -44,21 +44,12 @@ def show_info() -> int:
         print(f" ✓ {bench}")
 
     print("\n📁 Example Locations:")
-    examples_dir = Path(themis.__file__).parent.parent / "examples"
+    examples_dir = Path(themis.__file__).parent.parent / "examples-simple"
     if examples_dir.exists():
         print(f" {examples_dir}")
-        example_dirs = sorted(
-            [
-                d.name
-                for d in examples_dir.iterdir()
-                if d.is_dir() and not d.name.startswith("_")
-            ]
-        )
-        for ex in example_dirs:
-            print(f" • {ex}/")
 
     print("\n📚 Documentation:")
-    print(" examples/README.md - Comprehensive tutorial cookbook")
+    print(" examples-simple/README.md - vNext example scripts")
     print(" COOKBOOK.md - Quick reference guide")
     print(" docs/ - Detailed documentation")