themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""vLLM provider using AsyncLLMEngine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any, Dict, List
|
|
9
|
+
|
|
10
|
+
from themis.core import entities as core_entities
|
|
11
|
+
from themis.interfaces import ModelProvider
|
|
12
|
+
from themis.providers import register_provider
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class VLLMProvider(ModelProvider):
    """Serve generations from locally hosted vLLM engines.

    One engine is built per group of ``tensor_parallel_size`` visible GPUs
    (see ``_determine_engine_count``); requests are spread across engines
    round-robin, with at most ``max_parallel`` generations in flight.
    """

    def __init__(
        self,
        *,
        model: str,
        tensor_parallel_size: int = 1,
        max_parallel: int = 2,
        engine_kwargs: Dict[str, Any] | None = None,
    ) -> None:
        """Create the provider and eagerly instantiate its engines.

        Args:
            model: Model identifier passed through to vLLM.
            tensor_parallel_size: GPUs per engine (clamped to >= 1).
            max_parallel: Max concurrent ``generate`` calls (clamped to >= 1).
            engine_kwargs: Extra keyword arguments forwarded to the engine.
        """
        self._model_name = model
        # Clamp so a zero/negative config value cannot break engine setup.
        self._tp_size = max(1, tensor_parallel_size)
        self._max_parallel = max(1, max_parallel)
        self._engine_kwargs = engine_kwargs or {}
        self._engines = self._create_engines()
        self._engine_lock = threading.Lock()  # guards the round-robin index
        self._rr_index = 0
        # Bounds the number of concurrent generate() calls across threads.
        self._semaphore = threading.Semaphore(self._max_parallel)

    def generate(
        self, task: core_entities.GenerationTask
    ) -> core_entities.GenerationRecord:  # type: ignore[override]
        """Run one generation synchronously and wrap the result in a record."""
        with self._semaphore:
            engine = self._select_engine()
            # NOTE(review): asyncio.run() creates a fresh event loop per call;
            # this assumes generate() is never invoked from a running loop.
            text, raw = asyncio.run(self._run_generation(engine, task))
            # Surface per-request stats, but drop the raw streamed chunks.
            metrics = {k: v for k, v in raw.items() if k != "chunks"}
            return core_entities.GenerationRecord(
                task=task,
                output=core_entities.ModelOutput(text=text, raw=raw),
                error=None,
                metrics=metrics,
            )

    async def _run_generation(self, engine, task: core_entities.GenerationTask):
        """Stream a completion from *engine*; return ``(final_text, metrics)``."""
        SamplingParams = self._sampling_params_cls
        sampling_params = SamplingParams(
            temperature=task.sampling.temperature,
            top_p=task.sampling.top_p,
            # Negative max_tokens is treated as "no limit".
            max_tokens=None
            if task.sampling.max_tokens < 0
            else task.sampling.max_tokens,
        )
        dataset_id = task.metadata.get("dataset_id", "sample")
        # time_ns() keeps request ids unique across concurrent calls.
        request_id = f"themis-{dataset_id}-{time.time_ns()}"
        chunks: List[str] = []
        tokenizer = getattr(engine, "tokenizer", None)
        async for output in engine.generate(
            prompt=task.prompt.text,
            sampling_params=sampling_params,
            request_id=request_id,
        ):
            if output.outputs:
                chunks.append(output.outputs[0].text)
        # Assumes vLLM streams cumulative text, so the last chunk is the full
        # output -- TODO confirm against the AsyncLLMEngine streaming docs.
        final_text = chunks[-1] if chunks else ""
        metrics = {"chunks": chunks}
        if tokenizer is not None:
            try:
                metrics["prompt_tokens"] = len(tokenizer.encode(task.prompt.text))
                metrics["response_tokens"] = len(tokenizer.encode(final_text))
            except Exception:  # pragma: no cover
                # Token counting is best-effort; never fail the generation for it.
                pass
        return final_text, metrics

    def _select_engine(self):
        """Pick the next engine round-robin (thread-safe via the lock)."""
        with self._engine_lock:
            engine = self._engines[self._rr_index]
            self._rr_index = (self._rr_index + 1) % len(self._engines)
            return engine

    def _create_engines(self):
        """Instantiate one vLLM engine per engine slot."""
        AsyncLLMEngine, SamplingParams = self._load_vllm_classes()
        # Cache the class so _run_generation can build params without re-importing.
        self._sampling_params_cls = SamplingParams
        engine_count = self._determine_engine_count()
        engines = []
        for idx in range(engine_count):
            # NOTE(review): recent vLLM versions build engines via
            # AsyncLLMEngine.from_engine_args(AsyncEngineArgs(...)); confirm
            # this direct constructor call works with the pinned vLLM version.
            engine = AsyncLLMEngine(
                model=self._model_name,
                tensor_parallel_size=self._tp_size,
                **self._engine_kwargs,
            )
            engines.append(engine)
        return engines

    def _determine_engine_count(self) -> int:
        """Return how many engines fit the visible GPUs (fallback: 1)."""
        device_count = 0
        try:
            import torch

            if torch.cuda.is_available():
                device_count = torch.cuda.device_count()
        except ImportError:
            device_count = 0
        # Only shard into multiple engines when GPUs divide evenly by TP size.
        if device_count and device_count % self._tp_size == 0:
            return max(1, device_count // self._tp_size)
        return 1

    def count_tokens(self, text: str) -> int | None:
        """Best-effort token count using the first engine's tokenizer.

        Returns ``None`` when no tokenizer is available or encoding fails.
        """
        tokenizer = (
            getattr(self._engines[0], "tokenizer", None) if self._engines else None
        )
        if tokenizer is None:
            return None
        try:
            return len(tokenizer.encode(text))
        except Exception:
            return None

    @staticmethod
    def _load_vllm_classes():
        """Import vLLM lazily so it stays an optional dependency."""
        try:
            from vllm import AsyncLLMEngine, SamplingParams
        except ImportError as exc:  # pragma: no cover - optional dep
            raise RuntimeError(
                "vLLM is not installed. Install via `pip install vllm` to use VLLMProvider."
            ) from exc
        return AsyncLLMEngine, SamplingParams
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Register at import time so configuration can refer to this provider as "vllm".
register_provider("vllm", VLLMProvider)


__all__ = ["VLLMProvider"]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Utility router mapping generation tasks to providers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Mapping
|
|
6
|
+
|
|
7
|
+
from themis.core import entities as core_entities
|
|
8
|
+
from themis.interfaces import ModelProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ProviderRouter(ModelProvider):
    """Route each generation task to the provider registered for its model id."""

    def __init__(self, providers: Mapping[str, ModelProvider]):
        # Snapshot the mapping so later caller-side mutation cannot change routing.
        self._providers = dict(providers)

    def generate(
        self, task: core_entities.GenerationTask
    ) -> core_entities.GenerationRecord:  # type: ignore[override]
        """Delegate *task* to its provider; raise if the model has none."""
        model_id = task.model.identifier
        target = self._providers.get(model_id)
        if target is not None:
            return target.generate(task)
        registered = ", ".join(sorted(self._providers)) or "<none>"
        raise RuntimeError(
            f"No provider registered for model '{model_id}'. "
            f"Known providers: {registered}."
        )

    @property
    def providers(self) -> Mapping[str, ModelProvider]:
        """The provider mapping used for dispatch."""
        return self._providers
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Public API of this module.
__all__ = ["ProviderRouter"]
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Generation runner primitives."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
|
+
from typing import Callable, Iterable, Iterator, List
|
|
9
|
+
|
|
10
|
+
from themis.core import entities as core_entities
|
|
11
|
+
from themis.generation import strategies
|
|
12
|
+
from themis.interfaces import ModelProvider
|
|
13
|
+
from themis.utils import tracing
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GenerationRunner:
    """Delegates generation tasks to an injected provider with strategy support.

    Per task: a ``GenerationStrategy`` expands it into one or more attempt
    tasks, each attempt is run with retry + exponential backoff, and the
    attempt records are aggregated back into a single result record.
    """

    def __init__(
        self,
        *,
        provider: ModelProvider,
        strategy_resolver: Callable[
            [core_entities.GenerationTask], strategies.GenerationStrategy
        ]
        | None = None,
        max_parallel: int = 1,
        max_retries: int = 3,
        retry_initial_delay: float = 0.5,
        retry_backoff_multiplier: float = 2.0,
        retry_max_delay: float | None = 2.0,
    ) -> None:
        """Configure the runner.

        Args:
            provider: Backend that actually produces generations.
            strategy_resolver: Maps a task to the strategy that expands and
                aggregates its attempts; defaults to one attempt per task.
            max_parallel: Number of tasks executed concurrently (threads).
            max_retries: Provider calls per attempt before recording failure.
            retry_initial_delay: Seconds to wait before the first retry.
            retry_backoff_multiplier: Factor applied to the delay per retry.
            retry_max_delay: Cap on the backoff delay; ``None`` means uncapped.
        """
        self._provider = provider
        # Default: every task runs exactly once (no resampling).
        self._strategy_resolver = strategy_resolver or (
            lambda task: strategies.SingleAttemptStrategy()
        )
        # Clamp all knobs so misconfiguration cannot disable the runner.
        self._max_parallel = max(1, max_parallel)
        self._max_retries = max(1, int(max_retries))
        self._retry_initial_delay = max(0.0, retry_initial_delay)
        self._retry_backoff_multiplier = max(1.0, retry_backoff_multiplier)
        self._retry_max_delay = (
            retry_max_delay if retry_max_delay is None else max(0.0, retry_max_delay)
        )

    def run(
        self, tasks: Iterable[core_entities.GenerationTask]
    ) -> Iterator[core_entities.GenerationRecord]:
        """Yield one aggregated record per task, preserving input order."""
        task_list = list(tasks)
        if not task_list:
            return
        if self._max_parallel <= 1:
            # Sequential fast path avoids thread-pool overhead entirely.
            for task in task_list:
                yield self._execute_task(task)
            return

        with ThreadPoolExecutor(max_workers=self._max_parallel) as executor:
            futures = [executor.submit(self._execute_task, task) for task in task_list]
            # Consume futures in submission order so output order matches input.
            for future in futures:
                yield future.result()

    def _run_single_attempt(
        self, task: core_entities.GenerationTask
    ) -> core_entities.GenerationRecord:
        """Invoke the provider with retry + exponential backoff.

        Returns the first successful record (annotated with attempt metrics),
        or a synthesized failure record once all retries are exhausted.
        """
        attempt_errors: List[dict[str, object]] = []
        last_error: Exception | None = None
        delay = self._retry_initial_delay
        task_label = task.metadata.get("dataset_id") or task.prompt.template_name
        for attempt in range(1, self._max_retries + 1):
            try:
                logger.debug(
                    "Starting generation for %s attempt %s/%s",
                    task_label,
                    attempt,
                    self._max_retries,
                )
                record = self._invoke_provider(task)
                record.metrics["generation_attempts"] = attempt
                if attempt_errors:
                    # Preserve earlier failures for post-hoc debugging.
                    record.metrics.setdefault("retry_errors", attempt_errors)
                logger.debug("Completed %s in %s attempt(s)", task_label, attempt)
                return record
            except Exception as exc:  # pragma: no cover - defensive path
                last_error = exc
                logger.warning(
                    "Attempt %s/%s for %s failed: %s",
                    attempt,
                    self._max_retries,
                    task_label,
                    exc,
                )
                attempt_errors.append(
                    {
                        "attempt": attempt,
                        "error": str(exc),
                        "exception_type": exc.__class__.__name__,
                    }
                )
                if attempt >= self._max_retries:
                    break
                # Sleep with the current delay, then grow it for the next round.
                if delay > 0:
                    time.sleep(delay)
                delay = self._next_delay(delay)

        return self._build_failure_record(task, attempt_errors, last_error)

    def _invoke_provider(
        self, task: core_entities.GenerationTask
    ) -> core_entities.GenerationRecord:
        """Call the provider once and attach timing/size metrics to the record."""
        start = time.perf_counter()

        with tracing.span("provider_generate", model=task.model.identifier):
            record = self._provider.generate(task)

        elapsed_ms = (time.perf_counter() - start) * 1000
        # setdefault: metrics supplied by the provider always win over estimates.
        record.metrics.setdefault("generation_time_ms", elapsed_ms)
        record.metrics.setdefault("prompt_chars", len(task.prompt.text))
        prompt_tokens = record.metrics.get("prompt_tokens")
        if prompt_tokens is None:
            prompt_tokens = self._count_tokens(task.prompt.text)
        if prompt_tokens is None:
            # Crude whitespace fallback when no tokenizer is available.
            prompt_tokens = len(task.prompt.text.split())
        record.metrics["prompt_tokens"] = prompt_tokens
        if record.output:
            record.metrics.setdefault("response_chars", len(record.output.text))
            response_tokens = record.metrics.get("response_tokens")
            if response_tokens is None:
                response_tokens = self._count_tokens(record.output.text)
            if response_tokens is None:
                response_tokens = len(record.output.text.split())
            record.metrics["response_tokens"] = response_tokens
        return record

    def _next_delay(self, previous_delay: float) -> float:
        """Compute the next backoff delay, honoring the optional cap."""
        if previous_delay <= 0:
            next_delay = self._retry_initial_delay
        else:
            next_delay = previous_delay * self._retry_backoff_multiplier
        if self._retry_max_delay is not None:
            next_delay = min(next_delay, self._retry_max_delay)
        return next_delay

    def _build_failure_record(
        self,
        task: core_entities.GenerationTask,
        attempt_errors: List[dict[str, object]],
        last_error: Exception | None,
    ) -> core_entities.GenerationRecord:
        """Synthesize an error record after all retries have failed."""
        attempts = len(attempt_errors) or 1
        cause = str(last_error) if last_error else "unknown error"
        message = (
            f"Generation failed for model '{task.model.identifier}' "
            f"after {attempts} attempt(s): {cause}"
        )
        logger.error(
            "All attempts failed for %s after %s tries",
            task.metadata.get("dataset_id") or task.prompt.template_name,
            attempts,
            exc_info=last_error,
        )
        return core_entities.GenerationRecord(
            task=task,
            output=None,
            error=core_entities.ModelError(
                message=message,
                kind="provider_error",
                details={
                    "attempts": attempt_errors,
                    "model": task.model.identifier,
                    "provider": task.model.provider,
                },
            ),
            metrics={"generation_attempts": attempts, "retry_errors": attempt_errors},
        )

    def _execute_task(
        self, task: core_entities.GenerationTask
    ) -> core_entities.GenerationRecord:
        """Expand, run, and aggregate a single task via its strategy."""
        task_id = task.metadata.get("dataset_id", "unknown")
        model_id = task.model.identifier

        with tracing.span("execute_task", task_id=task_id, model=model_id):
            strategy = self._strategy_resolver(task)
            attempt_records: List[core_entities.GenerationRecord] = []

            with tracing.span("expand_strategy"):
                expansion = list(strategy.expand(task))

            for attempt_task in expansion:
                with tracing.span("run_attempt"):
                    attempt_records.append(self._run_single_attempt(attempt_task))

            with tracing.span("aggregate_strategy"):
                aggregated = strategy.aggregate(task, attempt_records)

            # Keep raw per-attempt records available to downstream consumers.
            aggregated.attempts = attempt_records
            return aggregated

    def _count_tokens(self, text: str) -> int | None:
        """Ask the provider for a token count, if it exposes ``count_tokens``."""
        counter = getattr(self._provider, "count_tokens", None)
        if callable(counter):
            try:
                return int(counter(text))
            except Exception:  # pragma: no cover - tokenization failure
                return None
        return None
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Generation strategy interfaces and default implementations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Iterable, List, Protocol
|
|
7
|
+
|
|
8
|
+
from themis.core import entities as core_entities
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GenerationStrategy(Protocol):
    """Strategy responsible for expanding a task into one or more execution attempts."""

    def expand(
        self, task: core_entities.GenerationTask
    ) -> Iterable[core_entities.GenerationTask]:  # pragma: no cover - interface
        """Return the attempt tasks to execute for *task*."""
        ...

    def aggregate(
        self,
        task: core_entities.GenerationTask,
        records: List[core_entities.GenerationRecord],
    ) -> core_entities.GenerationRecord:  # pragma: no cover - interface
        """Combine per-attempt *records* into one record for *task*."""
        ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SingleAttemptStrategy:
    """Issue exactly one attempt and forward its result unchanged."""

    def expand(
        self, task: core_entities.GenerationTask
    ) -> Iterable[core_entities.GenerationTask]:
        # One attempt: the expansion is just the original task.
        return [task]

    def aggregate(
        self,
        task: core_entities.GenerationTask,
        records: List[core_entities.GenerationRecord],
    ) -> core_entities.GenerationRecord:
        # Re-wrap the sole attempt so the result references the original task
        # and carries its own copy of the metrics.
        sole = records[0]
        return core_entities.GenerationRecord(
            task=task,
            output=sole.output,
            error=sole.error,
            metrics=dict(sole.metrics),
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class RepeatedSamplingStrategy:
    """Run the same task several times to enable test-time scaling."""

    attempts: int
    metadata_label: str = "attempts"

    def expand(
        self, task: core_entities.GenerationTask
    ) -> Iterable[core_entities.GenerationTask]:
        for attempt_index in range(self.attempts):
            # Tag each copy with its attempt index so attempts stay distinguishable.
            attempt_metadata = {**task.metadata, self.metadata_label: attempt_index}
            yield core_entities.GenerationTask(
                prompt=task.prompt,
                model=task.model,
                sampling=task.sampling,
                metadata=attempt_metadata,
                reference=task.reference,
            )

    def aggregate(
        self,
        task: core_entities.GenerationTask,
        records: List[core_entities.GenerationRecord],
    ) -> core_entities.GenerationRecord:
        # Prefer the first error-free attempt; otherwise fall back to attempt #1.
        chosen = next((rec for rec in records if not rec.error), records[0])
        result = core_entities.GenerationRecord(
            task=task,
            output=chosen.output,
            error=chosen.error,
            metrics=dict(chosen.metrics),
        )
        result.metrics["attempt_count"] = len(records)
        result.metrics["attempt_outcomes"] = [
            {
                "output": rec.output.text if rec.output else None,
                "error": rec.error.message if rec.error else None,
            }
            for rec in records
        ]
        return result
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Public API of this module.
__all__ = [
    "GenerationStrategy",
    "SingleAttemptStrategy",
    "RepeatedSamplingStrategy",
]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Prompt template primitives for Themis generation domain."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from itertools import product
|
|
7
|
+
from typing import Any, Dict, Iterable, List
|
|
8
|
+
|
|
9
|
+
from themis.core import entities as core_entities
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TemplateRenderingError(RuntimeError):
    """Raised when a prompt template cannot be rendered.

    Triggered when the rendering context is missing a placeholder that the
    template's format string references.
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class PromptTemplate:
    """A named ``str.format`` template plus optional metadata."""

    name: str
    template: str
    metadata: Dict[str, Any] | None = None

    def __post_init__(self) -> None:
        # Build the spec once; every rendered prompt references the same one.
        self._spec = core_entities.PromptSpec(
            name=self.name,
            template=self.template,
            metadata=dict(self.metadata or {}),
        )

    def render(self, **kwargs: Any) -> str:
        """Format the template, raising ``TemplateRenderingError`` when a
        placeholder has no matching keyword argument."""
        try:
            return self.template.format(**kwargs)
        except KeyError as exc:  # pragma: no cover - defensive path
            raise TemplateRenderingError(
                f"Missing template variable: {exc.args[0]}"
            ) from exc

    def expand_variants(
        self,
        *,
        base_context: Dict[str, Any],
        variant_values: Dict[str, Iterable[Any]],
    ) -> List[core_entities.PromptRender]:
        """Generate prompts for the cross-product of variant fields."""
        if not variant_values:
            return [self._render_context(base_context)]

        # Sort field names so the variant ordering is deterministic.
        field_names = sorted(variant_values)
        rendered: list[core_entities.PromptRender] = []
        for values in product(*(variant_values[name] for name in field_names)):
            combo_context = {**base_context, **dict(zip(field_names, values))}
            rendered.append(self._render_context(combo_context))
        return rendered

    def render_prompt(self, context: Dict[str, Any]) -> core_entities.PromptRender:
        """Render the template to a core PromptRender."""
        return self._render_context(context)

    def _render_context(self, context: Dict[str, Any]) -> core_entities.PromptRender:
        # render(**context) validates that every placeholder is supplied.
        return core_entities.PromptRender(
            spec=self._spec,
            text=self.render(**context),
            context=dict(context),
            metadata=dict(self.metadata or {}),
        )