PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +93 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +164 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +288 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +129 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +690 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +373 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +255 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +61 -0
themis/integrations/wandb.py +65 -0
themis/interfaces/__init__.py +83 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
themis_eval-0.1.1.dist-info/RECORD +134 -0
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0

themis/core/entities.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Shared dataclasses that represent Themis' internal world."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, Generic, List, TypeVar
+if TYPE_CHECKING:
+    from themis.evaluation.reports import EvaluationReport
+# Type variable for generic Reference
+T = TypeVar("T")
+@dataclass(frozen=True)
+class SamplingConfig:
+    """Immutable bundle of the three sampling parameters.
+
+    Instances are referenced by ``ModelSpec.default_sampling`` and
+    ``GenerationTask.sampling``. All three fields are required and no
+    validation of their values happens here.
+    """
+    # NOTE(review): presumably the usual LLM sampling knobs forwarded to a
+    # provider; confirm valid ranges against the provider code.
+    temperature: float
+    top_p: float
+    max_tokens: int
+@dataclass(frozen=True)
+class ModelSpec:
+    """Immutable description of a model.
+
+    Holds an identifier, a provider name, an optional default
+    ``SamplingConfig`` and a free-form metadata dict.
+    """
+    identifier: str
+    provider: str
+    default_sampling: SamplingConfig | None = None
+    # default_factory gives every instance its own dict. ``frozen`` only blocks
+    # attribute reassignment; the dict itself can still be mutated in place.
+    metadata: Dict[str, Any] = field(default_factory=dict)
+@dataclass(frozen=True)
+class PromptSpec:
+    """Immutable named prompt template with an optional metadata dict."""
+    name: str
+    # NOTE(review): the template syntax is not defined in this module; confirm
+    # against the code that renders it.
+    template: str
+    # Each instance gets its own dict via default_factory.
+    metadata: Dict[str, Any] = field(default_factory=dict)
+@dataclass(frozen=True)
+class PromptRender:
+    """Immutable rendered prompt.
+
+    Stores the originating ``PromptSpec``, the final text, and the context and
+    metadata dicts kept alongside it.
+    """
+    spec: PromptSpec
+    text: str
+    # NOTE(review): presumably the variables used to fill the template; confirm
+    # against the renderer.
+    context: Dict[str, Any] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    @property
+    def prompt_text(self) -> str:
+        """Return ``text``; a read-only alias for the same attribute."""
+        return self.text
+    @property
+    def template_name(self) -> str:
+        """Return the ``name`` of the spec this render belongs to."""
+        return self.spec.name
+@dataclass(frozen=True)
+class Reference(Generic[T]):
+    """Reference value with optional type information.
+
+    This is a generic dataclass that can hold typed reference values.
+    For backward compatibility, it can be used without type parameters
+    and will behave like Reference[Any].
+
+    Examples:
+        # Untyped (backward compatible)
+        ref = Reference(kind="answer", value="42")
+
+        # Typed
+        ref: Reference[str] = Reference(kind="answer", value="42")
+        ref: Reference[int] = Reference(kind="answer", value=42)
+    """
+    # Label for what the value represents (the docstring examples use "answer").
+    kind: str
+    value: T
+    # Nothing in this class checks ``value`` against ``schema``.
+    schema: type[T] | None = None  # Optional runtime type information
+@dataclass(frozen=True)
+class ModelOutput:
+    """Immutable model output: text plus optional raw payload and token usage."""
+    text: str
+    # NOTE(review): presumably the provider's unprocessed response object;
+    # its type is unconstrained (``Any``).
+    raw: Any | None = None
+    usage: Dict[str, int] | None = None  # Token usage: {prompt_tokens, completion_tokens, total_tokens}
+@dataclass(frozen=True)
+class ModelError:
+    """Immutable error description: a message, a kind tag and a details dict."""
+    message: str
+    # Category tag; defaults to "model_error" when the caller gives none.
+    kind: str = "model_error"
+    # Each instance gets its own dict via default_factory.
+    details: Dict[str, Any] = field(default_factory=dict)
+@dataclass
+class GenerationTask:
+    """Mutable unit of generation work.
+
+    Combines a rendered prompt, a model spec and sampling settings, plus a
+    metadata dict and an optional reference. Unlike most entities in this
+    module it is not frozen, so its fields can be reassigned.
+    """
+    prompt: PromptRender
+    model: ModelSpec
+    sampling: SamplingConfig
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    # Unparameterized Reference, i.e. the value type is left open.
+    reference: Reference | None = None
+@dataclass
+class GenerationRecord:
+    """Mutable outcome of a ``GenerationTask``.
+
+    ``output`` and ``error`` are both required arguments that accept ``None``.
+    ``attempts`` holds nested records of the same type.
+    """
+    task: GenerationTask
+    # NOTE(review): presumably exactly one of output/error is set on a finished
+    # record; nothing here enforces that.
+    output: ModelOutput | None
+    error: ModelError | None
+    metrics: Dict[str, Any] = field(default_factory=dict)
+    # Self-referential list; each instance gets its own list.
+    attempts: List["GenerationRecord"] = field(default_factory=list)
+@dataclass(frozen=True)
+class EvaluationItem:
+    """Immutable pairing of a generation record with an optional reference."""
+    record: GenerationRecord
+    # Required argument with no default, but ``None`` is accepted.
+    reference: Reference | None
+@dataclass(frozen=True)
+class MetricScore:
+    """Immutable score for one metric: a name, a float value and two dicts."""
+    metric_name: str
+    value: float
+    # Both dicts are free-form and get a fresh instance per object.
+    # NOTE(review): the intended split between details and metadata is not
+    # visible in this module; confirm with the metric implementations.
+    details: Dict[str, Any] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+@dataclass
+class EvaluationSummary:
+    """Mutable collection of metric scores plus a list of failure messages."""
+    scores: List[MetricScore]
+    # Defaults to a new empty list per instance.
+    failures: List[str] = field(default_factory=list)
+@dataclass
+class EvaluationRecord:
+    """Mutable evaluation result: an optional sample id, scores and failures."""
+    # Required argument, but ``None`` is accepted.
+    sample_id: str | None
+    scores: List[MetricScore]
+    # Defaults to a new empty list per instance.
+    failures: List[str] = field(default_factory=list)
+@dataclass
+class ExperimentFailure:
+    """Mutable failure entry: an optional sample id and a message."""
+    # Required argument, but ``None`` is accepted.
+    sample_id: str | None
+    message: str
+@dataclass
+class ExperimentReport:
+    """Mutable experiment report.
+
+    Holds the generation records, an evaluation report, the recorded
+    failures and a metadata dict. All four fields are required.
+    """
+    generation_results: list[GenerationRecord]
+    # EvaluationReport is imported only under TYPE_CHECKING, so this annotation
+    # is a forward reference and is never resolved at import time.
+    evaluation_report: "EvaluationReport"
+    failures: list[ExperimentFailure]
+    metadata: dict[str, object]
+# Names exported by ``from themis.core.entities import *``; lists every
+# dataclass defined in this module.
+__all__ = [
+    "SamplingConfig",
+    "ModelSpec",
+    "PromptSpec",
+    "PromptRender",
+    "Reference",
+    "ModelOutput",
+    "ModelError",
+    "GenerationTask",
+    "GenerationRecord",
+    "EvaluationItem",
+    "EvaluationRecord",
+    "MetricScore",
+    "EvaluationSummary",
+    "ExperimentFailure",
+    "ExperimentReport",
+]

themis/core/serialization.py ADDED Viewed

@@ -0,0 +1,231 @@
+"""Serialization helpers for Themis core entities."""
+from __future__ import annotations
+import copy
+from typing import Any, Dict
+from themis.core import entities as core_entities
+def serialize_sampling(config: core_entities.SamplingConfig) -> Dict[str, Any]:
+    return {
+        "temperature": config.temperature,
+        "top_p": config.top_p,
+        "max_tokens": config.max_tokens,
+    }
+def deserialize_sampling(data: Dict[str, Any]) -> core_entities.SamplingConfig:
+    return core_entities.SamplingConfig(
+        temperature=data["temperature"],
+        top_p=data["top_p"],
+        max_tokens=data["max_tokens"],
+    )
+def serialize_model_spec(spec: core_entities.ModelSpec) -> Dict[str, Any]:
+    return {
+        "identifier": spec.identifier,
+        "provider": spec.provider,
+        "metadata": copy.deepcopy(spec.metadata),
+        "default_sampling": serialize_sampling(spec.default_sampling)
+        if spec.default_sampling
+        else None,
+    }
+def deserialize_model_spec(data: Dict[str, Any]) -> core_entities.ModelSpec:
+    default_sampling = (
+        deserialize_sampling(data["default_sampling"])
+        if data.get("default_sampling")
+        else None
+    )
+    return core_entities.ModelSpec(
+        identifier=data["identifier"],
+        provider=data["provider"],
+        metadata=copy.deepcopy(data.get("metadata", {})),
+        default_sampling=default_sampling,
+    )
+def serialize_prompt_spec(spec: core_entities.PromptSpec) -> Dict[str, Any]:
+    return {
+        "name": spec.name,
+        "template": spec.template,
+        "metadata": copy.deepcopy(spec.metadata),
+    }
+def deserialize_prompt_spec(data: Dict[str, Any]) -> core_entities.PromptSpec:
+    return core_entities.PromptSpec(
+        name=data["name"],
+        template=data["template"],
+        metadata=copy.deepcopy(data.get("metadata", {})),
+    )
+def serialize_prompt_render(render: core_entities.PromptRender) -> Dict[str, Any]:
+    return {
+        "spec": serialize_prompt_spec(render.spec),
+        "text": render.text,
+        "context": copy.deepcopy(render.context),
+        "metadata": copy.deepcopy(render.metadata),
+    }
+def deserialize_prompt_render(data: Dict[str, Any]) -> core_entities.PromptRender:
+    return core_entities.PromptRender(
+        spec=deserialize_prompt_spec(data["spec"]),
+        text=data["text"],
+        context=copy.deepcopy(data.get("context", {})),
+        metadata=copy.deepcopy(data.get("metadata", {})),
+    )
+def serialize_reference(
+    reference: core_entities.Reference | None,
+) -> Dict[str, Any] | None:
+    if reference is None:
+        return None
+    return {"kind": reference.kind, "value": reference.value}
+def deserialize_reference(
+    data: Dict[str, Any] | None,
+) -> core_entities.Reference | None:
+    if data is None:
+        return None
+    return core_entities.Reference(kind=data["kind"], value=data.get("value"))
+def serialize_generation_task(task: core_entities.GenerationTask) -> Dict[str, Any]:
+    return {
+        "prompt": serialize_prompt_render(task.prompt),
+        "model": serialize_model_spec(task.model),
+        "sampling": serialize_sampling(task.sampling),
+        "metadata": copy.deepcopy(task.metadata),
+        "reference": serialize_reference(task.reference),
+    }
+def deserialize_generation_task(data: Dict[str, Any]) -> core_entities.GenerationTask:
+    return core_entities.GenerationTask(
+        prompt=deserialize_prompt_render(data["prompt"]),
+        model=deserialize_model_spec(data["model"]),
+        sampling=deserialize_sampling(data["sampling"]),
+        metadata=copy.deepcopy(data.get("metadata", {})),
+        reference=deserialize_reference(data.get("reference")),
+    )
+def serialize_generation_record(
+    record: core_entities.GenerationRecord,
+) -> Dict[str, Any]:
+    return {
+        "task": serialize_generation_task(record.task),
+        "output": {
+            "text": record.output.text,
+            "raw": record.output.raw,
+        }
+        if record.output
+        else None,
+        "error": {
+            "message": record.error.message,
+            "kind": record.error.kind,
+            "details": copy.deepcopy(record.error.details),
+        }
+        if record.error
+        else None,
+        "metrics": copy.deepcopy(record.metrics),
+        "attempts": [
+            serialize_generation_record(attempt) for attempt in record.attempts
+        ],
+    }
+def deserialize_generation_record(
+    data: Dict[str, Any],
+) -> core_entities.GenerationRecord:
+    output_data = data.get("output")
+    error_data = data.get("error")
+    return core_entities.GenerationRecord(
+        task=deserialize_generation_task(data["task"]),
+        output=core_entities.ModelOutput(
+            text=output_data["text"], raw=output_data.get("raw")
+        )
+        if output_data
+        else None,
+        error=core_entities.ModelError(
+            message=error_data["message"],
+            kind=error_data.get("kind", "model_error"),
+            details=copy.deepcopy(error_data.get("details", {})),
+        )
+        if error_data
+        else None,
+        metrics=copy.deepcopy(data.get("metrics", {})),
+        attempts=[
+            deserialize_generation_record(attempt)
+            for attempt in data.get("attempts", [])
+        ],
+    )
+def serialize_metric_score(score: core_entities.MetricScore) -> Dict[str, Any]:
+    return {
+        "metric_name": score.metric_name,
+        "value": score.value,
+        "details": copy.deepcopy(score.details),
+        "metadata": copy.deepcopy(score.metadata),
+    }
+def deserialize_metric_score(data: Dict[str, Any]) -> core_entities.MetricScore:
+    return core_entities.MetricScore(
+        metric_name=data["metric_name"],
+        value=data["value"],
+        details=copy.deepcopy(data.get("details", {})),
+        metadata=copy.deepcopy(data.get("metadata", {})),
+    )
+def serialize_evaluation_record(
+    record: core_entities.EvaluationRecord,
+) -> Dict[str, Any]:
+    return {
+        "sample_id": record.sample_id,
+        "scores": [serialize_metric_score(score) for score in record.scores],
+        "failures": list(record.failures),
+    }
+def deserialize_evaluation_record(
+    data: Dict[str, Any],
+) -> core_entities.EvaluationRecord:
+    return core_entities.EvaluationRecord(
+        sample_id=data.get("sample_id"),
+        scores=[deserialize_metric_score(score) for score in data.get("scores", [])],
+        failures=list(data.get("failures", [])),
+    )
+# Names exported by ``from themis.core.serialization import *``; lists every
+# serialize_*/deserialize_* function defined in this module.
+__all__ = [
+    "serialize_generation_record",
+    "deserialize_generation_record",
+    "serialize_generation_task",
+    "deserialize_generation_task",
+    "serialize_evaluation_record",
+    "deserialize_evaluation_record",
+    "serialize_metric_score",
+    "deserialize_metric_score",
+    "serialize_sampling",
+    "deserialize_sampling",
+    "serialize_model_spec",
+    "deserialize_model_spec",
+    "serialize_prompt_spec",
+    "deserialize_prompt_spec",
+    "serialize_prompt_render",
+    "deserialize_prompt_render",
+    "serialize_reference",
+    "deserialize_reference",
+]

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl