themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,310 @@
1
+ """Cost tracking and estimation for LLM experiments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class CostBreakdown:
    """Detailed cost breakdown for an experiment.

    Attributes:
        total_cost: Total cost in USD
        generation_cost: Cost of generation API calls
        evaluation_cost: Cost of LLM-based evaluation (if applicable)
        per_sample_costs: List of costs per sample
        per_model_costs: Cost breakdown by model
        token_counts: Token usage statistics
        api_calls: Total number of API calls
        currency: Currency code (default: USD)
    """

    total_cost: float
    generation_cost: float
    evaluation_cost: float = 0.0
    per_sample_costs: list[float] = field(default_factory=list)
    per_model_costs: dict[str, float] = field(default_factory=dict)
    token_counts: dict[str, int] = field(default_factory=dict)
    api_calls: int = 0
    currency: str = "USD"

    def __post_init__(self):
        """Reject negative cost figures (total first, then components)."""
        # Same three checks, same order, same messages — expressed as a loop.
        for label, amount in (
            ("Total", self.total_cost),
            ("Generation", self.generation_cost),
            ("Evaluation", self.evaluation_cost),
        ):
            if amount < 0:
                raise ValueError(f"{label} cost cannot be negative")
41
+
42
+
43
@dataclass
class CostEstimate:
    """Cost estimate for an experiment.

    Attributes:
        estimated_cost: Expected cost in USD
        lower_bound: Lower bound of 95% confidence interval
        upper_bound: Upper bound of 95% confidence interval
        breakdown_by_phase: Cost breakdown by experiment phase
        assumptions: Assumptions used for estimation
        currency: Currency code (default: USD)
    """

    estimated_cost: float
    lower_bound: float
    upper_bound: float
    breakdown_by_phase: dict[str, float] = field(default_factory=dict)
    assumptions: dict[str, Any] = field(default_factory=dict)
    currency: str = "USD"

    def __post_init__(self):
        """Validate the estimate, mirroring CostBreakdown's validation style.

        Raises:
            ValueError: If estimated_cost is negative, or the confidence
                bounds are inverted (lower_bound > upper_bound).
        """
        if self.estimated_cost < 0:
            raise ValueError("Estimated cost cannot be negative")
        if self.lower_bound > self.upper_bound:
            raise ValueError("Lower bound cannot exceed upper bound")
62
+
63
+
64
class CostTracker:
    """Accumulates costs while an experiment runs.

    Generation and evaluation calls are recorded one by one; at any point the
    accumulated state can be summarized as a CostBreakdown with per-model,
    per-sample, and token-level detail.

    Example:
        >>> tracker = CostTracker()
        >>> tracker.record_generation("gpt-4", 100, 50, 0.0045)
        >>> tracker.record_generation("gpt-4", 120, 60, 0.0054)
        >>> breakdown = tracker.get_breakdown()
        >>> print(f"Total cost: ${breakdown.total_cost:.4f}")
    """

    def __init__(self):
        """Start with all accumulators empty."""
        self._generation_costs: list[tuple[str, float]] = []
        self._evaluation_costs: list[tuple[str, float]] = []
        self._token_counts: dict[str, int] = dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        )
        self._per_model_costs: dict[str, float] = {}
        self._per_sample_costs: list[float] = []
        self._api_calls: int = 0

    def record_generation(
        self,
        model: str,
        prompt_tokens: int,
        completion_tokens: int,
        cost: float,
    ) -> None:
        """Record one generation API call.

        Args:
            model: Model identifier
            prompt_tokens: Number of prompt tokens
            completion_tokens: Number of completion tokens
            cost: Cost in USD
        """
        self._generation_costs.append((model, cost))
        counts = self._token_counts
        counts["prompt_tokens"] += prompt_tokens
        counts["completion_tokens"] += completion_tokens
        counts["total_tokens"] += prompt_tokens + completion_tokens
        running = self._per_model_costs.get(model, 0.0)
        self._per_model_costs[model] = running + cost
        self._per_sample_costs.append(cost)
        self._api_calls += 1

    def record_evaluation(self, metric: str, cost: float) -> None:
        """Record one LLM-based evaluation call.

        Args:
            metric: Metric name that incurred the cost
            cost: Cost in USD
        """
        self._evaluation_costs.append((metric, cost))
        # Evaluation calls count toward the API-call total as well.
        self._api_calls += 1

    def get_breakdown(self) -> CostBreakdown:
        """Summarize everything recorded so far.

        Returns:
            CostBreakdown with all accumulated costs. Containers are copied,
            so mutating the returned object does not affect the tracker.
        """
        generation_total = sum(amount for _, amount in self._generation_costs)
        evaluation_total = sum(amount for _, amount in self._evaluation_costs)
        return CostBreakdown(
            total_cost=generation_total + evaluation_total,
            generation_cost=generation_total,
            evaluation_cost=evaluation_total,
            per_sample_costs=list(self._per_sample_costs),
            per_model_costs=dict(self._per_model_costs),
            token_counts=dict(self._token_counts),
            api_calls=self._api_calls,
        )

    def reset(self) -> None:
        """Forget all recorded costs, returning to the freshly-built state."""
        self._generation_costs.clear()
        self._evaluation_costs.clear()
        self._token_counts = dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        )
        self._per_model_costs.clear()
        self._per_sample_costs.clear()
        self._api_calls = 0
157
+
158
+
159
class BudgetMonitor:
    """Monitor and enforce budget limits during experiments.

    Example:
        >>> monitor = BudgetMonitor(max_cost=10.0, alert_threshold=0.8)
        >>> monitor.add_cost(8.5)
        >>> within_budget, message = monitor.check_budget()
        >>> print(message)  # "Warning: 85% of budget used ($8.50 / $10.00)"
        >>> monitor.add_cost(4.0)  # Exceeds budget
        >>> within_budget, message = monitor.check_budget()
        >>> print(message)  # "Budget exceeded: $12.50 >= $10.00"
    """

    def __init__(self, max_cost: float, alert_threshold: float = 0.8):
        """Initialize budget monitor.

        Args:
            max_cost: Maximum allowed cost in USD
            alert_threshold: Threshold (0.0-1.0) for warning alerts

        Raises:
            ValueError: If max_cost is negative or alert_threshold is invalid
        """
        if max_cost < 0:
            raise ValueError("Max cost cannot be negative")
        if not 0.0 <= alert_threshold <= 1.0:
            raise ValueError("Alert threshold must be between 0.0 and 1.0")

        self.max_cost = max_cost
        self.alert_threshold = alert_threshold
        self.current_cost = 0.0

    def add_cost(self, cost: float) -> None:
        """Add cost to current total.

        Args:
            cost: Cost to add in USD
        """
        self.current_cost += cost

    def check_budget(self) -> tuple[bool, str]:
        """Check if budget is within limits.

        Returns:
            Tuple of (within_budget, message)
            - within_budget: True if under max_cost
            - message: Status message or warning
        """
        # Hitting the cap exactly counts as exceeded (>=, not >).
        if self.current_cost >= self.max_cost:
            return (
                False,
                f"Budget exceeded: ${self.current_cost:.2f} >= ${self.max_cost:.2f}",
            )

        # Warn once usage crosses alert_threshold of the cap.
        if self.current_cost >= self.max_cost * self.alert_threshold:
            percentage = (self.current_cost / self.max_cost) * 100
            return (
                True,
                f"Warning: {percentage:.0f}% of budget used "
                f"(${self.current_cost:.2f} / ${self.max_cost:.2f})",
            )

        return True, "Budget OK"

    def remaining_budget(self) -> float:
        """Get remaining budget.

        Returns:
            Remaining budget in USD (may be negative if exceeded)
        """
        return self.max_cost - self.current_cost

    def percentage_used(self) -> float:
        """Get percentage of budget used.

        Returns:
            Percentage (0.0-100.0+) of budget used
        """
        # Guard the zero-budget edge case to avoid ZeroDivisionError.
        if self.max_cost == 0:
            return 100.0 if self.current_cost > 0 else 0.0
        return (self.current_cost / self.max_cost) * 100
240
+
241
+
242
def estimate_experiment_cost(
    model: str,
    dataset_size: int,
    avg_prompt_tokens: int = 500,
    avg_completion_tokens: int = 300,
    confidence_level: float = 0.95,
) -> CostEstimate:
    """Estimate total cost for an experiment.

    Args:
        model: Model identifier
        dataset_size: Number of samples in dataset
        avg_prompt_tokens: Average prompt tokens per sample
        avg_completion_tokens: Average completion tokens per sample
        confidence_level: Confidence level for bounds (default: 0.95)

    Returns:
        CostEstimate with expected cost and confidence bounds

    Example:
        >>> estimate = estimate_experiment_cost("gpt-4", 100, 500, 300)
        >>> print(f"Estimated cost: ${estimate.estimated_cost:.2f}")
        >>> print(f"Range: ${estimate.lower_bound:.2f} - ${estimate.upper_bound:.2f}")
    """
    # Local import — presumably to avoid an import-time cycle with the
    # pricing module; confirm before hoisting to module level.
    from themis.experiment.pricing import calculate_cost

    # Expected cost scales linearly with the number of samples.
    cost_per_sample = calculate_cost(model, avg_prompt_tokens, avg_completion_tokens)
    estimated_cost = cost_per_sample * dataset_size

    # Heuristic bounds: assume ~20% variance in realized token usage and
    # scale the margin by the confidence level. The original expression
    # `variance_factor * (1 - (1 - confidence_level))` reduces algebraically
    # to `variance_factor * confidence_level`; written directly for clarity
    # (same values for the usual confidence levels >= 0.5).
    variance_factor = 0.2
    margin = estimated_cost * variance_factor * confidence_level

    # Costs cannot go below zero, so clamp the lower bound.
    lower_bound = max(0.0, estimated_cost - margin)
    upper_bound = estimated_cost + margin

    breakdown = {
        "generation": estimated_cost,
        "evaluation": 0.0,  # no LLM-based evaluation assumed
    }

    # Record the inputs so downstream reports can show how the estimate
    # was derived.
    assumptions = {
        "model": model,
        "dataset_size": dataset_size,
        "avg_prompt_tokens": avg_prompt_tokens,
        "avg_completion_tokens": avg_completion_tokens,
        "cost_per_sample": cost_per_sample,
        "confidence_level": confidence_level,
    }

    return CostEstimate(
        estimated_cost=estimated_cost,
        lower_bound=lower_bound,
        upper_bound=upper_bound,
        breakdown_by_phase=breakdown,
        assumptions=assumptions,
    )
302
+
303
+
304
# Public API of this module, in definition order.
__all__ = [
    "CostBreakdown",
    "CostEstimate",
    "CostTracker",
    "BudgetMonitor",
    "estimate_experiment_cost",
]
@@ -0,0 +1,62 @@
1
+ """Shared experiment definitions used by the builder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Callable, Sequence
7
+
8
+ from themis.core import entities as core_entities
9
+
10
+ if TYPE_CHECKING:
11
+ from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
12
+ from themis.experiment.orchestrator import ExperimentOrchestrator
13
+ from themis.experiment.storage import ExperimentStorage
14
+ from themis.generation.plan import GenerationPlan
15
+ from themis.generation.runner import GenerationRunner
16
+ from themis.interfaces import ModelProvider
17
+
18
+
19
@dataclass
class ModelBinding:
    """Binding of a model spec to a named provider and its options."""

    # Model identity/configuration from themis.core.entities.
    spec: core_entities.ModelSpec
    # Name of the provider to use — presumably a key into the provider
    # registry; confirm against themis.providers.
    provider_name: str
    # Extra keyword options passed to the provider (empty by default).
    provider_options: dict[str, Any] = field(default_factory=dict)
24
+
25
+
26
@dataclass
class ExperimentDefinition:
    """Declarative description of an experiment, consumed by the builder."""

    # Prompt templates to run; element type is not pinned here — presumably
    # generation templates, confirm against themis.generation.templates.
    templates: Sequence
    # Sampling configurations to run for each template.
    sampling_parameters: Sequence[core_entities.SamplingConfig]
    # Models (with their providers) the experiment targets.
    model_bindings: Sequence[ModelBinding]
    # Name of the dataset field holding each sample's id.
    dataset_id_field: str = "id"
    # Name of the dataset field holding the reference answer; None to disable.
    reference_field: str | None = "expected"
    # Additional dataset fields to carry through as metadata.
    metadata_fields: Sequence[str] = field(default_factory=tuple)
    # Optional hook mapping a raw sample dict to template context variables.
    context_builder: Callable[[dict[str, Any]], dict[str, Any]] | None = None
35
+
36
+
37
@dataclass
class BuiltExperiment:
    """Built experiment with all components assembled.

    Attributes:
        plan: Generation plan for expanding tasks from dataset samples
        runner: Generation runner for executing tasks via providers
        pipeline: Evaluation pipeline for scoring outputs
        storage: Optional experiment storage for caching and resumability
        router: Provider router for dispatching to correct LLM provider
        orchestrator: Main orchestrator coordinating generation and evaluation
    """

    # `from __future__ import annotations` is in effect for this module, so
    # the TYPE_CHECKING-only names below can be referenced directly — the
    # string quoting previously used here was redundant under PEP 563.
    plan: GenerationPlan
    runner: GenerationRunner
    pipeline: EvaluationPipeline
    storage: ExperimentStorage | None
    router: ModelProvider
    orchestrator: ExperimentOrchestrator
56
+
57
+
58
# Public API of this module, in definition order.
__all__ = [
    "ModelBinding",
    "ExperimentDefinition",
    "BuiltExperiment",
]