themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/evaluation/statistics/bootstrap.py

@@ -0,0 +1,79 @@
+"""Bootstrap resampling for confidence intervals."""
+
+from __future__ import annotations
+
+import random
+from statistics import mean
+from typing import Callable, Sequence
+
+from .types import BootstrapResult
+
+
+def bootstrap_ci(
+    values: Sequence[float],
+    statistic: Callable[[Sequence[float]], float] = mean,
+    n_bootstrap: int = 10000,
+    confidence_level: float = 0.95,
+    seed: int | None = None,
+) -> BootstrapResult:
+    """Compute bootstrap confidence interval for a statistic.
+
+    Bootstrap resampling provides non-parametric confidence intervals
+    without assuming normality of the underlying distribution.
+
+    Args:
+        values: Sample values
+        statistic: Function to compute on each bootstrap sample (default: mean)
+        n_bootstrap: Number of bootstrap iterations (default: 10000)
+        confidence_level: Confidence level (default: 0.95)
+        seed: Random seed for reproducibility
+
+    Returns:
+        BootstrapResult with CI bounds and point estimate
+
+    Raises:
+        ValueError: If values is empty
+
+    Example:
+        >>> values = [1.2, 2.3, 3.1, 2.8, 3.5]
+        >>> result = bootstrap_ci(values, statistic=mean, n_bootstrap=10000)
+        >>> print(f"Mean: {result.statistic:.2f}, 95% CI: [{result.ci_lower:.2f}, {result.ci_upper:.2f}]")
+    """
+    if not values:
+        raise ValueError("Cannot compute bootstrap CI for empty sequence")
+
+    rng = random.Random(seed)
+
+    n = len(values)
+    values_list = list(values)
+
+    # Compute observed statistic
+    observed_stat = statistic(values_list)
+
+    # Bootstrap iterations
+    bootstrap_stats = []
+    for _ in range(n_bootstrap):
+        # Resample with replacement
+        resample = rng.choices(values_list, k=n)
+        boot_stat = statistic(resample)
+        bootstrap_stats.append(boot_stat)
+
+    # Sort bootstrap statistics
+    bootstrap_stats.sort()
+
+    # Compute percentile CI
+    alpha = 1 - confidence_level
+    lower_idx = int(n_bootstrap * alpha / 2)
+    upper_idx = int(n_bootstrap * (1 - alpha / 2))
+
+    # Ensure indices are within bounds
+    lower_idx = max(0, min(lower_idx, n_bootstrap - 1))
+    upper_idx = max(0, min(upper_idx, n_bootstrap - 1))
+
+    return BootstrapResult(
+        statistic=observed_stat,
+        ci_lower=bootstrap_stats[lower_idx],
+        ci_upper=bootstrap_stats[upper_idx],
+        confidence_level=confidence_level,
+        n_bootstrap=n_bootstrap,
+    )
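A quick usage sketch (not part of the diff; module path inferred from the file list above). Any statistic with the `Callable[[Sequence[float]], float]` shape can be passed, e.g. the median; seeding makes the resampling reproducible:

from statistics import median
from themis.evaluation.statistics.bootstrap import bootstrap_ci

# Resample with replacement 10000 times and take the 2.5th/97.5th percentiles.
result = bootstrap_ci([1.2, 2.3, 3.1, 2.8, 3.5], statistic=median, seed=42)
print(result.statistic, result.ci_lower, result.ci_upper)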
themis/evaluation/statistics/confidence_intervals.py

@@ -0,0 +1,121 @@
+"""Confidence interval computation."""
+
+from __future__ import annotations
+
+import math
+from statistics import mean, stdev
+from typing import List, Sequence
+
+from themis.core import entities as core_entities
+
+from .distributions import inverse_normal_cdf, t_critical_value
+from .types import ConfidenceInterval, StatisticalSummary
+
+
+def compute_confidence_interval(
+    values: Sequence[float],
+    confidence_level: float = 0.95,
+) -> ConfidenceInterval:
+    """Compute confidence interval for a sample mean using t-distribution.
+
+    Args:
+        values: Sequence of numeric values
+        confidence_level: Confidence level (default: 0.95)
+
+    Returns:
+        ConfidenceInterval with bounds and statistics
+
+    Raises:
+        ValueError: If values is empty or has insufficient data
+    """
+    n = len(values)
+    if n == 0:
+        raise ValueError("Cannot compute confidence interval for empty sequence")
+    if n == 1:
+        # Single value - return degenerate interval
+        val = float(values[0])
+        return ConfidenceInterval(
+            mean=val,
+            lower=val,
+            upper=val,
+            confidence_level=confidence_level,
+            sample_size=1,
+        )
+
+    sample_mean = mean(values)
+    sample_std = stdev(values)
+
+    # For large samples (n >= 30), use normal approximation with z-score
+    # For small samples, use t-distribution critical value
+    if n >= 30:
+        # Normal approximation: use z-scores
+        # For 95% CI: z = 1.96, for 99% CI: z = 2.576
+        if abs(confidence_level - 0.95) < 0.01:
+            critical_value = 1.96
+        elif abs(confidence_level - 0.99) < 0.01:
+            critical_value = 2.576
+        elif abs(confidence_level - 0.90) < 0.01:
+            critical_value = 1.645
+        else:
+            # General approximation using inverse normal CDF
+            critical_value = inverse_normal_cdf((1 + confidence_level) / 2)
+    else:
+        # Small sample: use t-distribution critical value (approximation)
+        critical_value = t_critical_value(n - 1, confidence_level)
+
+    standard_error = sample_std / math.sqrt(n)
+    margin_of_error = critical_value * standard_error
+
+    return ConfidenceInterval(
+        mean=sample_mean,
+        lower=sample_mean - margin_of_error,
+        upper=sample_mean + margin_of_error,
+        confidence_level=confidence_level,
+        sample_size=n,
+    )
+
+
+def compute_statistical_summary(
+    scores: List[core_entities.MetricScore],
+) -> StatisticalSummary:
+    """Compute comprehensive statistical summary for metric scores.
+
+    Args:
+        scores: List of MetricScore objects
+
+    Returns:
+        StatisticalSummary with descriptive statistics
+
+    Raises:
+        ValueError: If scores is empty
+    """
+    if not scores:
+        raise ValueError("Cannot compute statistical summary for empty scores list")
+
+    metric_name = scores[0].metric_name
+    values = [score.value for score in scores]
+    n = len(values)
+
+    # Sort for percentile calculations
+    sorted_values = sorted(values)
+    median_idx = n // 2
+    if n % 2 == 0:
+        median_value = (sorted_values[median_idx - 1] + sorted_values[median_idx]) / 2.0
+    else:
+        median_value = sorted_values[median_idx]
+
+    # Compute confidence interval if we have enough data
+    ci_95 = None
+    if n >= 2:
+        ci_95 = compute_confidence_interval(values, confidence_level=0.95)
+
+    return StatisticalSummary(
+        metric_name=metric_name,
+        count=n,
+        mean=mean(values),
+        std=stdev(values) if n >= 2 else 0.0,
+        min_value=min(values),
+        max_value=max(values),
+        median=median_value,
+        confidence_interval_95=ci_95,
+    )
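A minimal sketch of the interval helper (not part of the diff). With five samples, n < 30, so the branch using t_critical_value(df=4, 0.95) = 2.776 from the lookup table applies:

from themis.evaluation.statistics.confidence_intervals import compute_confidence_interval

ci = compute_confidence_interval([0.71, 0.74, 0.69, 0.73, 0.75], confidence_level=0.95)
# margin = t_crit * stdev / sqrt(n), applied symmetrically around the mean
print(f"{ci.mean:.3f} [{ci.lower:.3f}, {ci.upper:.3f}] (n={ci.sample_size})")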
themis/evaluation/statistics/distributions.py

@@ -0,0 +1,207 @@
+"""Helper functions for statistical distributions."""
+
+from __future__ import annotations
+
+import math
+
+
+def inverse_normal_cdf(p: float) -> float:
+    """Approximate inverse normal CDF (probit function) for standard normal.
+
+    Uses Beasley-Springer-Moro approximation.
+
+    Args:
+        p: Probability value between 0 and 1
+
+    Returns:
+        z-score corresponding to probability p
+
+    Raises:
+        ValueError: If p is not between 0 and 1
+    """
+    if p <= 0 or p >= 1:
+        raise ValueError("Probability must be between 0 and 1")
+
+    # Constants for approximation
+    a = [2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637]
+    b = [-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833]
+    c = [
+        0.3374754822726147,
+        0.9761690190917186,
+        0.1607979714918209,
+        0.0276438810333863,
+        0.0038405729373609,
+        0.0003951896511919,
+        0.0000321767881768,
+        0.0000002888167364,
+        0.0000003960315187,
+    ]
+
+    # Transform to standard normal
+    y = p - 0.5
+    if abs(y) < 0.42:
+        # Central region
+        r = y * y
+        x = (
+            y
+            * (((a[3] * r + a[2]) * r + a[1]) * r + a[0])
+            / (((b[3] * r + b[2]) * r + b[1]) * r + b[0] + 1.0)
+        )
+        return x
+    else:
+        # Tail region
+        r = p if y > 0 else 1 - p
+        r = math.log(-math.log(r))
+        x = c[0] + r * (
+            c[1]
+            + r
+            * (
+                c[2]
+                + r
+                * (c[3] + r * (c[4] + r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))
+            )
+        )
+        if y < 0:
+            x = -x
+        return x
+
+
+def t_critical_value(df: int, confidence_level: float) -> float:
+    """Approximate t-distribution critical value.
+
+    This is a simplified approximation. For production use, consider scipy.stats.t.ppf.
+
+    Args:
+        df: Degrees of freedom
+        confidence_level: Confidence level (e.g., 0.95)
+
+    Returns:
+        Critical value for two-tailed test
+    """
+    try:
+        from scipy import stats
+    except Exception:  # pragma: no cover - optional dependency
+        stats = None
+
+    if stats is not None:
+        alpha = (1 - confidence_level) / 2
+        return float(stats.t.ppf(1 - alpha, df))
+
+    # For common confidence levels and degrees of freedom, use lookup table
+    # Otherwise, use normal approximation for large df
+    if df >= 30:
+        # Use normal approximation for large df
+        alpha = (1 - confidence_level) / 2
+        return inverse_normal_cdf(1 - alpha)
+
+    # Simplified lookup table for small df (two-tailed)
+    # Format: {confidence_level: {df: critical_value}}
+    lookup_95 = {
+        1: 12.706,
+        2: 4.303,
+        3: 3.182,
+        4: 2.776,
+        5: 2.571,
+        6: 2.447,
+        7: 2.365,
+        8: 2.306,
+        9: 2.262,
+        10: 2.228,
+        15: 2.131,
+        20: 2.086,
+        25: 2.060,
+        29: 2.045,
+    }
+    lookup_99 = {
+        1: 63.657,
+        2: 9.925,
+        3: 5.841,
+        4: 4.604,
+        5: 4.032,
+        6: 3.707,
+        7: 3.499,
+        8: 3.355,
+        9: 3.250,
+        10: 3.169,
+        15: 2.947,
+        20: 2.845,
+        25: 2.787,
+        29: 2.756,
+    }
+
+    if abs(confidence_level - 0.95) < 0.01:
+        lookup = lookup_95
+    elif abs(confidence_level - 0.99) < 0.01:
+        lookup = lookup_99
+    else:
+        # Fall back to normal approximation
+        alpha = (1 - confidence_level) / 2
+        return inverse_normal_cdf(1 - alpha)
+
+    # Find closest df in lookup table
+    if df in lookup:
+        return lookup[df]
+    else:
+        # Linear interpolation or nearest neighbor
+        df_keys = sorted(lookup.keys())
+        for i, key_df in enumerate(df_keys):
+            if df < key_df:
+                if i == 0:
+                    return lookup[key_df]
+                else:
+                    # Interpolate between previous and current
+                    prev_df = df_keys[i - 1]
+                    weight = (df - prev_df) / (key_df - prev_df)
+                    return lookup[prev_df] * (1 - weight) + lookup[key_df] * weight
+        return lookup[df_keys[-1]]
+
+
+def t_to_p_value(t_stat: float, df: int) -> float:
+    """Approximate two-tailed p-value for t-statistic.
+
+    This is a simplified approximation. For production use, consider scipy.stats.t.cdf.
+
+    Args:
+        t_stat: t-statistic value
+        df: Degrees of freedom
+
+    Returns:
+        Two-tailed p-value
+    """
+    try:
+        from scipy import stats
+    except Exception:  # pragma: no cover - optional dependency
+        stats = None
+
+    if stats is not None:
+        p_one_tail = stats.t.cdf(-abs(t_stat), df)
+        return float(2 * p_one_tail)
+
+    # For large df, use normal approximation
+    if df >= 30:
+        # Use normal distribution CDF
+        p_one_tail = normal_cdf(-abs(t_stat))
+        return 2 * p_one_tail
+
+    # For small df, use approximation
+    # Very rough approximation: convert t to approximate p-value
+    if abs(t_stat) < 0.5:
+        return 1.0
+    elif abs(t_stat) > 10:
+        return 0.0001
+    else:
+        # Rough approximation using exponential decay
+        base_p = math.exp(-abs(t_stat) * 0.5) * (df / (df + t_stat**2))
+        return min(1.0, 2 * base_p)
+
+
+def normal_cdf(x: float) -> float:
+    """Standard normal CDF using error function approximation.
+
+    Args:
+        x: Value to evaluate CDF at
+
+    Returns:
+        Cumulative probability
+    """
+    return 0.5 * (1 + math.erf(x / math.sqrt(2)))
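A sanity-check sketch of these helpers (not part of the diff). normal_cdf is exact up to math.erf; the other two are approximations whose output depends on whether the optional scipy dependency is installed:

from themis.evaluation.statistics.distributions import (
    normal_cdf,
    t_critical_value,
    t_to_p_value,
)

print(normal_cdf(1.96))            # ~0.975 (exact via math.erf)
print(t_critical_value(10, 0.95))  # ~2.228 (lookup table, or scipy.stats.t.ppf if available)
print(t_to_p_value(2.0, 50))       # roughly 0.05 (normal approximation for df >= 30)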
themis/evaluation/statistics/effect_sizes.py

@@ -0,0 +1,124 @@
+"""Effect size measures for statistical comparisons."""
+
+from __future__ import annotations
+
+import math
+from typing import Sequence
+
+from .types import EffectSize
+
+
+def cohens_h(p1: float, p2: float) -> EffectSize:
+    """Compute Cohen's h effect size for comparing two proportions.
+
+    Cohen's h measures the distance between two proportions using
+    the arcsine transformation. This is useful for comparing success
+    rates, accuracy proportions, etc.
+
+    Args:
+        p1: Proportion for group 1 (e.g., baseline accuracy)
+        p2: Proportion for group 2 (e.g., treatment accuracy)
+
+    Returns:
+        EffectSize with value and interpretation
+
+    Interpretation:
+        - |h| < 0.2: negligible
+        - 0.2 <= |h| < 0.5: small
+        - 0.5 <= |h| < 0.8: medium
+        - |h| >= 0.8: large
+
+    Example:
+        >>> # Baseline: 65% accuracy, Treatment: 75% accuracy
+        >>> effect = cohens_h(0.65, 0.75)
+        >>> print(f"Effect: {effect.value:.3f} ({effect.interpretation})")
+    """
+    # Arcsine transformation
+    phi1 = 2 * math.asin(math.sqrt(p1))
+    phi2 = 2 * math.asin(math.sqrt(p2))
+
+    h = phi2 - phi1
+
+    # Interpret effect size
+    abs_h = abs(h)
+    if abs_h < 0.2:
+        interpretation = "negligible"
+    elif abs_h < 0.5:
+        interpretation = "small"
+    elif abs_h < 0.8:
+        interpretation = "medium"
+    else:
+        interpretation = "large"
+
+    return EffectSize(
+        name="cohen_h",
+        value=h,
+        interpretation=interpretation,
+    )
+
+
+def cohens_d(group1: Sequence[float], group2: Sequence[float]) -> EffectSize:
+    """Compute Cohen's d effect size for comparing two means.
+
+    Cohen's d measures the standardized difference between two group means.
+    This is the most common effect size for t-tests.
+
+    Args:
+        group1: Values from first group (e.g., baseline)
+        group2: Values from second group (e.g., treatment)
+
+    Returns:
+        EffectSize with value and interpretation
+
+    Interpretation:
+        - |d| < 0.2: negligible
+        - 0.2 <= |d| < 0.5: small
+        - 0.5 <= |d| < 0.8: medium
+        - |d| >= 0.8: large
+
+    Example:
+        >>> baseline = [1.2, 1.5, 1.3, 1.4]
+        >>> treatment = [1.8, 2.0, 1.9, 2.1]
+        >>> effect = cohens_d(baseline, treatment)
+    """
+    from statistics import mean, stdev
+
+    n1 = len(group1)
+    n2 = len(group2)
+
+    if n1 < 2 or n2 < 2:
+        raise ValueError("Each group must have at least 2 values")
+
+    mean1 = mean(group1)
+    mean2 = mean(group2)
+    std1 = stdev(group1)
+    std2 = stdev(group2)
+
+    # Pooled standard deviation
+    pooled_std = math.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
+
+    if pooled_std == 0:
+        # No variance - return 0 if means are equal, infinity otherwise
+        if mean1 == mean2:
+            d = 0.0
+        else:
+            d = float("inf")
+    else:
+        d = (mean2 - mean1) / pooled_std
+
+    # Interpret effect size
+    abs_d = abs(d)
+    if abs_d < 0.2:
+        interpretation = "negligible"
+    elif abs_d < 0.5:
+        interpretation = "small"
+    elif abs_d < 0.8:
+        interpretation = "medium"
+    else:
+        interpretation = "large"
+
+    return EffectSize(
+        name="cohen_d",
+        value=d,
+        interpretation=interpretation,
+    )
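A usage sketch for the two effect sizes (not part of the diff), reusing the docstring examples. Working through the arithmetic: cohens_h(0.65, 0.75) = 2*asin(sqrt(0.75)) - 2*asin(sqrt(0.65)) ≈ 2.094 - 1.876 ≈ 0.219, which falls in the "small" band; for the cohens_d data both groups have pooled std ≈ 0.129 and the mean difference is 0.6, giving d ≈ 4.65, "large":

from themis.evaluation.statistics.effect_sizes import cohens_d, cohens_h

h = cohens_h(0.65, 0.75)  # h ≈ 0.219 -> "small"
d = cohens_d([1.2, 1.5, 1.3, 1.4], [1.8, 2.0, 1.9, 2.1])  # d ≈ 4.65 -> "large"
print(h.value, h.interpretation)
print(d.value, d.interpretation)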