themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/evaluation/pipelines/standard_pipeline.py
@@ -0,0 +1,288 @@
+"""Standard evaluation pipeline implementation."""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Callable, Sequence
+
+from themis.core import entities as core_entities
+from themis.evaluation import extractors
+from themis.evaluation import strategies as evaluation_strategies
+from themis.evaluation.reports import (
+    EvaluationFailure,
+    EvaluationReport,
+    MetricAggregate,
+)
+from themis.interfaces import Metric as MetricInterface
+from themis.utils import tracing
+
+logger = logging.getLogger(__name__)
+
+
+def _default_reference_selector(record: core_entities.GenerationRecord):
+    """Default reference selector from generation record.
+
+    Args:
+        record: Generation record
+
+    Returns:
+        Reference value or None
+    """
+    reference = record.task.reference
+    if reference is None:
+        return None
+    return reference.value
+
+
+def _normalize_references(reference):
+    """Normalize reference to list format.
+
+    Args:
+        reference: Reference value
+
+    Returns:
+        List of references
+    """
+    if isinstance(reference, core_entities.Reference):
+        reference = reference.value
+    if isinstance(reference, list):
+        return reference
+    return [reference]
+
+
+class EvaluationPipeline:
+    """Traditional batch evaluation pipeline.
+
+    This pipeline evaluates generation records using extractors, metrics,
+    and evaluation strategies. It supports slicing for subset analysis.
+
+    Example:
+        >>> pipeline = EvaluationPipeline(
+        ...     extractor=JsonFieldExtractor("answer"),
+        ...     metrics=[ExactMatch()]
+        ... )
+        >>> report = pipeline.evaluate(records)
+
+    Attributes:
+        _extractor: Extractor for parsing model output
+        _metrics: List of metrics to compute
+        _reference_selector: Function to extract reference from record
+        _strategy_resolver: Function to resolve evaluation strategy
+        _slices: List of (name, predicate) tuples for slicing
+    """
+
+    def __init__(
+        self,
+        *,
+        extractor,
+        metrics: Sequence[MetricInterface],
+        reference_selector: Callable[[core_entities.GenerationRecord], object]
+        | None = None,
+        strategy_resolver: Callable[
+            [core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
+        ]
+        | None = None,
+    ) -> None:
+        """Initialize evaluation pipeline.
+
+        Args:
+            extractor: Extractor for parsing model output
+            metrics: List of metrics to compute
+            reference_selector: Optional function to extract reference
+            strategy_resolver: Optional function to resolve strategy
+        """
+        self._extractor = extractor
+        self._metrics = list(metrics)
+        self._reference_selector = reference_selector or _default_reference_selector
+        self._strategy_resolver = strategy_resolver or (
+            lambda record: evaluation_strategies.DefaultEvaluationStrategy()
+        )
+        self._slices: list[
+            tuple[str, Callable[[core_entities.GenerationRecord], bool]]
+        ] = []
+
+    def evaluate(
+        self, records: Sequence[core_entities.GenerationRecord]
+    ) -> EvaluationReport:
+        """Evaluate generation records.
+
+        Args:
+            records: Generation records to evaluate
+
+        Returns:
+            Evaluation report with metrics and failures
+        """
+        with tracing.span("evaluate_pipeline", total_records=len(records)):
+            per_metric: dict[str, list[core_entities.MetricScore]] = {
+                metric.name: [] for metric in self._metrics
+            }
+            failures: list[EvaluationFailure] = []
+            per_record: list[core_entities.EvaluationRecord] = []
+            slice_members: dict[str, set[str]] = {
+                name: set() for name, _ in self._slices
+            }
+
+            for record in records:
+                with tracing.span("evaluate_record"):
+                    logger.debug(
+                        "Evaluating sample %s with %s metric(s)",
+                        record.task.metadata.get("dataset_id")
+                        or record.task.metadata.get("sample_id"),
+                        len(self._metrics),
+                    )
+                    strategy = self._strategy_resolver(record)
+                    task_metadata = record.task.metadata
+                    sample_id = task_metadata.get("dataset_id") or task_metadata.get(
+                        "sample_id"
+                    )
+                    for name, fn in self._slices:
+                        try:
+                            if fn(record) and sample_id is not None:
+                                slice_members[name].add(sample_id)
+                        except Exception:
+                            pass
+                    eval_items = list(strategy.prepare(record))
+                    item_scores: list[core_entities.MetricScore] = []
+                    record_failures: list[str] = []
+
+                    for item in eval_items:
+                        if item.record.output is None:
+                            message = "Missing model output"
+                            failures.append(
+                                EvaluationFailure(sample_id=sample_id, message=message)
+                            )
+                            record_failures.append(message)
+                            continue
+                        try:
+                            with tracing.span("extract"):
+                                prediction = self._extractor.extract(
+                                    item.record.output.text
+                                )
+                        except extractors.FieldExtractionError as exc:
+                            message = str(exc)
+                            failures.append(
+                                EvaluationFailure(sample_id=sample_id, message=message)
+                            )
+                            record_failures.append(message)
+                            continue
+
+                        reference = item.reference or self._reference_selector(record)
+                        references = (
+                            _normalize_references(reference)
+                            if reference is not None
+                            else []
+                        )
+                        metadata = {"sample_id": sample_id}
+                        extract_start = time.perf_counter()
+                        item_scores_for_item: list[core_entities.MetricScore] = []
+                        for metric in self._metrics:
+                            requires_reference = getattr(
+                                metric, "requires_reference", True
+                            )
+                            if requires_reference and not references:
+                                message = (
+                                    f"Missing reference for metric '{metric.name}'"
+                                )
+                                failures.append(
+                                    EvaluationFailure(
+                                        sample_id=sample_id, message=message
+                                    )
+                                )
+                                record_failures.append(message)
+                                continue
+                            metric_start = time.perf_counter()
+                            try:
+                                with tracing.span(
+                                    "compute_metric", metric_name=metric.name
+                                ):
+                                    score = metric.compute(
+                                        prediction=prediction,
+                                        references=references,
+                                        metadata=metadata,
+                                    )
+                                score.metadata["evaluation_time_ms"] = (
+                                    time.perf_counter() - metric_start
+                                ) * 1000
+                                item_scores_for_item.append(score)
+                            except Exception as exc:  # pragma: no cover - guarded
+                                message = (
+                                    f"Metric '{metric.name}' failed for sample {sample_id}: {exc}"
+                                )
+                                logger.warning(message)
+                                failures.append(
+                                    EvaluationFailure(
+                                        sample_id=sample_id, message=message
+                                    )
+                                )
+                                record_failures.append(message)
+                        extraction_duration = (
+                            time.perf_counter() - extract_start
+                        ) * 1000
+                        for score in item_scores_for_item:
+                            score.metadata.setdefault(
+                                "extraction_time_ms", extraction_duration
+                            )
+                        item_scores.extend(item_scores_for_item)
+
+                    aggregated_scores = strategy.aggregate(record, item_scores)
+                    for score in aggregated_scores:
+                        per_metric[score.metric_name].append(score)
+                    per_record.append(
+                        core_entities.EvaluationRecord(
+                            sample_id=sample_id,
+                            scores=aggregated_scores,
+                            failures=record_failures,
+                        )
+                    )
+
+            aggregates = {
+                name: MetricAggregate.from_scores(name, scores)
+                for name, scores in per_metric.items()
+            }
+
+            return EvaluationReport(
+                metrics=aggregates,
+                failures=failures,
+                records=per_record,
+                slices=self._compute_slice_aggregates(per_metric, slice_members),
+            )
+
+    def register_slice(
+        self, name: str, fn: Callable[[core_entities.GenerationRecord], bool]
+    ) -> None:
+        """Register a slice for subset analysis.

+        Args:
+            name: Slice name
+            fn: Predicate function to determine slice membership
+        """
+        self._slices.append((name, fn))
+
+    def _compute_slice_aggregates(
+        self,
+        per_metric: dict[str, list[core_entities.MetricScore]],
+        slice_members: dict[str, set[str]],
+    ) -> dict[str, dict[str, MetricAggregate]]:
+        """Compute metric aggregates for each slice.
+
+        Args:
+            per_metric: Scores by metric name
+            slice_members: Sample IDs by slice name
+
+        Returns:
+            Nested dict of slice -> metric -> aggregate
+        """
+        if not slice_members:
+            return {}
+        slice_aggregates: dict[str, dict[str, MetricAggregate]] = {}
+        for name, members in slice_members.items():
+            slice_scores_by_metric: dict[str, list[core_entities.MetricScore]] = {}
+            for metric_name, scores in per_metric.items():
+                filtered = [s for s in scores if s.metadata.get("sample_id") in members]
+                slice_scores_by_metric[metric_name] = filtered
+            slice_aggregates[name] = {
+                metric_name: MetricAggregate.from_scores(metric_name, scores)
+                for metric_name, scores in slice_scores_by_metric.items()
+            }
+        return slice_aggregates
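
Not part of the diff: a minimal wiring sketch for the pipeline above. It assumes the JsonFieldExtractor and ExactMatch classes named in the class docstring live in the extractor and metric modules listed in this release (themis/evaluation/extractors/json_field_extractor.py, themis/evaluation/metrics/exact_match.py), and that records is a sequence of GenerationRecord objects produced by a prior generation run; the "hard" slice predicate and its "difficulty" metadata key are purely illustrative.

# Sketch only; import locations and constructor details are assumptions,
# not confirmed by this diff.
from themis.evaluation.extractors.json_field_extractor import JsonFieldExtractor
from themis.evaluation.metrics.exact_match import ExactMatch
from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline

pipeline = EvaluationPipeline(
    extractor=JsonFieldExtractor("answer"),
    metrics=[ExactMatch()],
)
# Registered slices produce per-subset aggregates under report.slices["hard"].
pipeline.register_slice(
    "hard", lambda record: record.task.metadata.get("difficulty") == "hard"
)

report = pipeline.evaluate(records)  # records: Sequence[core_entities.GenerationRecord]
for name, aggregate in report.metrics.items():
    print(name, aggregate.count, aggregate.mean)
print("failures:", len(report.failures))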
themis/evaluation/reports.py
@@ -0,0 +1,293 @@
+"""Evaluation report data structures."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from statistics import mean
+from typing import Dict, List, Literal, Sequence
+
+from themis.core import entities as core_entities
+from themis.evaluation.statistics import (
+    bootstrap_ci,
+    cohens_d,
+    cohens_h,
+    holm_bonferroni,
+    paired_permutation_test,
+    paired_t_test,
+    permutation_test,
+)
+from themis.evaluation.statistics.types import (
+    BootstrapResult,
+    ComparisonResult,
+    EffectSize,
+    PermutationTestResult,
+)
+
+
+@dataclass
+class EvaluationFailure:
+    sample_id: str | None
+    message: str
+
+
+@dataclass
+class MetricAggregate:
+    name: str
+    count: int
+    mean: float
+    per_sample: List[core_entities.MetricScore]
+
+    @classmethod
+    def from_scores(
+        cls, name: str, scores: List[core_entities.MetricScore]
+    ) -> "MetricAggregate":
+        if not scores:
+            return cls(name=name, count=0, mean=0.0, per_sample=[])
+        return cls(
+            name=name,
+            count=len(scores),
+            mean=mean(score.value for score in scores),
+            per_sample=scores,
+        )
+
+
+@dataclass
+class EvaluationReport:
+    metrics: dict[str, MetricAggregate]
+    failures: List[EvaluationFailure]
+    records: List[core_entities.EvaluationRecord]
+    slices: dict[str, dict[str, MetricAggregate]] = field(default_factory=dict)
+
+
+def _metric_values(report: EvaluationReport, metric_name: str) -> list[float]:
+    agg = report.metrics.get(metric_name)
+    if not agg:
+        return []
+    return [s.value for s in agg.per_sample]
+
+
+def _metric_values_by_sample(
+    report: EvaluationReport, metric_name: str
+) -> dict[str, float]:
+    values: dict[str, float] = {}
+    for record in report.records:
+        if not record.sample_id:
+            continue
+        for score in record.scores:
+            if score.metric_name == metric_name:
+                values[record.sample_id] = score.value
+                break
+    return values
+
+
+def aligned_metric_values(
+    report_a: EvaluationReport, report_b: EvaluationReport, metric_name: str
+) -> tuple[list[float], list[float]]:
+    values_a = _metric_values_by_sample(report_a, metric_name)
+    values_b = _metric_values_by_sample(report_b, metric_name)
+    common_ids = sorted(set(values_a) & set(values_b))
+    if not common_ids:
+        raise ValueError(f"No overlapping sample_ids for metric '{metric_name}'")
+    aligned_a = [values_a[sample_id] for sample_id in common_ids]
+    aligned_b = [values_b[sample_id] for sample_id in common_ids]
+    return aligned_a, aligned_b
+
+
+def ci_for_metric(
+    report: EvaluationReport,
+    metric_name: str,
+    confidence_level: float = 0.95,
+    n_bootstrap: int = 10000,
+) -> BootstrapResult:
+    values = _metric_values(report, metric_name)
+    if not values:
+        raise ValueError(f"No scores for metric '{metric_name}'")
+    return bootstrap_ci(
+        values, n_bootstrap=n_bootstrap, confidence_level=confidence_level
+    )
+
+
+def permutation_test_for_metric(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_name: str,
+    statistic: Literal["mean_diff", "median_diff"] = "mean_diff",
+    n_permutations: int = 10000,
+    seed: int | None = None,
+    align_by_sample_id: bool = True,
+) -> PermutationTestResult:
+    if align_by_sample_id:
+        values_a, values_b = aligned_metric_values(report_a, report_b, metric_name)
+    else:
+        values_a = _metric_values(report_a, metric_name)
+        values_b = _metric_values(report_b, metric_name)
+    if not values_a or not values_b:
+        raise ValueError(f"Both reports must have scores for metric '{metric_name}'")
+    return permutation_test(
+        values_a,
+        values_b,
+        statistic=statistic,
+        n_permutations=n_permutations,
+        seed=seed,
+    )
+
+
+def paired_permutation_test_for_metric(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_name: str,
+    statistic: Literal["mean_diff", "median_diff"] = "mean_diff",
+    n_permutations: int = 10000,
+    seed: int | None = None,
+) -> PermutationTestResult:
+    values_a, values_b = aligned_metric_values(report_a, report_b, metric_name)
+    return paired_permutation_test(
+        values_a,
+        values_b,
+        statistic=statistic,
+        n_permutations=n_permutations,
+        seed=seed,
+    )
+
+
+def cohens_h_for_metric(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_name: str,
+) -> EffectSize:
+    agg_a = report_a.metrics.get(metric_name)
+    agg_b = report_b.metrics.get(metric_name)
+    if not agg_a or not agg_b:
+        raise ValueError(f"Both reports must have aggregate for metric '{metric_name}'")
+    return cohens_h(agg_a.mean, agg_b.mean)
+
+
+def cohens_d_for_metric(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_name: str,
+) -> EffectSize:
+    values_a, values_b = aligned_metric_values(report_a, report_b, metric_name)
+    if len(values_a) < 2 or len(values_b) < 2:
+        raise ValueError("Each group must have at least 2 values for Cohen's d")
+    return cohens_d(values_a, values_b)
+
+
+def paired_t_test_for_metric(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_name: str,
+    significance_level: float = 0.05,
+) -> ComparisonResult:
+    values_a, values_b = aligned_metric_values(report_a, report_b, metric_name)
+    result = paired_t_test(values_a, values_b, significance_level=significance_level)
+    return ComparisonResult(
+        metric_name=metric_name,
+        baseline_mean=result.baseline_mean,
+        treatment_mean=result.treatment_mean,
+        difference=result.difference,
+        relative_change=result.relative_change,
+        t_statistic=result.t_statistic,
+        p_value=result.p_value,
+        is_significant=result.is_significant,
+        baseline_ci=result.baseline_ci,
+        treatment_ci=result.treatment_ci,
+    )
+
+
+def _slice_metric_values(
+    report: EvaluationReport, slice_name: str, metric_name: str
+) -> list[float]:
+    slice_map = report.slices.get(slice_name)
+    if not slice_map:
+        return []
+    agg = slice_map.get(metric_name)
+    if not agg:
+        return []
+    return [s.value for s in agg.per_sample]
+
+
+def ci_for_slice_metric(
+    report: EvaluationReport,
+    slice_name: str,
+    metric_name: str,
+    confidence_level: float = 0.95,
+    n_bootstrap: int = 10000,
+) -> BootstrapResult:
+    values = _slice_metric_values(report, slice_name, metric_name)
+    if not values:
+        raise ValueError(
+            f"No scores for metric '{metric_name}' in slice '{slice_name}'"
+        )
+    return bootstrap_ci(
+        values, n_bootstrap=n_bootstrap, confidence_level=confidence_level
+    )
+
+
+def compare_reports_with_holm(
+    report_a: EvaluationReport,
+    report_b: EvaluationReport,
+    metric_names: Sequence[str],
+    statistic: Literal["mean_diff", "median_diff"] = "mean_diff",
+    n_permutations: int = 10000,
+    seed: int | None = None,
+    paired: bool = True,
+) -> Dict[str, object]:
+    p_values: list[float] = []
+    pt_results: Dict[str, PermutationTestResult] = {}
+    for name in metric_names:
+        if paired:
+            pt = paired_permutation_test_for_metric(
+                report_a,
+                report_b,
+                name,
+                statistic=statistic,
+                n_permutations=n_permutations,
+                seed=seed,
+            )
+        else:
+            pt = permutation_test_for_metric(
+                report_a,
+                report_b,
+                name,
+                statistic=statistic,
+                n_permutations=n_permutations,
+                seed=seed,
+                align_by_sample_id=True,
+            )
+        pt_results[name] = pt
+        p_values.append(pt.p_value)
+    corrected = holm_bonferroni(p_values)
+    return {
+        "per_metric": pt_results,
+        "holm_significant": corrected,
+    }
+
+
+def confusion_matrix(
+    labels_true: Sequence[str], labels_pred: Sequence[str]
+) -> Dict[str, Dict[str, int]]:
+    if len(labels_true) != len(labels_pred):
+        raise ValueError("labels_true and labels_pred must have same length")
+    cm: Dict[str, Dict[str, int]] = {}
+    for t, p in zip(labels_true, labels_pred):
+        cm.setdefault(t, {})
+        cm[t][p] = cm[t].get(p, 0) + 1
+    return cm
+
+
+__all__ = [
+    "EvaluationFailure",
+    "MetricAggregate",
+    "EvaluationReport",
+    "aligned_metric_values",
+    "ci_for_metric",
+    "ci_for_slice_metric",
+    "permutation_test_for_metric",
+    "paired_permutation_test_for_metric",
+    "cohens_h_for_metric",
+    "cohens_d_for_metric",
+    "paired_t_test_for_metric",
+    "confusion_matrix",
+    "compare_reports_with_holm",
+]
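
Not part of the diff: a short comparison sketch using the helpers defined above. report_a and report_b stand for EvaluationReport instances from two runs over the same samples, the metric names passed to compare_reports_with_holm are placeholders, and the shape of the "holm_significant" value depends on holm_bonferroni in hypothesis_tests.py, which this excerpt does not show.

# Sketch only; report_a / report_b and the metric names are placeholders.
from themis.evaluation.reports import compare_reports_with_holm, confusion_matrix

comparison = compare_reports_with_holm(
    report_a,
    report_b,
    metric_names=["exact_match", "response_length"],
    n_permutations=10000,
    seed=0,
    paired=True,  # align by sample_id and use the paired permutation test
)
for name, pt in comparison["per_metric"].items():
    print(name, pt.p_value)

# confusion_matrix is a standalone nested-dict tally:
confusion_matrix(["A", "B", "A"], ["A", "A", "A"])
# -> {"A": {"A": 2}, "B": {"A": 1}}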
themis/evaluation/statistics/__init__.py
@@ -0,0 +1,53 @@
+"""Statistical analysis utilities for experiment evaluation results.
+
+This module provides statistical analysis tools for computing confidence intervals,
+significance tests, and statistical comparisons across experiment runs.
+"""
+
+from __future__ import annotations
+
+from .bootstrap import bootstrap_ci
+from .confidence_intervals import (
+    compute_confidence_interval,
+    compute_statistical_summary,
+)
+from .effect_sizes import cohens_d, cohens_h
+from .hypothesis_tests import (
+    compare_metrics,
+    holm_bonferroni,
+    paired_permutation_test,
+    paired_t_test,
+    permutation_test,
+)
+from .types import (
+    BootstrapResult,
+    ComparisonResult,
+    ConfidenceInterval,
+    EffectSize,
+    PermutationTestResult,
+    StatisticalSummary,
+)
+
+__all__ = [
+    # Types
+    "ConfidenceInterval",
+    "StatisticalSummary",
+    "ComparisonResult",
+    "PermutationTestResult",
+    "BootstrapResult",
+    "EffectSize",
+    # Confidence intervals
+    "compute_confidence_interval",
+    "compute_statistical_summary",
+    # Hypothesis tests
+    "compare_metrics",
+    "permutation_test",
+    "paired_permutation_test",
+    "paired_t_test",
+    "holm_bonferroni",
+    # Bootstrap
+    "bootstrap_ci",
+    # Effect sizes
+    "cohens_h",
+    "cohens_d",
+]
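
Not part of the diff: the package __init__ above re-exports the statistics helpers, so downstream code can import them from themis.evaluation.statistics directly. A brief sketch follows; the keyword arguments mirror the calls made in reports.py, and the fields of the returned BootstrapResult are defined in types.py, which this excerpt does not show.

# Sketch only; return-value fields are not shown in this diff.
from themis.evaluation.statistics import bootstrap_ci, holm_bonferroni

per_sample_accuracy = [1.0, 0.0, 1.0, 1.0, 0.0, 1.0]  # toy per-sample scores
ci = bootstrap_ci(per_sample_accuracy, n_bootstrap=10000, confidence_level=0.95)

# Holm-Bonferroni correction over per-metric p-values, as used by
# compare_reports_with_holm in reports.py.
corrected = holm_bonferroni([0.01, 0.04, 0.20])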