themis-eval 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/datasets/commonsense_qa.py
ADDED

@@ -0,0 +1,174 @@
+"""Helpers for working with the tau/commonsense_qa dataset."""
+
+from __future__ import annotations
+
+import json
+import string
+from pathlib import Path
+from typing import Any, Iterable, Iterator, List, Sequence
+
+from pydantic import BaseModel, Field, ValidationInfo, field_validator
+
+_DATASET_NAME = "tau/commonsense_qa"
+_CHOICE_LABELS = tuple(string.ascii_uppercase)
+
+
+class CommonsenseQaSample(BaseModel):
+    unique_id: str
+    question: str
+    choices: list[str]
+    answer: str
+    concept: str = Field(default="")
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    choice_labels: list[str] = Field(default_factory=list)
+
+    @field_validator("choices", mode="before")
+    @classmethod
+    def _ensure_choices(cls, value: Any) -> list[str]:
+        if value is None:
+            return []
+        if isinstance(value, dict):
+            return [str(v) for _, v in sorted(value.items())]
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value]
+        raise TypeError("choices must be a sequence or mapping")
+
+    @field_validator("choice_labels", mode="before")
+    @classmethod
+    def _build_choice_labels(cls, value: Any, info: ValidationInfo) -> list[str]:
+        if value:
+            return [str(item) for item in value]
+        choices = info.data.get("choices") if hasattr(info, "data") else None
+        total = len(choices) if isinstance(choices, list) else 0
+        return [*_CHOICE_LABELS[:total]]
+
+    def to_generation_example(self) -> dict[str, Any]:
+        effective_labels = (
+            list(self.choice_labels)
+            if self.choice_labels
+            else list(_CHOICE_LABELS[: len(self.choices)])
+        )
+        return {
+            "unique_id": self.unique_id,
+            "question": self.question,
+            "choices": list(self.choices),
+            "choice_labels": effective_labels,
+            "answer": self.answer,
+            "concept": self.concept,
+            "metadata": dict(self.metadata),
+        }
+
+
+def load_commonsense_qa(
+    *,
+    split: str = "validation",  # Test set usually has no labels
+    limit: int | None = None,
+    source: str = "huggingface",
+    data_dir: str | Path | None = None,
+) -> List[CommonsenseQaSample]:
+    """Load CommonsenseQA samples from Hugging Face or a local directory."""
+
+    if source not in {"huggingface", "local"}:
+        raise ValueError(
+            f"Unsupported source '{source}'. Expected one of: 'huggingface', 'local'."
+        )
+
+    if source == "huggingface":
+        rows = _load_from_huggingface(split=split)
+    else:
+        if data_dir is None:
+            raise ValueError(
+                "data_dir must be provided when source='local'. "
+                "Pass dataset.data_dir in configs or --data-dir on the CLI."
+            )
+        rows = _load_from_local(Path(data_dir))
+
+    samples: list[CommonsenseQaSample] = []
+    for index, row in enumerate(rows, start=1):
+        sample = _row_to_sample(row, index=index)
+        samples.append(sample)
+        if limit is not None and len(samples) >= limit:
+            break
+    return samples
+
+
+def _row_to_sample(row: dict[str, Any], *, index: int) -> CommonsenseQaSample:
+    unique_id = (
+        row.get("id")
+        or row.get("unique_id")
+        or f"csqa-{index:05d}"
+    )
+    question = row.get("question") or ""
+
+    # CommonsenseQA format:
+    # choices: {'label': ['A', 'B', ...], 'text': ['text1', 'text2', ...]}
+    # answerKey: 'A'
+
+    choices_data = row.get("choices") or {}
+    choices = []
+    choice_labels = []
+
+    if isinstance(choices_data, dict):
+        labels = choices_data.get("label") or []
+        texts = choices_data.get("text") or []
+
+        # Zip and sort by label
+        zipped = sorted(zip(labels, texts), key=lambda x: x[0])
+        for label, text in zipped:
+            choices.append(str(text))
+            choice_labels.append(str(label))
+
+    answer = str(row.get("answerKey") or "")
+    concept = str(row.get("question_concept") or "")
+
+    metadata_keys = {
+        "question", "choices", "answerKey", "question_concept", "id"
+    }
+    metadata = {key: value for key, value in row.items() if key not in metadata_keys}
+
+    return CommonsenseQaSample(
+        unique_id=str(unique_id),
+        question=str(question),
+        choices=choices,
+        choice_labels=choice_labels,
+        answer=answer,
+        concept=concept,
+        metadata=metadata,
+    )
+
+
+def _load_from_huggingface(*, split: str) -> Iterable[dict[str, Any]]:
+    try:
+        from datasets import load_dataset
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError(
+            "datasets is required to load CommonsenseQA from Hugging Face. Install it via `uv pip install '.[hf]'`."
+        ) from exc
+
+    dataset = load_dataset(_DATASET_NAME, split=split)
+    for row in dataset:
+        yield dict(row)
+
+
+def _load_from_local(root: Path) -> Iterator[dict[str, Any]]:
+    if not root.exists():
+        raise FileNotFoundError(f"Local dataset directory not found: {root}")
+
+    for path in root.rglob("*"):
+        if path.suffix.lower() == ".json":
+            with path.open("r", encoding="utf-8") as handle:
+                row = json.load(handle)
+                row.setdefault("id", path.stem)
+                yield row
+        elif path.suffix.lower() in {".jsonl", ".ndjson"}:
+            with path.open("r", encoding="utf-8") as handle:
+                for line_num, line in enumerate(handle, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    row = json.loads(line)
+                    row.setdefault("id", f"{path.stem}-{line_num}")
+                    yield row
+
+
+__all__ = ["CommonsenseQaSample", "load_commonsense_qa"]
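For orientation, here is a minimal usage sketch for the loader above. It is not part of the diff: it assumes the module is importable as `themis.datasets.commonsense_qa` (the path in the file list) and that the optional `datasets` extra is installed.

```python
# Hypothetical usage sketch for load_commonsense_qa; not part of the release.
from themis.datasets.commonsense_qa import load_commonsense_qa

# Pull a handful of validation samples (the test split usually has no labels).
samples = load_commonsense_qa(split="validation", limit=3)

for sample in samples:
    example = sample.to_generation_example()
    # choice_labels falls back to "A", "B", ... when the row carried none.
    for label, choice in zip(example["choice_labels"], example["choices"]):
        print(f"{label}. {choice}")
    print("gold:", example["answer"])
```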
themis/datasets/competition_math.py
ADDED

@@ -0,0 +1,265 @@
+"""Helpers for competition-style math benchmarks from Hugging Face."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Iterable, Iterator, List, Sequence
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class CompetitionMathSample(BaseModel):
+    unique_id: str
+    problem: str
+    solution: str
+    answer: str
+    subject: str = Field(default="unknown")
+    level: str | int = Field(default="unknown")
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("metadata", mode="before")
+    @classmethod
+    def _ensure_metadata(cls, value: Any) -> dict[str, Any]:
+        return dict(value or {})
+
+    @field_validator("level", mode="before")
+    @classmethod
+    def _normalize_level(cls, value: Any) -> str | int:
+        if value is None or value == "":
+            return "unknown"
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return str(value)
+
+    def to_generation_example(self) -> dict[str, Any]:
+        payload = {
+            "unique_id": self.unique_id,
+            "problem": self.problem,
+            "solution": self.solution,
+            "answer": self.answer,
+            "subject": self.subject,
+            "level": self.level,
+        }
+        payload.update(self.metadata)
+        return payload
+
+
+def load_competition_math(
+    *,
+    dataset: str,
+    split: str = "test",
+    limit: int | None = None,
+    source: str = "huggingface",
+    data_dir: str | Path | None = None,
+    subjects: Sequence[str] | None = None,
+    subset: str | None = None,
+) -> List[CompetitionMathSample]:
+    """Load math competition samples from Hugging Face or a local directory."""
+
+    if source not in {"huggingface", "local"}:
+        raise ValueError(
+            f"Unsupported source '{source}'. Expected one of: 'huggingface', 'local'."
+        )
+
+    if source == "huggingface":
+        rows = _load_from_huggingface(dataset=dataset, split=split, subset=subset)
+    else:
+        if data_dir is None:
+            raise ValueError(
+                "data_dir must be provided when source='local'. "
+                "Pass dataset.data_dir in configs or --data-dir on the CLI."
+            )
+        rows = _load_from_local(Path(data_dir))
+
+    samples: list[CompetitionMathSample] = []
+    selected_subjects = {s.lower() for s in subjects} if subjects else None
+    for index, row in enumerate(rows, start=1):
+        subject = _extract_subject(row) or "unknown"
+        if selected_subjects and subject.lower() not in selected_subjects:
+            continue
+        sample = _row_to_sample(
+            row=row,
+            index=index,
+            dataset=dataset,
+            subject=subject,
+        )
+        samples.append(sample)
+        if limit is not None and len(samples) >= limit:
+            break
+    return samples
+
+
+def _load_from_huggingface(
+    *, dataset: str, split: str, subset: str | None
+) -> Iterable[dict[str, Any]]:
+    try:
+        from datasets import load_dataset
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError(
+            "datasets is required to load competition math benchmarks from Hugging Face. "
+            "Install it via `uv pip install '.[hf]'`."
+        ) from exc
+
+    if subset:
+        hf_dataset = load_dataset(dataset, subset, split=split)
+    else:
+        hf_dataset = load_dataset(dataset, split=split)
+    for row in hf_dataset:
+        yield dict(row)
+
+
+def _load_from_local(root: Path) -> Iterator[dict[str, Any]]:
+    if not root.exists():
+        raise FileNotFoundError(f"Local dataset directory not found: {root}")
+
+    for path in root.rglob("*"):
+        if path.suffix.lower() == ".json":
+            with path.open("r", encoding="utf-8") as handle:
+                row = json.load(handle)
+                row.setdefault("id", path.stem)
+                yield row
+        elif path.suffix.lower() in {".jsonl", ".ndjson"}:
+            with path.open("r", encoding="utf-8") as handle:
+                for line_num, line in enumerate(handle, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    row = json.loads(line)
+                    row.setdefault("id", f"{path.stem}-{line_num}")
+                    yield row
+
+
+def _extract_subject(row: dict[str, Any]) -> str | None:
+    for key in (
+        "subject",
+        "category",
+        "topic",
+        "domain",
+        "contest",
+        "source",
+        "level",
+    ):
+        value = row.get(key)
+        if value:
+            return str(value)
+    return None
+
+
+def _extract_problem(row: dict[str, Any]) -> str:
+    for key in (
+        "problem",
+        "problem_text",
+        "problem_statement",
+        "question",
+        "prompt",
+        "problem_markdown",
+    ):
+        value = row.get(key)
+        if value:
+            return str(value)
+    return ""
+
+
+def _extract_solution(row: dict[str, Any]) -> str:
+    for key in (
+        "solution",
+        "solution_text",
+        "solution_markdown",
+        "answer_explanation",
+        "worked_solution",
+        "reasoning",
+    ):
+        value = row.get(key)
+        if value:
+            return str(value)
+    return ""
+
+
+def _extract_answer(row: dict[str, Any]) -> str:
+    for key in (
+        "answer",
+        "final_answer",
+        "ground_truth",
+        "answer_text",
+        "answer_value",
+    ):
+        value = row.get(key)
+        if value is not None:
+            return str(value).strip()
+    return ""
+
+
+def _extract_level(row: dict[str, Any]) -> str | int:
+    for key in ("difficulty", "level", "year"):
+        value = row.get(key)
+        if value:
+            return value
+    return "unknown"
+
+
+def _row_to_sample(
+    *,
+    row: dict[str, Any],
+    index: int,
+    dataset: str,
+    subject: str,
+) -> CompetitionMathSample:
+    unique_id = (
+        row.get("id")
+        or row.get("problem_id")
+        or row.get("unique_id")
+        or f"{dataset.replace('/', '-')}-{index:05d}"
+    )
+    problem = _extract_problem(row)
+    solution = _extract_solution(row)
+    answer = _extract_answer(row)
+    level = _extract_level(row)
+    core_keys = {
+        "id",
+        "problem_id",
+        "unique_id",
+        "problem",
+        "problem_text",
+        "problem_statement",
+        "question",
+        "prompt",
+        "problem_markdown",
+        "solution",
+        "solution_text",
+        "solution_markdown",
+        "answer_explanation",
+        "worked_solution",
+        "reasoning",
+        "answer",
+        "final_answer",
+        "ground_truth",
+        "answer_text",
+        "answer_value",
+        "difficulty",
+        "level",
+        "year",
+        "subject",
+        "category",
+        "topic",
+        "domain",
+        "contest",
+        "source",
+    }
+    metadata = {key: value for key, value in row.items() if key not in core_keys}
+    sample = CompetitionMathSample.model_validate(
+        {
+            "unique_id": str(unique_id),
+            "problem": problem,
+            "solution": solution,
+            "answer": answer,
+            "subject": str(subject),
+            "level": level,
+            "metadata": metadata,
+        }
+    )
+    return sample
+
+
+__all__ = ["CompetitionMathSample", "load_competition_math"]
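Because the `_extract_*` helpers tolerate many column names, this loader is meant to work across heterogeneous benchmarks. A minimal sketch of a call is below; the dataset id is a placeholder, not a dataset the package ships or endorses.

```python
# Hypothetical usage sketch; "your-org/amc-problems" is a placeholder id.
from themis.datasets.competition_math import load_competition_math

samples = load_competition_math(
    dataset="your-org/amc-problems",  # any HF dataset with problem/solution/answer-like columns
    split="test",
    subjects=["algebra", "geometry"],  # case-insensitive filter on the extracted subject
    limit=10,
)

for sample in samples:
    payload = sample.to_generation_example()
    print(payload["unique_id"], payload["subject"], payload["level"])
```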
themis/datasets/coqa.py
ADDED
@@ -0,0 +1,133 @@
+"""Helpers for working with the stanfordnlp/coqa dataset."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Iterable, Iterator, List, Sequence
+
+from pydantic import BaseModel, Field, field_validator
+
+_DATASET_NAME = "stanfordnlp/coqa"
+
+
+class CoQaSample(BaseModel):
+    unique_id: str
+    story: str
+    question: str
+    answer: str
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("metadata", mode="before")
+    @classmethod
+    def _ensure_metadata(cls, value: Any) -> dict[str, Any]:
+        return dict(value or {})
+
+    def to_generation_example(self) -> dict[str, Any]:
+        return {
+            "unique_id": self.unique_id,
+            "story": self.story,
+            "question": self.question,
+            "answer": self.answer,
+            "metadata": dict(self.metadata),
+        }
+
+
+def load_coqa(
+    *,
+    split: str = "validation",  # Test set usually has no labels
+    limit: int | None = None,
+    source: str = "huggingface",
+    data_dir: str | Path | None = None,
+) -> List[CoQaSample]:
+    """Load CoQA samples from Hugging Face or a local directory."""
+
+    if source not in {"huggingface", "local"}:
+        raise ValueError(
+            f"Unsupported source '{source}'. Expected one of: 'huggingface', 'local'."
+        )
+
+    if source == "huggingface":
+        rows = _load_from_huggingface(split=split)
+    else:
+        if data_dir is None:
+            raise ValueError(
+                "data_dir must be provided when source='local'. "
+                "Pass dataset.data_dir in configs or --data-dir on the CLI."
+            )
+        rows = _load_from_local(Path(data_dir))
+
+    samples: list[CoQaSample] = []
+    for index, row in enumerate(rows, start=1):
+        # CoQA stores several question/answer turns per story, e.g.
+        # 'questions': ['q1', 'q2'], 'answers': {'input_text': ['a1', 'a2'], ...}.
+        # Flatten them so that each turn becomes its own sample.
+
+        story = row.get("story") or ""
+        questions = row.get("questions") or []
+        answers_data = row.get("answers") or {}
+        answers = answers_data.get("input_text") or []
+
+        if len(questions) != len(answers):
+            # On a length mismatch, truncate both lists to the shorter one.
+            min_len = min(len(questions), len(answers))
+            questions = questions[:min_len]
+            answers = answers[:min_len]
+
+        for i, (q, a) in enumerate(zip(questions, answers)):
+            sample = CoQaSample(
+                unique_id=f"coqa-{index:05d}-{i:02d}",
+                story=story,
+                question=str(q),
+                answer=str(a),
+                metadata={"turn": i, "source": row.get("source")},
+            )
+            samples.append(sample)
+            if limit is not None and len(samples) >= limit:
+                break
+
+        if limit is not None and len(samples) >= limit:
+            break
+
+    return samples
+
+
+def _load_from_huggingface(*, split: str) -> Iterable[dict[str, Any]]:
+    try:
+        from datasets import load_dataset
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError(
+            "datasets is required to load CoQA from Hugging Face. Install it via `uv pip install '.[hf]'`."
+        ) from exc
+
+    dataset = load_dataset(_DATASET_NAME, split=split)
+    for row in dataset:
+        yield dict(row)
+
+
+def _load_from_local(root: Path) -> Iterator[dict[str, Any]]:
+    if not root.exists():
+        raise FileNotFoundError(f"Local dataset directory not found: {root}")
+
+    for path in root.rglob("*"):
+        if path.suffix.lower() == ".json":
+            with path.open("r", encoding="utf-8") as handle:
+                row = json.load(handle)
+                row.setdefault("id", path.stem)
+                yield row
+        elif path.suffix.lower() in {".jsonl", ".ndjson"}:
+            with path.open("r", encoding="utf-8") as handle:
+                for line_num, line in enumerate(handle, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    row = json.loads(line)
+                    row.setdefault("id", f"{path.stem}-{line_num}")
+                    yield row
+
+
+__all__ = ["CoQaSample", "load_coqa"]
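The `source="local"` path is easiest to see with a tiny fixture. The sketch below is hypothetical and assumes write access to a `data/coqa` directory; the JSONL row mirrors the Hugging Face CoQA schema that `load_coqa` flattens turn by turn.

```python
# Hypothetical sketch of the source="local" path; not part of the release.
import json
from pathlib import Path

from themis.datasets.coqa import load_coqa

data_dir = Path("data/coqa")
data_dir.mkdir(parents=True, exist_ok=True)
row = {
    "story": "Ana adopted a cat. She named it Miso.",
    "questions": ["Who adopted a cat?", "What was its name?"],
    "answers": {"input_text": ["Ana", "Miso"]},
    "source": "toy",
}
(data_dir / "toy.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8")

samples = load_coqa(source="local", data_dir=data_dir)
# One story with two turns yields two samples: coqa-00001-00 and coqa-00001-01.
for s in samples:
    print(s.unique_id, s.question, "->", s.answer)
```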